// Commit fbd3199c authored by fengzch-das's avatar fengzch-das
// Browse files
//
// Initial commit
//
// parents
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
/*
 * CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)
 *
 * Expands to convertScale(): dst[i] = saturate_cast<T2>(src[i] * alpha + beta)
 * for distinct source/destination element types T1 -> T2.
 *  - When both strides equal the row width the image is collapsed into a
 *    single contiguous row (size.width *= size.height) so one long vector
 *    loop replaces many short ones.
 *  - CVTINIT runs once before the row loop (it typically materializes the
 *    vscale/vshift vectors); CVTROW is the vectorized per-row body covering
 *    the first w elements, where w = size.width rounded down to a multiple
 *    of SIMD_SIZE.
 *  - The trailing scalar loop finishes the remaining (size.width - w)
 *    elements of each row via internal::saturate_cast.
 * (No comments may appear inside the macro body: a // comment before a
 * trailing backslash would absorb the line continuation.)
 */
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
void convertScale(const Size2D &_size, \
const T1 * srcBase, ptrdiff_t srcStride, \
T2 * dstBase, ptrdiff_t dstStride, \
f64 alpha, f64 beta) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (srcStride == dstStride && \
srcStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
const ptrdiff_t sstep = srcStride / sizeof(T1); \
const ptrdiff_t dstep = dstStride / sizeof(T2); \
const size_t w = size.width & ~(SIMD_SIZE-1); \
if (size.width >= SIMD_SIZE) \
{ \
const T1* _src = srcBase; \
T2* _dst = dstBase; \
CVTINIT \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
CVTROW \
} \
if(w < size.width) \
{ \
const T1* _src = srcBase; \
T2* _dst = dstBase; \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
for(size_t i = w; i < size.width; i++ ) \
_dst[i] = internal::saturate_cast<T2>(_src[i]*alpha + beta); \
} \
}
/*
 * CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW)
 * Same expansion as CVTS_FUNC but for the same-type overload (T1 -> T1);
 * kept separate so convertScale(T1*, T1*) does not collide with the
 * two-type template above when T1 == T2.
 */
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \
void convertScale(const Size2D &_size, \
const T1 * srcBase, ptrdiff_t srcStride, \
T1 * dstBase, ptrdiff_t dstStride, \
f64 alpha, f64 beta) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (srcStride == dstStride && \
srcStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
const ptrdiff_t sstep = srcStride / sizeof(T1); \
const ptrdiff_t dstep = dstStride / sizeof(T1); \
const size_t w = size.width & ~(SIMD_SIZE-1); \
if (size.width >= SIMD_SIZE) \
{ \
const T1* _src = srcBase; \
T1* _dst = dstBase; \
CVTSINIT \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
CVTSROW \
} \
if(w < size.width) \
{ \
const T1* _src = srcBase; \
T1* _dst = dstBase; \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
for(size_t i = w; i < size.width; i++ ) \
_dst[i] = internal::saturate_cast<T1>(_src[i]*alpha + beta); \
} \
}
#else
/*
 * NEON unavailable: the macros expand to stubs whose only effect is
 * internal::assertSupportedConfiguration() (which rejects the call);
 * all parameters are intentionally unnamed to avoid unused warnings.
 */
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
void convertScale(const Size2D &, \
const T1 *, ptrdiff_t, \
T2 *, ptrdiff_t, \
f64, f64) \
{ \
internal::assertSupportedConfiguration(); \
}
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \
void convertScale(const Size2D &, \
const T1 *, ptrdiff_t, \
T1 *, ptrdiff_t, \
f64, f64) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> u8, 32-bit ARM GCC path: hand-scheduled inline asm. vscale/vshift are
// pinned to q0/q1 so the asm template can use them by name; per iteration the
// asm widens u8->u32, converts to f32, computes x*alpha + (beta+0.5), truncates
// back to s32, then saturating-narrows s32->u16->u8.
// NOTE(review): the +0.5f bias plus truncating vcvt gives round-half-up only
// for non-negative results; negative intermediates truncate toward zero —
// presumably matches the scalar tail's saturate_cast, TODO confirm.
CVTS_FUNC1(u8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovun.s32 d22, q7 \n\t"
"vqmovun.s32 d23, q8 \n\t"
"vqmovun.s32 d24, q9 \n\t"
"vqmovun.s32 d25, q10 \n\t"
"vqmovn.u16 d26, q11 \n\t"
"vqmovn.u16 d27, q12 \n\t"
"vst1.8 {d26-d27}, [%[dst1]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> u8, generic path (AArch64 / clang / non-GCC): NEON intrinsics
// implementing the same widen -> f32 -> scale+shift -> s32 -> saturating
// narrow pipeline as the asm above.
CVTS_FUNC1(u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> s8, 32-bit ARM GCC path: same pipeline as the u8->u8 kernel but the
// final narrowing is signed-saturating (vqmovn.s32 / vqmovn.s16) so results
// clamp to [-128, 127].
CVTS_FUNC(u8, s8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovn.s32 d22, q7 \n\t"
"vqmovn.s32 d23, q8 \n\t"
"vqmovn.s32 d24, q9 \n\t"
"vqmovn.s32 d25, q10 \n\t"
"vqmovn.s16 d26, q11 \n\t"
"vqmovn.s16 d27, q12 \n\t"
"vst1.8 {d26-d27}, [%[dst1]] \n\t"
: //no output
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> s8, generic path: NEON intrinsics, signed saturating narrow
// (vqmovn_s32 / vqmovn_s16) mirroring the asm variant above.
CVTS_FUNC(u8, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> u16, 32-bit ARM GCC path: narrows only once (s32 -> u16 via
// vqmovun.s32) and stores two 8-element u16 vectors per 16-pixel iteration.
CVTS_FUNC(u8, u16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovun.s32 d22, q7 \n\t"
"vqmovun.s32 d23, q8 \n\t"
"vqmovun.s32 d24, q9 \n\t"
"vqmovun.s32 d25, q10 \n\t"
"vst1.16 {d22-d23}, [%[dst1]] \n\t"
"vst1.16 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 8),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> u16, generic path: NEON intrinsics; unsigned saturating narrow
// (vqmovun_s32) and two 8-lane stores per 16-pixel iteration.
CVTS_FUNC(u8, u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> s16, 32-bit ARM GCC path: like u8->u16 but with signed saturating
// narrow (vqmovn.s32), clamping to [-32768, 32767].
CVTS_FUNC(u8, s16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovn.s32 d22, q7 \n\t"
"vqmovn.s32 d23, q8 \n\t"
"vqmovn.s32 d24, q9 \n\t"
"vqmovn.s32 d25, q10 \n\t"
"vst1.16 {d22-d23}, [%[dst1]] \n\t"
"vst1.16 {d24-d25}, [%[dst2]] \n\t"
: //no output
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 8),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> s16, generic path: NEON intrinsics; signed saturating narrow
// (vqmovn_s32), two 8-lane stores per 16-pixel iteration.
CVTS_FUNC(u8, s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u8 -> s32: no narrowing — the s32 results of vcvt are stored directly
// (four 4-lane stores per 16-pixel iteration). The asm variant is limited to
// GCC 4.x (< 4.7, non-clang); newer compilers take the intrinsics path below.
// NOTE(review): presumably the tighter guard works around a register
// allocation issue in later compilers — rationale not visible here.
CVTS_FUNC(u8, s32, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vst1.32 {d14-d15}, [%[dst1]] \n\t"
"vst1.32 {d16-d17}, [%[dst2]] \n\t"
"vst1.32 {d18-d19}, [%[dst3]] \n\t"
"vst1.32 {d20-d21}, [%[dst4]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
[dst3] "r" (_dst + i + 8),
[dst4] "r" (_dst + i + 12),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10",
"d11","d12","d13","d14","d15","d16","d17",
"d18","d19","d20","d21","d22","d23","d24",
"d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> s32, generic path: NEON intrinsics; converted s32 vectors are stored
// directly, no narrowing step.
CVTS_FUNC(u8, s32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
vst1q_s32(_dst + i + 8, vline3_s32);
vst1q_s32(_dst + i + 12, vline4_s32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u8 -> f32: the result stays in floating point, so vshift uses beta without
// the +0.5f rounding bias used by the integer-destination kernels, and there
// is no final vcvt/narrow — the f32 sums are stored directly.
CVTS_FUNC(u8, f32, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
"vst1.32 {d10-d11}, [%[dst3]] \n\t"
"vst1.32 {d12-d13}, [%[dst4]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
[dst3] "r" (_dst + i + 8),
[dst4] "r" (_dst + i + 12),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10",
"d11","d12","d13","d14","d15","d16","d17",
"d18","d19","d20","d21","d22","d23","d24",
"d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> f32, generic path: NEON intrinsics; stores the scaled f32 vectors
// directly (vshift = beta, no rounding bias needed for a float destination).
CVTS_FUNC(u8, f32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vst1q_f32(_dst + i + 0, vline1_f32);
vst1q_f32(_dst + i + 4, vline2_f32);
vst1q_f32(_dst + i + 8, vline3_f32);
vst1q_f32(_dst + i + 12, vline4_f32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// s8 -> u8, 32-bit ARM GCC path: signed widening (vmovl.s8/.s16,
// vcvt.f32.s32) then unsigned saturating narrow (vqmovun.s32 / vqmovn.u16),
// clamping results to [0, 255].
CVTS_FUNC(s8, u8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovun.s32 d22, q7 \n\t"
"vqmovun.s32 d23, q8 \n\t"
"vqmovun.s32 d24, q9 \n\t"
"vqmovun.s32 d25, q10 \n\t"
"vqmovn.u16 d26, q11 \n\t"
"vqmovn.u16 d27, q12 \n\t"
"vst1.8 {d26-d27}, [%[dst1]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// s8 -> u8, generic path: NEON intrinsics mirroring the asm variant; the
// int32 results are reused in place (vlineN_s32 reassigned after the f32
// math) before the unsigned saturating narrow.
CVTS_FUNC(s8, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// s8 -> s8 (same-type overload via CVTS_FUNC1), 32-bit ARM GCC path: signed
// widen, f32 scale+shift, then signed saturating narrow s32 -> s16 -> s8.
CVTS_FUNC1(s8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovn.s32 d22, q7 \n\t"
"vqmovn.s32 d23, q8 \n\t"
"vqmovn.s32 d24, q9 \n\t"
"vqmovn.s32 d25, q10 \n\t"
"vqmovn.s16 d26, q11 \n\t"
"vqmovn.s16 d27, q12 \n\t"
"vst1.8 {d26-d27}, [%[dst1]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// s8 -> s8, generic path: NEON intrinsics; signed saturating narrow
// (vqmovn_s32 / vqmovn_s16) back to s8.
CVTS_FUNC1(s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// s8 -> u16, 32-bit ARM GCC path: signed widen, f32 scale+shift, then a
// single unsigned saturating narrow (vqmovun.s32) to u16; negative results
// clamp to 0.
CVTS_FUNC(s8, u16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovun.s32 d22, q7 \n\t"
"vqmovun.s32 d23, q8 \n\t"
"vqmovun.s32 d24, q9 \n\t"
"vqmovun.s32 d25, q10 \n\t"
"vst1.16 {d22-d23}, [%[dst1]] \n\t"
"vst1.16 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 8),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// s8 -> u16, generic path: NEON intrinsics; vqmovun_s32 clamps negatives to
// zero, two 8-lane stores per 16-pixel iteration.
CVTS_FUNC(s8, u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u16(_dst + i + 0, vRes1_u16);
vst1q_u16(_dst + i + 8, vRes2_u16);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// s8 -> s16, 32-bit ARM GCC path: signed widen, f32 scale+shift, then signed
// saturating narrow (vqmovn.s32) to s16.
CVTS_FUNC(s8, s16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovn.s32 d22, q7 \n\t"
"vqmovn.s32 d23, q8 \n\t"
"vqmovn.s32 d24, q9 \n\t"
"vqmovn.s32 d25, q10 \n\t"
"vst1.16 {d22-d23}, [%[dst1]] \n\t"
"vst1.16 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 8),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// s8 -> s16, generic path: NEON intrinsics; signed saturating narrow
// (vqmovn_s32), two 8-lane stores per 16-pixel iteration.
CVTS_FUNC(s8, s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s16(_dst + i + 0, vRes1_s16);
vst1q_s16(_dst + i + 8, vRes2_s16);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s8 -> s32 scaled conversion: dst = (s32)(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s8, s32, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
// Widen 16 s8 lanes to four s32 vectors, apply x*alpha+beta in f32,
// convert back to s32 and store four 4-lane results.
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vst1.32 {d14-d15}, [%[dst1]] \n\t"
"vst1.32 {d16-d17}, [%[dst2]] \n\t"
"vst1.32 {d18-d19}, [%[dst3]] \n\t"
"vst1.32 {d20-d21}, [%[dst4]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
[dst3] "r" (_dst + i + 8),
[dst4] "r" (_dst + i + 12),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10",
"d11","d12","d13","d14","d15","d16","d17",
"d18","d19","d20","d21","d22","d23","d24",
"d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s8, s32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
// Widen s8 -> s16 -> s32, transform in f32, convert back, store raw s32.
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
vst1q_s32(_dst + i + 8, vline3_s32);
vst1q_s32(_dst + i + 12, vline4_s32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s8 -> f32 scaled conversion: dst = src * alpha + beta.
// No 0.5 bias here — the destination stays in floating point.
CVTS_FUNC(s8, f32, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
// Widen 16 s8 lanes to four f32 vectors and store the affine result.
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
"vst1.32 {d10-d11}, [%[dst3]] \n\t"
"vst1.32 {d12-d13}, [%[dst4]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
[dst3] "r" (_dst + i + 8),
[dst4] "r" (_dst + i + 12),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10",
"d11","d12","d13","d14","d15","d16","d17",
"d18","d19","d20","d21","d22","d23","d24",
"d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s8, f32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
// Widen s8 -> s16 -> s32 -> f32, then apply x*alpha + beta and store.
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vst1q_f32(_dst + i + 0, vline1_f32);
vst1q_f32(_dst + i + 4, vline2_f32);
vst1q_f32(_dst + i + 8, vline3_f32);
vst1q_f32(_dst + i + 12, vline4_f32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> u8 scaled conversion: dst = saturate_u8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(u16, u8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Load 8 u16 lanes (vld1.8 moves the same 16 bytes), widen to u32,
// transform in f32, then saturate-narrow s32 -> s16 -> u8.
__asm__ (
"vld1.8 {d4-d5}, [%[src1]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vqmovun.s16 d28, q13 \n\t"
"vst1.8 {d28}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Narrow with signed saturation to s16, then unsigned saturation to u8.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
vst1_u8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s8 scaled conversion: dst = saturate_s8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(u16, s8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Load 8 u16 lanes, widen to u32, transform in f32, then
// saturate-narrow s32 -> s16 -> s8 (vqmovn.s16, signed).
__asm__ (
"vld1.8 {d4-d5}, [%[src1]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vqmovn.s16 d28, q13 \n\t"
"vst1.8 {d28}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Narrow with signed saturation: s32 -> s16 -> s8.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
vst1_s8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> u16 scaled conversion (in-type): dst = saturate_u16(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC1(u16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 u16 lanes to u32, transform in f32, then unsigned
// saturate-narrow s32 -> u16.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovun.s32 d26, q11 \n\t"
"vqmovun.s32 d27, q12 \n\t"
"vst1.16 {d26-d27}, [%[dst]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vshift), "w" (vscale)
// BUGFIX: the vld1 above writes d4/d5 (q2); they were missing from the
// clobber list, letting the compiler assume q2 survives the asm.
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC1(u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Unsigned saturating narrow back to u16.
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s16 scaled conversion: dst = saturate_s16(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(u16, s16, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 u16 lanes to u32, transform in f32, then signed
// saturate-narrow s32 -> s16.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vst1.16 {d26-d27}, [%[dst]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vshift), "w" (vscale)
// BUGFIX: the vld1 above writes d4/d5 (q2); they were missing from the
// clobber list, letting the compiler assume q2 survives the asm.
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, s16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Signed saturating narrow to s16.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s32 scaled conversion: dst = (s32)(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(u16, s32, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 u16 lanes to u32, transform in f32, convert to s32 and
// store two 4-lane results.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vst1.32 {d22-d23}, [%[dst1]] \n\t"
"vst1.32 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i),
[dst2] "r" (_dst + i + 4),
"w" (vshift), "w" (vscale)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, s32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> f32 scaled conversion: dst = src * alpha + beta.
// No 0.5 bias here — the destination stays in floating point.
CVTS_FUNC(u16, f32, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 u16 lanes to two f32 vectors and store the affine result.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vst1.32 {d18-d19}, [%[dst1]] \n\t"
"vst1.32 {d20-d21}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, f32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vst1q_f32(_dst + i + 0, vline1_f32);
vst1q_f32(_dst + i + 4, vline2_f32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> u8 scaled conversion: dst = saturate_u8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s16, u8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, then saturate-narrow
// s32 -> s16 -> u8 (final narrow is unsigned-saturating).
__asm__ (
"vld1.8 {d4-d5}, [%[src1]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vqmovun.s16 d28, q13 \n\t"
"vst1.8 {d28}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Narrow with signed saturation to s16, then unsigned saturation to u8.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
vst1_u8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> s8 scaled conversion: dst = saturate_s8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s16, s8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, then signed
// saturate-narrow s32 -> s16 -> s8.
__asm__ (
"vld1.8 {d4-d5}, [%[src1]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vqmovn.s16 d28, q13 \n\t"
"vst1.8 {d28}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Narrow with signed saturation: s32 -> s16 -> s8.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
vst1_s8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> u16 scaled conversion: dst = saturate_u16(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s16, u16, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, then unsigned
// saturate-narrow s32 -> u16 (vqmovun).
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovun.s32 d26, q11 \n\t"
"vqmovun.s32 d27, q12 \n\t"
"vst1.16 {d26-d27}, [%[dst]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, u16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Unsigned saturating narrow to u16 (negatives clamp to 0).
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> s16 scaled conversion (in-type): dst = saturate_s16(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC1(s16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, then signed
// saturate-narrow s32 -> s16.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vst1.16 {d26-d27}, [%[dst]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vshift), "w" (vscale)
// BUGFIX: the vld1 above writes d4/d5 (q2); they were missing from the
// clobber list, letting the compiler assume q2 survives the asm.
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC1(s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Signed saturating narrow back to s16.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> s32 scaled conversion: dst = (s32)(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s16, s32, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, convert to s32 and
// store two 4-lane results.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vst1.32 {d22-d23}, [%[dst1]] \n\t"
"vst1.32 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, s32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> f32 scaled conversion: dst = src * alpha + beta.
// No 0.5 bias here — the destination stays in floating point.
CVTS_FUNC(s16, f32, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to two f32 vectors and store the affine result.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vst1.32 {d18-d19}, [%[dst1]] \n\t"
"vst1.32 {d20-d21}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, f32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vst1q_f32(_dst + i + 0, vline1_f32);
vst1q_f32(_dst + i + 4, vline2_f32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s32 -> u8 scaled conversion: dst = saturate_u8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s32, u8, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Two s32 loads, transform in f32, then saturate-narrow
// s32 -> u16 (vqmovun) -> u8 (vqmovn.u16).
__asm__ (
"vld1.32 {d4-d5}, [%[src1]] \n\t"
"vld1.32 {d6-d7}, [%[src2]] \n\t"
"vcvt.f32.s32 q4, q2 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vmul.f32 q6, q4, q0 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vadd.f32 q8, q6, q1 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vcvt.s32.f32 q10, q8 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vqmovun.s32 d24, q10 \n\t"
"vqmovun.s32 d25, q11 \n\t"
"vqmovn.u16 d26, q12 \n\t"
"vst1.8 {d26}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i + 0),
[src2] "r" (_src + i + 4),
[dst] "r" (_dst + i),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s32, u8, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Unsigned-saturating narrow to u16, then plain u16 -> u8 saturation.
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
vst1_u8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, s8, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.32 {d4-d5}, [%[src1]] \n\t"
"vld1.32 {d6-d7}, [%[src2]] \n\t"
"vcvt.f32.s32 q4, q2 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vmul.f32 q6, q4, q0 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vadd.f32 q8, q6, q1 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vcvt.s32.f32 q10, q8 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vqmovn.s32 d24, q10 \n\t"
"vqmovn.s32 d25, q11 \n\t"
"vqmovn.s16 d26, q12 \n\t"
"vst1.8 {d26}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i + 0),
[src2] "r" (_src + i + 4),
[dst] "r" (_dst + i),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
);
}
})
#else
// s32 -> s8 conversion: dst = saturate_cast<s8>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(s32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift);
        // back to s32, then narrow twice with signed saturation
        int16x4_t vnarrow_lo = vqmovn_s32(vcvtq_s32_f32(vf_lo));
        int16x4_t vnarrow_hi = vqmovn_s32(vcvtq_s32_f32(vf_hi));
        vst1_s8(_dst + i, vqmovn_s16(vcombine_s16(vnarrow_lo, vnarrow_hi)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s32 -> u16 conversion: dst = saturate_cast<u16>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7); vscale/vshift are
// pinned to q0/q1 for direct use inside the asm body.
CVTS_FUNC(s32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x s32, scale+shift in f32, convert back to s32, then
        // saturating-narrow to unsigned 16-bit (vqmovun) and store 8 x u16
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vcvt.s32.f32 q10, q8                                    \n\t"
            "vcvt.s32.f32 q11, q9                                    \n\t"
            "vqmovun.s32 d24, q10                                    \n\t"
            "vqmovun.s32 d25, q11                                    \n\t"
            "vst1.16 {d24-d25}, [%[dst]]                             \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
// s32 -> u16 conversion: dst = saturate_cast<u16>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(s32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift);
        // back to s32, then saturating-narrow to u16 and store all 8 lanes
        uint16x4_t vnarrow_lo = vqmovun_s32(vcvtq_s32_f32(vf_lo));
        uint16x4_t vnarrow_hi = vqmovun_s32(vcvtq_s32_f32(vf_hi));
        vst1q_u16(_dst + i, vcombine_u16(vnarrow_lo, vnarrow_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s32 -> s16 conversion: dst = saturate_cast<s16>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
// NOTE(review): the final store uses "vst1.8" while sibling blocks use
// "vst1.16" for 16-bit data; the store width (16 bytes) is the same and only
// the element size annotation differs -- verify intent against upstream.
CVTS_FUNC(s32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x s32, scale+shift in f32, convert back to s32, then
        // saturating-narrow to signed 16-bit and store 8 x s16
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vcvt.s32.f32 q10, q8                                    \n\t"
            "vcvt.s32.f32 q11, q9                                    \n\t"
            "vqmovn.s32 d24, q10                                     \n\t"
            "vqmovn.s32 d25, q11                                     \n\t"
            "vst1.8 {d24-d25}, [%[dst]]                              \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
// s32 -> s16 conversion: dst = saturate_cast<s16>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(s32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift);
        // back to s32, then saturating-narrow to s16 and store all 8 lanes
        int16x4_t vnarrow_lo = vqmovn_s32(vcvtq_s32_f32(vf_lo));
        int16x4_t vnarrow_hi = vqmovn_s32(vcvtq_s32_f32(vf_hi));
        vst1q_s16(_dst + i, vcombine_s16(vnarrow_lo, vnarrow_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// In-type s32 scale/shift: dst = saturate_cast<s32>(src * alpha + beta + 0.5).
// The computation is done in f32, so very large magnitudes lose precision.
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC1(s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x s32, scale+shift in f32, convert back and store 8 x s32
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vcvt.s32.f32 q10, q8                                    \n\t"
            "vcvt.s32.f32 q11, q9                                    \n\t"
            "vst1.32 {d20-d21}, [%[dst1]]                            \n\t"
            "vst1.32 {d22-d23}, [%[dst2]]                            \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
        );
    }
})
#else
// In-type s32 scale/shift: dst = src * alpha + beta computed in f32
// (with a +0.5 bias before the truncating conversion back to s32).
CVTS_FUNC1(s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float, apply the affine transform, convert back
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift);
        vst1q_s32(_dst + i,     vcvtq_s32_f32(vf_lo));
        vst1q_s32(_dst + i + 4, vcvtq_s32_f32(vf_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s32 -> f32 conversion: dst = src * alpha + beta. No +0.5 bias here since
// the result stays in floating point (no truncating conversion follows).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(s32, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x s32, convert to f32, scale+shift, store 8 x f32
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vst1.32 {d16-d17}, [%[dst1]]                            \n\t"
            "vst1.32 {d18-d19}, [%[dst2]]                            \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// s32 -> f32 conversion: dst = src * alpha + beta. No rounding bias is
// needed because the destination type is floating point.
CVTS_FUNC(s32, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float, apply the affine transform, store as f32
        vst1q_f32(_dst + i,     vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift));
        vst1q_f32(_dst + i + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> u8 conversion using 16.16 fixed point: alpha and beta are
// pre-multiplied by 2^16, the products are converted to u32, adjusted via a
// mask built with vbic against the constant 1<<16, then rounded down by 16
// bits with saturation (vqrshrn) and narrowed to u8.
// NOTE(review): the two vbic adjustments are asymmetric -- q6 holds the
// SECOND lane group before the shift is added, while q7 holds the FIRST lane
// group after the shift (mirrored by the intrinsics fallback below in the
// original file). Confirm against the upstream carotene sources before
// modifying.
CVTS_FUNC(f32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha));
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta));
    register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d6-d7}, [%[src1]]                              \n\t"
            "vld1.32 {d8-d9}, [%[src2]]                              \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vcvt.u32.f32 q9, q7                                     \n\t"
            "vcvt.u32.f32 q10, q8                                    \n\t"
            "vbic q11, q2, q6                                        \n\t"
            "vbic q12, q2, q7                                        \n\t"
            "vshr.u32 q13, q11, #16                                  \n\t"
            "vshr.u32 q14, q12, #16                                  \n\t"
            "vqsub.u32 q7, q9, q13                                   \n\t"
            "vqsub.u32 q8, q10, q14                                  \n\t"
            "vqrshrn.u32 d22, q7, #16                                \n\t"
            "vqrshrn.u32 d23, q8, #16                                \n\t"
            "vqmovn.u16 d30, q11                                     \n\t"
            "vst1.8 {d30}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift), "w" (vmask)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
        );
    }
})
#else
// f32 -> u8 conversion using 16.16 fixed point: alpha and beta are
// pre-multiplied by 2^16, the scaled values are converted to u32, adjusted
// via a bit-clear mask against the constant 1<<16, then rounded down by 16
// bits with saturation (vqrshrn_n_u32) and narrowed to u8.
// NOTE(review): the two mask computations are asymmetric -- vline1Mask is
// derived from vline2_f32 (second lane group, BEFORE the shift) while
// vline2Mask is derived from vline1Shifted_f32 (first lane group, AFTER the
// shift). This mirrors the inline-asm variant above, but looks suspicious;
// verify against the upstream carotene sources before changing.
CVTS_FUNC(f32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha));
    float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta));
    uint32x4_t vmask = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        // scale by alpha * 2^16, then add beta * 2^16
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift);
        float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift);
        // truncating conversion to unsigned 16.16 fixed point
        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32);
        // build correction masks from the raw float bit patterns
        uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32));
        uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32));
        vline1Mask = vshrq_n_u32(vline1Mask, 16);
        vline2Mask = vshrq_n_u32(vline2Mask, 16);
        // apply the correction with saturating subtraction
        vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask);
        vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask);
        // round-shift out the 16 fractional bits, then narrow to u8
        uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16);
        uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> s8 conversion: dst = saturate_cast<s8>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(f32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, convert to s32, then saturating-narrow
        // s32->s16 (into d14/d15 = q7) and s16->s8, storing 8 bytes
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q8, q6                                     \n\t"
            "vcvt.s32.f32 q9, q7                                     \n\t"
            "vqmovn.s32 d14, q8                                      \n\t"
            "vqmovn.s32 d15, q9                                      \n\t"
            "vqmovn.s16 d16, q7                                      \n\t"
            "vst1.8 {d16}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// f32 -> s8 conversion: dst = saturate_cast<s8>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(f32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift);
        // convert to s32, then narrow twice with signed saturation
        int16x4_t vnarrow_lo = vqmovn_s32(vcvtq_s32_f32(vf_lo));
        int16x4_t vnarrow_hi = vqmovn_s32(vcvtq_s32_f32(vf_hi));
        vst1_s8(_dst + i, vqmovn_s16(vcombine_s16(vnarrow_lo, vnarrow_hi)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> u16 conversion: dst = saturate_cast<u16>(src * alpha + beta + 0.5),
// going through an unsigned u32 conversion. Inline-asm variant for old
// 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(f32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, convert to u32, then saturating-narrow
        // to u16 and store 8 x u16
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.u32.f32 q8, q6                                     \n\t"
            "vcvt.u32.f32 q9, q7                                     \n\t"
            "vqmovn.u32 d8, q8                                       \n\t"
            "vqmovn.u32 d9, q9                                       \n\t"
            "vst1.16 {d8-d9}, [%[dst]]                               \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// f32 -> u16 conversion: dst = saturate_cast<u16>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->u32 conversion.
CVTS_FUNC(f32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift);
        // convert to u32, then saturating-narrow to u16 and store all 8 lanes
        uint16x4_t vnarrow_lo = vqmovn_u32(vcvtq_u32_f32(vf_lo));
        uint16x4_t vnarrow_hi = vqmovn_u32(vcvtq_u32_f32(vf_hi));
        vst1q_u16(_dst + i, vcombine_u16(vnarrow_lo, vnarrow_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> s16 conversion: dst = saturate_cast<s16>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(f32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, convert to s32, then saturating-narrow
        // to s16 and store 8 x s16
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q8, q6                                     \n\t"
            "vcvt.s32.f32 q9, q7                                     \n\t"
            "vqmovn.s32 d8, q8                                       \n\t"
            "vqmovn.s32 d9, q9                                       \n\t"
            "vst1.16 {d8-d9}, [%[dst]]                               \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// f32 -> s16 conversion: dst = saturate_cast<s16>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(f32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift);
        // convert to s32, then saturating-narrow to s16 and store all 8 lanes
        int16x4_t vnarrow_lo = vqmovn_s32(vcvtq_s32_f32(vf_lo));
        int16x4_t vnarrow_hi = vqmovn_s32(vcvtq_s32_f32(vf_hi));
        vst1q_s16(_dst + i, vcombine_s16(vnarrow_lo, vnarrow_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> s32 conversion: dst = saturate_cast<s32>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(f32, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, convert to s32 (reusing q4/q5) and
        // store 8 x s32
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q4, q6                                     \n\t"
            "vcvt.s32.f32 q5, q7                                     \n\t"
            "vst1.32 {d8-d9}, [%[dst1]]                              \n\t"
            "vst1.32 {d10-d11}, [%[dst2]]                            \n\t"
            : //no output
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
        );
    }
})
#else
// f32 -> s32 conversion: dst = src * alpha + beta, with a +0.5 bias added
// before the truncating float->int conversion.
CVTS_FUNC(f32, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift);
        // truncating conversion back to s32 and store
        vst1q_s32(_dst + i,     vcvtq_s32_f32(vf_lo));
        vst1q_s32(_dst + i + 4, vcvtq_s32_f32(vf_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// In-type f32 scale/shift: dst = src * alpha + beta. No +0.5 bias since no
// integer conversion follows. Inline-asm variant for old 32-bit GCC 4.x.
// NOTE(review): the clobber list includes d16-d19 although the highest
// register written here is q7 (d14/d15) -- harmless over-clobbering.
CVTS_FUNC1(f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, store 8 x f32
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vst1.32 {d12-d13}, [%[dst1]]                            \n\t"
            "vst1.32 {d14-d15}, [%[dst2]]                            \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// In-type f32 scale/shift: dst = src * alpha + beta. No rounding bias is
// needed because no integer conversion is involved.
CVTS_FUNC1(f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values, transform, store
        vst1q_f32(_dst + i,     vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift));
        vst1q_f32(_dst + i + 4, vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift));
    }
})
#endif
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
namespace CAROTENE_NS {
// Reports whether convolution() below can handle the given configuration:
// NEON available, image at least 8 pixels wide, a 3x3 kernel, and a
// constant or replicate border mode.
bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
                            BORDER_MODE border)
{
    if (!isSupportedConfiguration() || size.width < 8)
        return false;
    const bool borderSupported = (border == BORDER_MODE_CONSTANT) ||
                                 (border == BORDER_MODE_REPLICATE);
    return borderSupported && (ksize.width == 3) && (ksize.height == 3);
}
#ifdef CAROTENE_NEON
namespace {

// Wrapper for vshrq_n_s32: the shift amount must be a compile-time
// immediate, so each possible shift gets its own instantiation; convolution()
// below selects one through a function-pointer table indexed by `scale`.
template <int shift>
int32x4_t vshrq_s32(int32x4_t value)
{
    return vshrq_n_s32(value, shift);
}

// Shift by 0 is not a valid vshrq_n_s32 immediate; pass the value through.
template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
    return value;
}

} // namespace

// Uniform signature so a run-time `scale` can pick the right instantiation.
typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
#endif
// 3x3 convolution of a u8 image with an s16 kernel and power-of-two
// normalization: dst(x, y) = saturate_cast<u8>(sum >> scale).
// The kernel is applied in flipped order (kernelBase[8] multiplies the
// top-left neighbour), i.e. a true convolution rather than a correlation.
// Only BORDER_MODE_CONSTANT and BORDER_MODE_REPLICATE are accepted -- see
// isConvolutionSupported(). `scale` must be in [0, 32] (it indexes the
// shift table below).
void convolution(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE border, u8 borderValue,
                 const Size2D & ksize, s16 * kernelBase, u32 scale)
{
    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
#ifdef CAROTENE_NEON
    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);
    const int32x4_t v_zero_s32 = vdupq_n_s32(0);

    // Sliding window of three 8-pixel column triples (top/mid/bottom rows);
    // tprev/tcurr/tnext hold the previous, current and next 8-pixel chunks.
    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;
    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    // vshrq_n_s32 needs an immediate shift, so pick the instantiation that
    // matches the run-time `scale` from this table (index 0 is a no-op).
    static const vshrq_s32_func vshrq_s32_a[33] =
    {
        vshrq_s32<0>,
        vshrq_s32<1>,
        vshrq_s32<2>,
        vshrq_s32<3>,
        vshrq_s32<4>,
        vshrq_s32<5>,
        vshrq_s32<6>,
        vshrq_s32<7>,
        vshrq_s32<8>,
        vshrq_s32<9>,
        vshrq_s32<10>,
        vshrq_s32<11>,
        vshrq_s32<12>,
        vshrq_s32<13>,
        vshrq_s32<14>,
        vshrq_s32<15>,
        vshrq_s32<16>,
        vshrq_s32<17>,
        vshrq_s32<18>,
        vshrq_s32<19>,
        vshrq_s32<20>,
        vshrq_s32<21>,
        vshrq_s32<22>,
        vshrq_s32<23>,
        vshrq_s32<24>,
        vshrq_s32<25>,
        vshrq_s32<26>,
        vshrq_s32<27>,
        vshrq_s32<28>,
        vshrq_s32<29>,
        vshrq_s32<30>,
        vshrq_s32<31>,
        vshrq_s32<32>
    };
    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // Row pointers for the rows above/at/below y. A NULL pointer marks
        // a constant-border row (outside the image); replicate mode clamps.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        // Scalar-tail state: previous/current/next column values per row.
        u8 prevx[3] = { 0, 0, 0 },
           currx[3] = { 0, 0, 0 },
           nextx[3] = { 0, 0, 0 };
        ptrdiff_t x = 0;
        // For the last rows, stop vector processing one chunk early so the
        // scalar tail below handles the right edge.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            // 8 pixels from each of the three rows; NULL row => border value
            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx[0] = prevx[1] = prevx[2] = borderValue;
                else
                {
                    prevx[0] = srow0 ? srow0[x4] : borderValue;
                    prevx[1] =         srow1[x4]              ;
                    prevx[2] = srow2 ? srow2[x4] : borderValue;
                }

                currx[0] = srow0 ? srow0[x3] : borderValue;
                currx[1] =         srow1[x3]              ;
                currx[2] = srow2 ? srow2[x3] : borderValue;
            }

            // make shift
            if (x)
            {
                tprev[0] = tcurr[0];
                tcurr[0] = tnext[0];

                tprev[1] = tcurr[1];
                tcurr[1] = tnext[1];

                tprev[2] = tcurr[2];
                tcurr[2] = tnext[2];
            }

            tnext[0] = x0;
            tnext[1] = x1;
            tnext[2] = x2;

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                {
                    // replicate the first pixel of each row across the chunk
                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
                }
                // first iteration only fills the pipeline; output starts at x=8
                continue;
            }

            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;

            // Top row: left/center/right neighbours hit kernel[8], [7], [6].
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[0], tcurr[0], 7);
                t1 = tcurr[0];
                t2 = vext_u8(tcurr[0], tnext[0], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
            }

            // Middle row: kernel[5], [4], [3].
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[1], tcurr[1], 7);
                t1 = tcurr[1];
                t2 = vext_u8(tcurr[1], tnext[1], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
            }

            // Bottom row: kernel[2], [1], [0].
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[2], tcurr[2], 7);
                t1 = tcurr[2];
                t2 = vext_u8(tcurr[2], tnext[2], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
            }

            // make scale
            v_dst0 = vshrq_s32_p(v_dst0);
            v_dst1 = vshrq_s32_p(v_dst1);

            // and add them: saturate s32 -> u16 -> u8 and store the 8 results
            // for the previous chunk (output lags input by one chunk)
            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
                                                          vqmovun_s32(v_dst1))));
        }

        // Scalar tail: redo the last columns (and the right edge) per pixel.
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                {
                    nextx[0] = borderValue;
                    nextx[1] = borderValue;
                    nextx[2] = borderValue;
                }
                else if (border == BORDER_MODE_REPLICATE)
                {
                    nextx[0] = srow0[x];
                    nextx[1] = srow1[x];
                    nextx[2] = srow2[x];
                }
            }
            else
            {
                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
                nextx[1] =         srow1[x + 1]              ;
                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
            }

            // accumulate the 3x3 neighbourhood with the flipped kernel
            s32 val = 0;
            for (s32 _y = 0; _y < 3; ++_y)
                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];

            drow[x] = internal::saturate_cast<u8>(val >> scale);

            // make shift
            prevx[0] = currx[0];
            currx[0] = nextx[0];

            prevx[1] = currx[1];
            currx[1] = nextx[1];

            prevx[2] = currx[2];
            currx[2] = nextx[2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
    (void)ksize;
    (void)kernelBase;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <limits>
namespace CAROTENE_NS {
// Counts non-zero u8 elements, returning INT32_MAX if the count overflows.
// Vector strategy: clamp each byte to {0,1} via vminq_u8 with 1 and sum the
// bytes for at most 255 iterations per block (so the u8 lanes cannot wrap),
// then widen and fold the block sum into the scalar total.
s32 countNonZero(const Size2D &_size,
                 const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Collapse to a single logical row when the rows are contiguous.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw16 = size.width & ~15u;  // width rounded down to multiple of 16

    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u8* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

        // 16 lanes x 255 iterations: largest block whose per-lane u8 sums
        // cannot overflow. NOTE(review): the macro is never #undef'ed.
#define COUNTNONZERO8U_BLOCK_SIZE (16*255)
        uint8x16_t vc1 = vmovq_n_u8(1);
        for (; i < roiw16;)
        {
            size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
            uint8x16_t vs = vmovq_n_u8(0);

            for (; i <= lim; i+= 16)
            {
                internal::prefetch(src + i);
                uint8x16_t vln = vld1q_u8(src + i);
                uint8x16_t vnz = vminq_u8(vln, vc1);  // 1 where element != 0
                vs = vaddq_u8(vs, vnz);
            }

            // widen 16 x u8 partial sums -> 4 x u32 -> 2 x u32
            uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
            s32 s[2];
            // stored through a u32 pointer; a negative s32 afterwards means
            // the unsigned sum crossed 2^31
            vst1_u32((u32*)s, vs2);

            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...
            {
                return 0x7fFFffFF;
            }
            result += (s[0] += s[1]);
            if (s[0] < 0 || result < 0)
            {
                return 0x7fFFffFF;
            }
        }
        // scalar tail for the remaining (width % 16) elements
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }

    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
// Counts non-zero u16 elements, returning INT32_MAX if the count overflows.
// Same blocked strategy as the u8 overload: clamp each lane to {0,1} and
// accumulate in u16 lanes for at most 65535 iterations per block so the
// lanes cannot wrap, then widen and fold into the scalar total.
s32 countNonZero(const Size2D &_size,
                 const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Collapse to a single logical row when the rows are contiguous.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;  // width rounded down to multiple of 8

    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u16* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

        // 8 lanes x (2^16 - 1) iterations: largest block whose per-lane u16
        // sums cannot overflow. NOTE(review): the macro is never #undef'ed.
#define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
        uint16x8_t vc1 = vmovq_n_u16(1);
        for (; i < roiw8;)
        {
            size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
            uint16x8_t vs = vmovq_n_u16(0);

            for (; i <= lim; i+= 8)
            {
                internal::prefetch(src + i);
                uint16x8_t vln = vld1q_u16(src + i);
                uint16x8_t vnz = vminq_u16(vln, vc1);  // 1 where element != 0
                vs = vaddq_u16(vs, vnz);
            }

            // widen 8 x u16 partial sums -> 4 x u32 -> 2 x u32
            uint32x4_t vs4 = vpaddlq_u16(vs);
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
            s32 s[2];
            // stored through a u32 pointer; a negative s32 afterwards means
            // the unsigned sum crossed 2^31
            vst1_u32((u32*)s, vs2);

            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
            {
                return 0x7fFFffFF;
            }
            result += (s[0] += s[1]);
            if (s[0] < 0 || result < 0)
            {
                return 0x7fFFffFF;
            }
        }
        // scalar tail for the remaining (width % 8) elements
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }

    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
// Counts non-zero s32 elements, returning INT32_MAX if the count overflows.
// The row is reinterpreted as u32 so the non-zero test can be done with an
// unsigned min against 1; the per-lane counts are accumulated with
// saturating adds (vqaddq_u32), so they clamp rather than wrap.
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Collapse to a single logical row when the rows are contiguous.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;  // width rounded down to multiple of 4

    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        // non-zero test works on the bit pattern, so u32 view is equivalent
        const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k);
        u32 i = 0;

        uint32x4_t vc1 = vmovq_n_u32(1);
        uint32x4_t vs = vmovq_n_u32(0);
        for (; i < roiw4; i += 4 )
        {
            internal::prefetch(src + i);
            uint32x4_t vln = vld1q_u32(src + i);
            uint32x4_t vnz = vminq_u32(vln, vc1);  // 1 where element != 0
            vs = vqaddq_u32(vs, vnz);
        }

        // fold 4 lanes to 2 with a saturating add
        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
        s32 s[2];
        // stored through a u32 pointer; a negative s32 afterwards means the
        // unsigned sum crossed 2^31
        vst1_u32((u32*)s, vs2);

        if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)
        {
            return 0x7fFFffFF;
        }
        // scalar tail for the remaining (width % 4) elements
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }

    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/// Count non-zero f32 elements in a 2D region.
/// @param _size     region size (width/height in elements)
/// @param srcBase   pointer to the first row
/// @param srcStride row stride in BYTES
/// @return number of non-zero elements, saturated to INT32_MAX on overflow
s32 countNonZero(const Size2D &_size,
                 const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // srcStride is in bytes: a fully packed f32 row has
    // stride == width * sizeof(f32) (the previous comparison against the
    // element count could never match a valid packed layout).
    if (srcStride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const f32* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;
        float32x4_t vc0 = vmovq_n_f32(0);
        int32x4_t vs = vmovq_n_s32(0);
        for (; i < roiw4; i += 4 )
        {
            internal::prefetch(src + i);
            float32x4_t vln = vld1q_f32(src + i);
            // Lane becomes all-ones (-1 as s32) when the element is non-zero.
            int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
            // Accumulate the -1s with saturation; negated into a count below.
            vs = vqaddq_s32(vs, vnz);
        }
        int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
        int s[2];
        vst1_s32(s, vs2);
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        // Scalar tail: values of magnitude below FLT_MIN (subnormals) count
        // as zero here — presumably mirroring NEON flush-to-zero behaviour
        // of the vector path.
        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;
        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
/// Count non-zero f64 elements in a 2D region.
/// Each double is inspected as two u32 halves: a double is non-zero iff
/// either half (after masking off the sign bit) is non-zero.
/// @param _size     region size (width/height in elements)
/// @param srcBase   pointer to the first row
/// @param srcStride row stride in BYTES
/// @return number of non-zero elements, saturated to INT32_MAX on overflow
s32 countNonZero(const Size2D &_size,
                 const f64 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // NOTE(review): srcStride is a byte stride but is compared to the
    // element count; a fully packed f64 row has stride == width * sizeof(f64),
    // so this single-row fast path looks like it can never trigger on a valid
    // packed layout — confirm intended.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;
    size_t roiw4 = size.width & ~3u;
    size_t roiw2 = size.width & ~1u;
    // Clearing the sign bit makes -0.0 look like +0.0 below; any other bit
    // pattern (including denormals) is counted as non-zero.
    uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
    uint32x4_t vc0 = vmovq_n_u32(0);
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const f64* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;
        // Four independent accumulators, each gathering -1 per non-zero double.
        int32x2_t vs1 = vmov_n_s32(0);
        int32x2_t vs2 = vmov_n_s32(0);
        int32x2_t vs3 = vmov_n_s32(0);
        int32x2_t vs4 = vmov_n_s32(0);
        // Main loop: 8 doubles per iteration. vceq/vmvn marks non-zero u32
        // halves; vpmax folds the two halves of each double into one
        // all-ones (-1) lane per non-zero double.
        for (; i < roiw8; i += 8 )
        {
            internal::prefetch(src + i + 6);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
            uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
            uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint64x2_t vm3 = vandq_u64(vln3, vmask1);
            uint64x2_t vm4 = vandq_u64(vln4, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
            uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            uint32x4_t vlx3 = vmvnq_u32(vequ3);
            uint32x4_t vlx4 = vmvnq_u32(vequ4);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
            int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            vs3 = vqadd_s32(vs3, vnz3);
            vs4 = vqadd_s32(vs4, vnz4);
        }
        // Tail of 4 doubles, same scheme.
        if (i < roiw4)
        {
            internal::prefetch(src + i + 2);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            i += 4;
        }
        // Tail of 2 doubles.
        if (i < roiw2)
        {
            internal::prefetch(src + i);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            vs1 = vqadd_s32(vs1, vnz1);
            i += 2;
        }
        // Merge the four accumulators (saturating) and negate the summed
        // -1s into a positive count.
        vs1 = vqadd_s32(vs1, vs2);
        vs3 = vqadd_s32(vs3, vs4);
        vs1 = vqadd_s32(vs1, vs3);
        int32x2_t vsneg = vqneg_s32(vs1);
        s32 s[2];
        vst1_s32(s, vsneg);
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        // NOTE(review): this scalar tail treats subnormals (|x| < DBL_MIN)
        // as zero, while the vector path above counts them as non-zero —
        // confirm which behaviour is intended.
        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min())?0:1;
        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>
namespace CAROTENE_NS {
namespace {
#ifdef CAROTENE_NEON
// Round each f32 lane to the nearest integer, halfway cases away from zero:
// ORs the lane's sign bit onto 0.5f so +0.5/-0.5 is added before a
// subsequent truncating float->int conversion.
inline float32x4_t vroundq(const float32x4_t& v)
{
    // Build the sign-bit mask in unsigned arithmetic: `1 << 31` on a signed
    // int shifts into the sign bit, which is UB in C++11 and only
    // implementation-defined before C++20.
    const int32x4_t signMask = vreinterpretq_s32_u32(vdupq_n_u32(1u << 31)),
                    half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
    return vaddq_f32(v, v_addition);
}
// Element-wise scaled division with saturation for packed integer vectors:
// per lane, saturate_cast<T>(round(scale * v1 / v2)).
// The generic version widens each half to 32-bit lanes, recurses down to the
// 32-bit specialisations below, then narrows back with saturation (vqmovn).
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                                            internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                                            internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: divide via f32 reciprocal approximation; vroundq adds
// +/-0.5 so the truncating conversion rounds to nearest, away from zero.
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
// 64-bit (2-lane) variant of vroundq: round to nearest, halfway away from
// zero, by adding +/-0.5 with the lane's sign before truncation.
inline float32x2_t vround(const float32x2_t& v)
{
    // Sign-bit mask built in unsigned arithmetic: `1 << 31` on a signed int
    // is UB in C++11 / implementation-defined before C++20.
    const int32x2_t signMask = vreinterpret_s32_u32(vdup_n_u32(1u << 31)),
                    half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
    float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
    return vadd_f32(v, v_addition);
}
// 64-bit-register counterpart of divSaturateQ: per lane,
// saturate_cast<T>(round(scale * v1 / v2)). The generic version widens to
// 32-bit lanes and narrows back with saturation.
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases: f32 reciprocal division, rounded to nearest via vround.
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
// Element-wise scaled division with wrap-around (modular) narrowing:
// per lane, (T)(scale * v1 / v2) truncated. Same recursion scheme as
// divSaturateQ, but narrows with vmovn (wrap) and skips the rounding step.
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                                       internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                                       internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: f32 reciprocal division, truncating conversion.
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
// 64-bit-register counterpart of divWrapQ: per lane, truncated
// (T)(scale * v1 / v2) with wrap-around narrowing.
template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases: f32 reciprocal division, truncating conversion.
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
// Overload sets dispatching to the NEON "test bits" intrinsics: each lane
// becomes all-ones when (v0 & v1) != 0, else all-zeros. Callers use
// vtstq(v, v) / vtst(v, v) to build a "lane is non-zero" mask. The signed
// variants reinterpret the unsigned mask so the result matches the operand
// type and can be ANDed with it directly.
inline uint8x16_t vtstq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t  vtstq(const int8x16_t   & v0, const int8x16_t   & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
inline int16x8_t  vtstq(const int16x8_t   & v0, const int16x8_t   & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t  vtstq(const int32x4_t   & v0, const int32x4_t   & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }
// 64-bit-register variants.
inline uint8x8_t  vtst(const uint8x8_t  & v0, const uint8x8_t  & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t   vtst(const int8x8_t   & v0, const int8x8_t   & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
inline int16x4_t  vtst(const int16x4_t  & v0, const int16x4_t  & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t  vtst(const int32x2_t  & v0, const int32x2_t  & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
#endif
// Element-wise scaled division for integer element types:
//   dst = cpolicy(scale * src0 / src1), with dst = 0 wherever src1 == 0.
// cpolicy selects saturating (CONVERT_POLICY_SATURATE) or wrap-around
// conversion of the quotient back to T. Strides are in bytes.
template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
    static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif
    // Fast path: when scale == 0, or |scale| is so small that even the
    // largest numerator divided by 1 still rounds to zero, every output
    // element is 0 — just clear the destination rows.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         (scale * std::numeric_limits<T>::max()) < 1.0f &&
         (scale * std::numeric_limits<T>::max()) > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }
    // Number of positions at which a full 128-bit (resp. 64-bit) load fits.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;
        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);
                // vtstq(x, x) is all-ones where the divisor is non-zero;
                // ANDing zeroes out the divide-by-zero lanes.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);
                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
            }
            // Scalar tail with the same zero-divisor convention.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);
                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap to T via s32.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
#ifdef CAROTENE_NEON
// Scaled reciprocal (scale / x) for packed integers with saturating
// narrowing. The generic version widens to 32-bit lanes and recurses to the
// specialisations below; note the result is truncated (no rounding step,
// unlike divSaturateQ).
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: f32 reciprocal approximation scaled by 'scale'.
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
// 64-bit-register counterpart.
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
    return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
// Scaled reciprocal (scale / x) for packed integers with wrap-around
// narrowing (vmovn instead of vqmovn); the 32-bit base cases are identical
// to the saturating variants since conversion saturation only matters when
// narrowing back to the element type.
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
// 64-bit-register counterpart.
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
    return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
#endif
// Element-wise scaled reciprocal for integer element types:
//   dst = cpolicy(scale / src1), with dst = 0 wherever src1 == 0.
// cpolicy selects saturating or wrap-around conversion. Strides in bytes.
template <typename T>
void recip(const Size2D &size,
           const T * src1Base, ptrdiff_t src1Stride,
           T * dstBase, ptrdiff_t dstStride,
           f32 scale,
           CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
    static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif
    // Fast path: |scale| < 1 means scale / x truncates to 0 for every
    // non-zero integer x (and zero divisors also produce 0), so just clear
    // the destination rows.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         scale <  1.0f &&
         scale > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }
    // Number of positions at which a full 128-bit (resp. 64-bit) load fits.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;
        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);
                // vtstq(x, x) is all-ones where the divisor is non-zero;
                // ANDing zeroes out the divide-by-zero lanes.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);
                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
            }
            // Scalar tail with the same zero-divisor convention.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);
                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap to T via s32.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
}
// Public entry points for integer element-wise division; each forwards to
// the internal div<T> template above with an explicit template argument
// (explicit, so overload resolution cannot pick these wrappers recursively).
void div(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise f32 division: dst = src0 * scale / src1.
// Unlike the integer overloads, zero divisors are NOT special-cased: lanes
// divide by the NEON reciprocal approximation, so behaviour for src1 == 0
// follows that approximation. Strides are in bytes.
void div(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride,
         f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 makes every quotient 0: just clear the destination.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }
    // Positions where a full 4-lane (resp. 2-lane) load still fits.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
    // Specialised loop for scale ~= 1 skips the extra multiply per vector.
    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;
            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                // Divide via refined reciprocal approximation.
                vst1q_f32(dst + j, vmulq_f32(v_src0, internal::vrecpq_f32(v_src1)));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, vmul_f32(v_src0, internal::vrecp_f32(v_src1)));
            }
            // Scalar tail uses an exact division.
            for (; j < size.width; j++)
            {
                dst[j] = src0[j] / src1[j];
            }
        }
    }
    else
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;
            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                vst1q_f32(dst + j, vmulq_f32(vmulq_n_f32(v_src0, scale),
                                             internal::vrecpq_f32(v_src1)));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, vmul_f32(vmul_n_f32(v_src0, scale),
                                           internal::vrecp_f32(v_src1)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src0[j] * scale / src1[j];
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
// Public entry points for integer element-wise reciprocal (scale / x);
// each forwards to the internal recip<T> template above.
void reciprocal(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
                const s8 * srcBase, ptrdiff_t srcStride,
                s8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise f32 reciprocal: dst = scale / src.
// Zero inputs are not special-cased; lanes use the NEON reciprocal
// approximation. Strides are in bytes.
void reciprocal(const Size2D &size,
                const f32 * srcBase, ptrdiff_t srcStride,
                f32 * dstBase, ptrdiff_t dstStride,
                f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 makes every output 0: just clear the destination.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }
    // Positions where a full 4-lane (resp. 2-lane) load still fits.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
    // Specialised loop for scale ~= 1 skips the extra multiply per vector.
    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;
            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                vst1q_f32(dst + j, internal::vrecpq_f32(v_src1));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, internal::vrecp_f32(v_src1));
            }
            // Scalar tail uses an exact division.
            for (; j < size.width; j++)
            {
                dst[j] = 1.0f / src1[j];
            }
        }
    }
    else
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;
            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                vst1q_f32(dst + j, vmulq_n_f32(internal::vrecpq_f32(v_src1), scale));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, vmul_n_f32(internal::vrecp_f32(v_src1), scale));
            }
            for (; j < size.width; j++)
            {
                dst[j] = scale / src1[j];
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
/// Dot product of two u8 images: sum over all pixels of src0[i] * src1[i],
/// accumulated exactly in integer arithmetic and returned as f64.
/// Strides are in bytes; rows are processed as one run when both images are
/// packed (stride == width, valid for 1-byte elements).
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
        size_t i = 0;
        // 64-bit accumulator for the whole row; u32 lanes are flushed into
        // it at every block boundary before they can overflow.
        uint64x2_t ws = vmovq_n_u64(0);
        while(i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;
            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);
            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);
                // Widening multiplies: 8-bit * 8-bit -> 16-bit products,
                // pairwise-accumulated into the u32 lanes.
                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));
                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }
            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }
        // One 8-element vector tail.
        if(i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);
            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }
        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);
        // Scalar tail for the last width % 8 elements.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
/// Dot product of two s8 images: sum over all pixels of src0[i] * src1[i],
/// accumulated exactly in integer arithmetic and returned as f64.
/// Strides are in bytes; rows are processed as one run when both images are
/// packed (stride == width, valid for 1-byte elements).
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
        size_t i = 0;
        // 64-bit accumulator for the whole row; s32 lanes are flushed into
        // it at every block boundary before they can overflow.
        int64x2_t ws = vmovq_n_s64(0);
        while(i + 16 <= size.width)
        {
            // Use the signed block size computed above (the original reused
            // the u8 overload's smaller DOT_UINT_BLOCKSIZE and left
            // DOT_INT_BLOCKSIZE unused).
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;
            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);
            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);
                // Widening multiplies: 8-bit * 8-bit -> 16-bit products,
                // pairwise-accumulated into the s32 lanes.
                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));
                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }
            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }
        // One 8-element vector tail.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);
            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }
        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);
        // Scalar tail for the last width % 8 elements.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Both inputs dense? Then fold the image into one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * row0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * row1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t idx = 0;
        while (idx + 4 <= size.width)
        {
            // Flush the f32 accumulator into the f64 total at least every
            // DOT_FLOAT_BLOCKSIZE elements to bound rounding error.
            size_t blockEnd = std::min(idx + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t vacc = vdupq_n_f32(0.0f);

            for ( ; idx <= blockEnd; idx += 4 )
            {
                internal::prefetch(row0 + idx);
                internal::prefetch(row1 + idx);
                vacc = vmlaq_f32(vacc, vld1q_f32(row0 + idx), vld1q_f32(row1 + idx));
            }

            // Horizontal reduction of the 4 accumulator lanes.
            float32x2_t vpair = vpadd_f32(vget_low_f32(vacc), vget_high_f32(vacc));
            result += vget_lane_f32(vpair, 0) + vget_lane_f32(vpair, 1);
        }

        if (idx + 2 <= size.width)
        {
            float32x2_t vprod = vmul_f32(vld1_f32(row0 + idx), vld1_f32(row1 + idx));
            result += vget_lane_f32(vprod, 0) + vget_lane_f32(vprod, 1);
            idx += 2;
        }

        // Scalar tail (at most one element here).
        for ( ; idx < size.width; ++idx)
            result += row0[idx] * row1[idx];
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
} // namespace CAROTENE_NS
// This file is needed for compilation on some platforms e.g. with XCode generator
// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
Below is the original copyright and the references */
/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
*Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
*Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
*Neither the name of the University of Cambridge nor the names of
its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
The references are:
* Machine learning for high-speed corner detection,
E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace
{
void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
{
    // Fills pixel[0..15] with the linearized offsets of the 16 pixels on the
    // Bresenham circle of radius 3, stored clockwise starting at (0, +3 rows).
    static const ptrdiff_t circle[16][2] =
    {
        { 0,  3}, { 1,  3}, { 2,  2}, { 3,  1},
        { 3,  0}, { 3, -1}, { 2, -2}, { 1, -3},
        { 0, -3}, {-1, -3}, {-2, -2}, {-3, -1},
        {-3,  0}, {-3,  1}, {-2,  2}, {-1,  3}
    };
    for (size_t k = 0; k < 16; ++k)
        pixel[k] = circle[k][0] + row_stride * circle[k][1];
}
// Computes the FAST corner score for the pixel at ptr using the circle
// offsets in pixel[]: the strongest arc response over all 16 window starts,
// evaluated with NEON by sliding 9-pixel min/max windows across the
// center-minus-circle differences.
u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
{
// N = 16 + K + 1 wraps the circle so every start position has a full
// 9-pixel arc available without modular indexing.
const s32 K = 8, N = 16 + K + 1;
s32 k, v = ptr[0];
// d[k] = center value minus circle pixel k; the size is rounded up to a
// multiple of 8 so the 8-lane loads below never read past the array.
s16 d[(N + 7) & ~7];
for( k = 0; k < N; k++ )
d[k] = (s16)(v - ptr[pixel[k]]);
// q0 accumulates max-over-arcs of (min over each arc), q1 the min of
// (max over each arc); seeded outside the reachable difference range.
int16x8_t q0 = vdupq_n_s16((s16)(-1000));
int16x8_t q1 = vdupq_n_s16((s16)(1000));
// The 25 differences viewed as four overlapping 8-lane vectors.
int16x8_t d0_7 = vld1q_s16(d + 0);
int16x8_t d8_15 = vld1q_s16(d + 8);
int16x8_t d16_23 = vld1q_s16(d + 16);
int16x8_t d24 = vld1q_s16(d + 24);
//k == 0
// Arcs starting at offsets 0..7: each vextq shift exposes the next element
// of every lane's sliding window; ak0/bk0 hold the running min/max of
// window elements 1..8 (element 0 / 9 are folded in afterwards).
int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
int16x8_t ak0 = vminq_s16(v0k0, v1k0);
int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 3);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 4);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 5);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 6);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 7);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
ak0 = vminq_s16(ak0, d8_15);
bk0 = vmaxq_s16(bk0, d8_15);
// Complete the 9-element windows with the first element (d0_7) and with
// the element one past the window (shift by 1), updating q0/q1.
q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));
v1k0 = vextq_s16(d8_15, d16_23, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));
//k == 8
// Same pattern for arcs starting at offsets 8..15.
int16x8_t v0k8 = v1k0;
int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
int16x8_t ak8 = vminq_s16(v0k8, v1k8);
int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 3);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 4);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 5);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 6);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 7);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
ak8 = vminq_s16(ak8, d16_23);
bk8 = vmaxq_s16(bk8, d16_23);
q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));
v1k8 = vextq_s16(d16_23, d24, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));
//fin
// Score is max(q0, -q1): the stronger of the "darker arc" and "brighter
// arc" responses; then reduce the 8 lanes horizontally to one maximum.
int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
int32x4_t q2w = vmovl_s16(q2);
int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));
// Minus 1 so that a strict ">" threshold test at this score still passes.
return (u8)(vget_lane_s32(q8, 0) - 1);
}
} //namespace
#endif
// FAST-9 corner detector (Rosten & Drummond). Scans the image with a NEON
// segment test, optionally computes per-corner scores and applies 3x3
// non-maximum suppression over a 3-row ring buffer before reporting
// keypoints through the KeypointStore callback.
void FAST(const Size2D &size,
u8 *srcBase, ptrdiff_t srcStride,
KeypointStore *keypoints,
u8 threshold, bool nonmax_suppression)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
//keypoints.clear();
// A corner needs more than K consecutive circle pixels all brighter or all
// darker than the center; pixel[] holds the 16 circle offsets, wrapped to
// length N so runs can cross the start of the circle.
const s32 K = 8, N = 16 + K + 1;
ptrdiff_t i, j, k, pixel[N];
makeOffsets(pixel, srcStride);
for(k = 16; k < N; k++)
pixel[k] = pixel[k - 16];
// delta (0x80) biases u8 values into s8 range so signed compares work.
uint8x16_t delta = vdupq_n_u8(128);
uint8x16_t t = vdupq_n_u8(threshold);
uint8x16_t K16 = vdupq_n_u8((u8)K);
// threshold_tab[d + 255]: 1 = much darker than center, 2 = much brighter,
// 0 = within threshold. Used by the scalar fallback path below.
u8 threshold_tab[512];
for( i = -255; i <= 255; i++ )
threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
// Ring buffers over three rows: buf[x] holds per-pixel corner scores,
// cpbuf[x] the corner x-positions (slot -1 stores the per-row count,
// hence the "+ 1" when carving cpbuf out of _buf).
std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128);
u8* buf[3];
buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
ptrdiff_t* cpbuf[3];
cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
cpbuf[1] = cpbuf[0] + size.width + 1;
cpbuf[2] = cpbuf[1] + size.width + 1;
memset(buf[0], 0, size.width*3);
// Rows 0-2 and the last two rows are skipped: the radius-3 circle must fit.
for(i = 3; i < (ptrdiff_t)size.height-2; i++)
{
const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
u8* curr = buf[(i - 3)%3];
ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
memset(curr, 0, size.width);
ptrdiff_t ncorners = 0;
if( i < (ptrdiff_t)size.height - 3 )
{
j = 3;
// Vectorized pass: test 16 candidate pixels at once.
for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
{
internal::prefetch(ptr);
internal::prefetch(ptr + pixel[0]);
internal::prefetch(ptr + pixel[2]);
uint8x16_t v0 = vld1q_u8(ptr);
// v1/v2: center-threshold and center+threshold, bias-shifted to s8.
int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));
// Quick rejection using the four compass points (offsets 0, 4, 8, 12):
// a 9-run must include two adjacent compass points that are both
// brighter (m0) or both darker (m1) than the center band.
int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
m0 = vorrq_u8(m0, m1);
u64 mask[2];
vst1q_u64(mask, vreinterpretq_u64_u8(m0));
if( mask[0] == 0 )
{
// Only the upper 8 candidates survived: step back so they are
// re-examined as the lower half of the next 16-pixel block.
if (mask[1] != 0)
{
j -= 8;
ptr -= 8;
}
continue;
}
// Full test: c0/c1 count consecutive brighter/darker circle pixels
// per lane (reset to 0 on mismatch); max0/max1 track the longest run.
uint8x16_t c0 = vmovq_n_u8(0);
uint8x16_t c1 = vmovq_n_u8(0);
uint8x16_t max0 = vmovq_n_u8(0);
uint8x16_t max1 = vmovq_n_u8(0);
for( k = 0; k < N; k++ )
{
int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
m0 = vcgtq_s8(x, v2);
m1 = vcgtq_s8(v1, x);
// c - m adds 1 where the mask is all-ones; AND clears broken runs.
c0 = vandq_u8(vsubq_u8(c0, m0), m0);
c1 = vandq_u8(vsubq_u8(c1, m1), m1);
max0 = vmaxq_u8(max0, c0);
max1 = vmaxq_u8(max1, c1);
}
max0 = vmaxq_u8(max0, max1);
u8 m[16];
// Lanes whose longest run exceeds K are corners.
vst1q_u8(m, vcgtq_u8(max0, K16));
for( k = 0; k < 16; ++k )
if(m[k])
{
cornerpos[ncorners++] = j+k;
if(nonmax_suppression)
curr[j+k] = cornerScore(ptr+k, pixel);
}
}
// Scalar fallback for the remaining (< 16 + border) columns, using the
// lookup table for fast bright/dark classification.
for( ; j < (s32)size.width - 3; j++, ptr++ )
{
s32 v = ptr[0];
const u8* tab = &threshold_tab[0] - v + 255;
// d accumulates which polarity (1 = darker, 2 = brighter) is still
// possible after checking opposite pairs of circle pixels.
s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
// Darker arc still possible: look for a run of more than K pixels.
if( d & 1 )
{
s32 vt = v - threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x < vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
// Brighter arc still possible.
if( d & 2 )
{
s32 vt = v + threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x > vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
}
}
// Stash this row's corner count one slot before the positions.
cornerpos[-1] = ncorners;
// Output lags one row so each candidate has a complete 3x3 score
// neighborhood (pprev / prev / curr) for non-maximum suppression.
if( i == 3 )
continue;
const u8* prev = buf[(i - 4 + 3)%3];
const u8* pprev = buf[(i - 5 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3];
ncorners = cornerpos[-1];
for( k = 0; k < ncorners; k++ )
{
j = cornerpos[k];
s32 score = prev[j];
if( !nonmax_suppression ||
(score > prev[j+1] && score > prev[j-1] &&
score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
{
// i-1 because the reported corner comes from the previous row.
keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score);
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)keypoints;
(void)threshold;
(void)nonmax_suppression;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Scans src[j0..j1) of row i and appends the (x, y) coordinates of every
// element equal to minVal / maxVal into the respective location arrays.
// Counts advance by 2 (one x slot, one y slot) per hit and keep growing past
// the capacity so the caller can detect truncation; stores are only performed
// while within capacity.
template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    for (size_t col = j0; col < j1; ++col)
    {
        const T v = src[col];
        const bool hitMax = (v == maxVal);
        const bool hitMin = (v == minVal);

        if (hitMax)
        {
            if (maxLocCount < maxLocCapacity)
            {
                maxLocPtr[maxLocCount]     = col;
                maxLocPtr[maxLocCount + 1] = i;
            }
            maxLocCount += 2;
        }
        if (hitMin)
        {
            if (minLocCount < minLocCapacity)
            {
                minLocPtr[minLocCount]     = col;
                minLocPtr[minLocCount + 1] = i;
            }
            minLocCount += 2;
        }
    }
}
} // namespace
#endif
// Collects the coordinates of all pixels equal to minVal / maxVal. NEON path
// compares 16 (then 8) pixels at a time and calls the scalar process() only
// for blocks that contain at least one hit. Counts are doubled internally
// (x, y slot pairs) and halved back before returning; process() lets them
// exceed capacity so callers can detect truncation.
void fillMinMaxLocs(const Size2D & size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Last column where a full 16- / 8-pixel vector load still fits in the row.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);
uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);
// Per-lane hit masks are stored here so 64-bit halves can be tested cheaply.
u64 mask[2] = { 0ul };
// Switch counts/capacities to slot units (2 slots per recorded location).
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
// Lane is all-ones if the pixel equals either extremum.
uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);
vst1q_u8((u8 *)&mask[0], v_mask);
// Only rescan halves that actually contain a hit.
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], v_mask);
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Scalar tail for the remaining (< 8) pixels.
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Back from slot units to location-pair counts.
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
// u16 variant: collects coordinates of pixels equal to minVal / maxVal.
// Compares 16 (then 8) elements per step; 16-bit hit masks are narrowed to
// one byte per lane before being tested as two 64-bit words. Counts use
// doubled (x, y) slot units internally, halved back before returning.
void fillMinMaxLocs(const Size2D & size,
const u16 * srcBase, ptrdiff_t srcStride,
u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Last column where a full 16- / 8-element vector step still fits.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
v_minval8 = vdupq_n_u16(minVal);
u64 mask[2] = { 0ul };
// Slot units: two entries (x, y) per recorded location.
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
// Lane is all-ones if the element equals either extremum.
uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));
// Narrow both masks to one byte per lane for the scalar tests below.
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
uint16x8_t v_src = vld1q_u16(src + j);
uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Scalar tail for the remaining (< 8) elements.
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Back from slot units to location-pair counts.
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
// s16 variant: collects coordinates of pixels equal to minVal / maxVal.
// Structure mirrors the u16 overload: 16- then 8-element NEON compares,
// masks narrowed to bytes, scalar process() only for blocks with hits.
// Counts use doubled (x, y) slot units internally.
void fillMinMaxLocs(const Size2D & size,
const s16 * srcBase, ptrdiff_t srcStride,
s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Last column where a full 16- / 8-element vector step still fits.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
v_minval8 = vdupq_n_s16(minVal);
u64 mask[2] = { 0ul };
// Slot units: two entries (x, y) per recorded location.
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
// Lane is all-ones if the element equals either extremum.
uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));
// Narrow both masks to one byte per lane for the scalar tests below.
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
int16x8_t v_src = vld1q_s16(src + j);
uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Scalar tail for the remaining (< 8) elements.
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Back from slot units to location-pair counts.
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
// s32 variant: collects coordinates of pixels equal to minVal / maxVal.
// Compares 8 elements per NEON step and falls back to the scalar process()
// only for blocks that contain at least one hit. Counts are kept in doubled
// (x, y) slot units internally and halved back before returning.
void fillMinMaxLocs(const Size2D & size,
                    const s32 * srcBase, ptrdiff_t srcStride,
                    s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Last column where a full 8-element vector step still fits in the row.
    size_t vecWidth = size.width >= 7 ? size.width - 7 : 0;
    int32x4_t v_max = vdupq_n_s32(maxVal),
              v_min = vdupq_n_s32(minVal);
    u64 hitBits = 0ul;

    // Slot units: two entries (x, y) per recorded location.
    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const s32 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        for ( ; col < vecWidth; col += 8)
        {
            internal::prefetch(src + col);
            int32x4_t chunk0 = vld1q_s32(src + col);
            int32x4_t chunk1 = vld1q_s32(src + col + 4);
            // Lane is all-ones if the element equals either extremum.
            uint32x4_t hit0 = vorrq_u32(vceqq_s32(chunk0, v_max), vceqq_s32(chunk0, v_min));
            uint32x4_t hit1 = vorrq_u32(vceqq_s32(chunk1, v_max), vceqq_s32(chunk1, v_min));
            // Narrow the two 32-bit masks down to one byte per lane.
            vst1_u8((u8 *)&hitBits, vmovn_u16(vcombine_u16(vmovn_u32(hit0), vmovn_u32(hit1))));
            if (hitBits)
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        // Scalar tail for the remaining (< 8) elements.
        process(src, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // Back from slot units to location-pair counts.
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
// u32 variant: collects coordinates of pixels equal to minVal / maxVal.
// Structure mirrors the s32 overload: 8 elements per NEON step, 32-bit hit
// masks narrowed to one byte per lane, scalar process() only for blocks with
// hits. Counts use doubled (x, y) slot units internally.
void fillMinMaxLocs(const Size2D & size,
const u32 * srcBase, ptrdiff_t srcStride,
u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Last column where a full 8-element vector step still fits in the row.
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),
v_minval4 = vdupq_n_u32(minVal);
u64 mask = 0ul;
// Slot units: two entries (x, y) per recorded location.
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u32 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);
// Lane is all-ones if the element equals either extremum.
uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));
uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));
// Narrow both 32-bit masks down to one byte per lane.
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
if (mask)
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Scalar tail for the remaining (< 8) elements.
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Back from slot units to location-pair counts.
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
// Reports whether flip() can handle the given mode / element size. Vertical
// flips are plain row copies, so any element size is accepted; horizontal
// and both-axis flips rely on vector reversal of 1-4 byte elements.
bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
{
    if (!isSupportedConfiguration())
        return false;

    if (flipMode == FLIP_VERTICAL_MODE)
        return true;

    const bool supportedElemSize = (elemSize >= 1) && (elemSize <= 4);
    return supportedElemSize &&
           ((flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE));
}
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void flip(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t js = 0, jd = size.width;
for (; js < roiw_base; js += step_base, jd -= step_base)
{
prefetch(src + js);
vec128 v_src = vld1q(src + js);
vec128 v_dst = vrev64q(v_src);
v_dst = vcombine(vget_high(v_dst), vget_low(v_dst));
vst1q(dst + jd - step_base, v_dst);
}
for (; js < roiw_tail; js += step_tail, jd -= step_tail)
{
vec64 v_src = vld1(src + js);
vst1(dst + jd - step_tail, vrev64(v_src));
}
for (--jd; js < size.width; ++js, --jd)
dst[jd] = src[js];
}
}
template <typename T>
void flip3(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
#ifndef __ANDROID__
typedef typename VecTraits<T, 3>::vec128 vec128;
#endif
typedef typename VecTraits<T, 3>::vec64 vec64;
#ifndef __ANDROID__
u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3;
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
#endif
u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t j = 0, js = 0, jd = size.width * 3;
#ifndef __ANDROID__
for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3)
{
prefetch(src + js);
vec128 v_src = vld3q(src + js), v_dst;
v_src.val[0] = vrev64q(v_src.val[0]);
v_src.val[1] = vrev64q(v_src.val[1]);
v_src.val[2] = vrev64q(v_src.val[2]);
v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0]));
v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1]));
v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2]));
vst3q(dst + jd - step_base3, v_dst);
}
#endif // __ANDROID__
for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3)
{
vec64 v_src = vld3(src + js), v_dst;
v_dst.val[0] = vrev64(v_src.val[0]);
v_dst.val[1] = vrev64(v_src.val[1]);
v_dst.val[2] = vrev64(v_src.val[2]);
vst3(dst + jd - step_tail3, v_dst);
}
for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3)
{
dst[jd] = src[js];
dst[jd + 1] = src[js + 1];
dst[jd + 2] = src[js + 2];
}
}
}
// Common signature of the flip<T> / flip3<T> instantiations so the public
// flip() dispatcher can select one by element size at runtime.
typedef void (* flipFunc)(const Size2D &size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode);
} // namespace
#endif
// Public flip entry point: handles the pure vertical case directly (row
// copies in reverse order) and dispatches horizontal / both-axis flips to
// the element-size-specific NEON implementation.
void flip(const Size2D &size,
          const u8 * srcBase, ptrdiff_t srcStride,
          u8 * dstBase, ptrdiff_t dstStride,
          FLIP_MODE flipMode, u32 elemSize)
{
    internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
#ifdef CAROTENE_NEON
    if (flipMode == FLIP_VERTICAL_MODE)
    {
        // Rows are copied unchanged, bottom-up.
        for (size_t y = 0; y < size.height; ++y)
        {
            const u8 * src_row = internal::getRowPtr(srcBase, srcStride, y);
            u8 * dst_row = internal::getRowPtr(dstBase, dstStride, size.height - y - 1);
            std::memcpy(dst_row, src_row, elemSize * size.width);
        }
        return;
    }

    // Pick the implementation matching the element size (1/2/4 bytes single
    // channel, or 3 x 1-byte channels).
    flipFunc func = NULL;
    switch (elemSize)
    {
    case sizeof(u8):
        func = &flip<u8>;
        break;
    case sizeof(u16):
        func = &flip<u16>;
        break;
    case sizeof(u32):
        func = &flip<u32>;
        break;
    case sizeof(u8) * 3:
        func = &flip3<u8>;
        break;
    default:
        break;
    }

    if (func == NULL)
        return;

    func(size,
         srcBase, srcStride,
         dstBase, dstStride,
         flipMode);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)flipMode;
    (void)elemSize;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include "separable_filter.hpp"
namespace CAROTENE_NS {
// Reports whether the NEON 3x3 Gaussian is available for this configuration:
// rows must be at least 8 pixels wide (one full NEON iteration) and only
// constant / replicate border extrapolation is implemented.
bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border)
{
    bool baseOk = isSupportedConfiguration() && size.width >= 8;
    bool borderOk = (border == BORDER_MODE_CONSTANT) ||
                    (border == BORDER_MODE_REPLICATE);
    return baseOk && borderOk;
}
// 3x3 Gaussian blur with separable kernel (1 2 1)/4 per axis (total /16).
// The vertical pass is vectorized with 8-lane NEON; the horizontal pass is
// fused into the same loop via a 3-vector sliding window (tprev/tcurr/tnext).
// Border handling: for BORDER_MODE_CONSTANT, out-of-image rows are replaced
// by borderValue (srow0/srow2 set to NULL); for REPLICATE the nearest row
// index is clamped.
void gaussianBlur3x3(const Size2D &size,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border));
#ifdef CAROTENE_NEON
    // borderValue * 4 == constant-border result of the vertical 1-2-1 pass.
    const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2);
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);

    // Sliding window of vertical-pass results (each lane = s0 + 2*s1 + s2).
    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // NULL marks a constant-border row; otherwise clamp to valid range.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        // Scalar vertical-pass values used by the tail loop below.
        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // On the last rows stop one vector early so the tail loop can handle
        // the right border without reading past the row end.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue);

                currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue);
            }

            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value: x0 + 2*x1 + x2, widened to u16.
            tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1));

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x4;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));

                continue;
            }

            // combine 3 "shifted" vectors: left/center/right neighbours of tcurr.
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);

            // and add them: t0 + 2*t1 + t2, then >>4 narrows to the final u8.
            t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2));

            vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4));
        }

        x -= 8;
        if (x == width)
            --x;

        // Scalar tail: finish the remaining pixels and the right border.
        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue << 2;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + (srow1[x] << 1) + srow0[x];
            }
            else
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        (srow1[x + 1] << 1) +
                        (srow0 ? srow0[x + 1] : borderValue);

            f32 val = (prevx + (currx << 1) + nextx) >> 4;
            drow[x] = internal::saturate_cast<u8>((s32)val);

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
// Support check for the margin-aware 3x3 Gaussian: it delegates entirely to
// the generic separable 3x3 filter check (the two zeros are the row/column
// filter shift arguments of that helper).
bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin)
{
    const bool supported = isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin);
    return supported;
}
// 3x3 Gaussian blur that honours a surrounding Margin (valid image data
// outside the processed ROI). Implemented via the generic separable-filter
// machinery with 1-2-1 row and column kernels.
void gaussianBlur3x3Margin(const Size2D &size,
                           const u8 * srcBase, ptrdiff_t srcStride,
                           u8 * dstBase, ptrdiff_t dstStride,
                           BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin));
#ifdef CAROTENE_NEON
    // RowFilter3x3S16_121 / ColFilter3x3U8_121 implement the separable
    // (1 2 1) passes; the two zeros are the filter shift parameters.
    internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3U8_121>::process(
        size, srcBase, srcStride, dstBase, dstStride,
        0, 0, border, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
// Reports whether the NEON 5x5 Gaussian can run: 1..4 channels, a region
// large enough for the vector kernels (width >= 8, height >= 2), and one of
// the five implemented border-extrapolation modes.
bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border)
{
    bool borderOk;
    switch (border)
    {
    case BORDER_MODE_CONSTANT:
    case BORDER_MODE_REFLECT101:
    case BORDER_MODE_REFLECT:
    case BORDER_MODE_REPLICATE:
    case BORDER_MODE_WRAP:
        borderOk = true;
        break;
    default:
        borderOk = false;
        break;
    }

    return isSupportedConfiguration() &&
           cn > 0 && cn <= 4 &&
           size.width >= 8 && size.height >= 2 &&
           borderOk;
}
// 5x5 Gaussian blur (u8), separable kernel (1 4 6 4 1) per axis; the combined
// weight is 256, so results are rounded with +(1<<7) and shifted right by 8.
// The vertical pass writes u16 sums into a single reusable line buffer
// ("lane"), then the horizontal pass reads that buffer with +-1/+-2 element
// offsets and narrows back to u8. Per-channel interleaving is handled either
// by vld2/vld3/vld4 de-interleaving loads or, on 32-bit ARM GCC, by inline
// NEON assembly.
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    // For constant borders, rows outside the image are replaced by a dummy
    // row pre-filled with borderValue.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Interpolated element indices for the two out-of-image columns on each
    // side, used to fill the lane buffer's guard cells for non-constant modes.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // 2*cn guard cells on each side + 32-byte alignment slack; "lane" points
    // at the first in-image element, so lane[-2*cn .. colsn+2*cn) is valid.
    std::vector<u16> _buf(cn * (size.width + 4) + 32 / sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // Constant-border guard cells never change, so fill them once up front.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // Rows whose interpolated index falls outside the accessible area
        // (image + top margin) use the borderValue dummy row.
        const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 8; x += 8)
        {
            // Prefetch a nearby source row; the row offset rotates with x
            // (heuristic, exact choice presumably tuned — not load-bearing).
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));

            uint8x8_t v0 = vld1_u8(ln0+x);
            uint8x8_t v1 = vld1_u8(ln1+x);
            uint8x8_t v2 = vld1_u8(ln2+x);
            uint8x8_t v3 = vld1_u8(ln3+x);
            uint8x8_t v4 = vld1_u8(ln4+x);

            // v0 + v4 + 6*v2 + 4*(v1 + v3), accumulated in u16.
            uint16x8_t v = vaddl_u8(v0, v4);
            uint16x8_t v13 = vaddl_u8(v1, v3);

            v = vmlal_u8(v, v2, vc6u8);
            v = vmlaq_u16(v, v13, vc4u16);

            vst1q_u16(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x];

        //left&right borders
        // Non-constant modes refresh the guard cells from the interpolated
        // in-row positions for every line.
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x <= colsn - 8; x += 8)
            {
                internal::prefetch(lane + x);

                uint16x8_t lane0 = vld1q_u16(lane + x - 2);
                uint16x8_t lane4 = vld1q_u16(lane + x + 2);
                uint16x8_t lane1 = vld1q_u16(lane + x - 1);
                uint16x8_t lane3 = vld1q_u16(lane + x + 1);
                uint16x8_t lane2 = vld1q_u16(lane + x + 0);

                uint16x8_t ln04 = vaddq_u16(lane0, lane4);
                uint16x8_t ln13 = vaddq_u16(lane1, lane3);

                uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16);
                uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16);

                // Rounding narrow: (sum + 128) >> 8 back to u8.
                uint8x8_t ls = vrshrn_n_u16(lsw, 8);

                vst1_u8(dst + x, ls);
            }
            break;
        case 2:
            for (; x <= colsn - 8*2; x += 8*2)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*2;
                u16* lidx1 = lane + x - 1*2;
                u16* lidx3 = lane + x + 1*2;
                u16* lidx4 = lane + x + 2*2;
// Hand-written NEON asm only for old 32-bit GCC (< 4.7); newer compilers use
// the intrinsic path below.
// NOTE(review): the asm writes through [out] without a "memory" clobber —
// appears to rely on __volatile__; verify if the toolchain guard is relaxed.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
                __asm__ __volatile__ (
                    "vld2.16 {d0, d2}, [%[in0]]!                              \n\t"
                    "vld2.16 {d1, d3}, [%[in0]]                               \n\t"
                    "vld2.16 {d8, d10}, [%[in4]]!                             \n\t"
                    "vld2.16 {d9, d11}, [%[in4]]                              \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vld2.16 {d16, d18}, [%[in1]]!                            \n\t"
                    "vld2.16 {d17, d19}, [%[in1]]                             \n\t"
                    "vld2.16 {d8, d10}, [%[in3]]!                             \n\t"
                    "vld2.16 {d9, d11}, [%[in3]]                              \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vld2.16 {d16, d18}, [%[in2]]                             \n\t"
                    "vld2.16 {d17, d19}, [%[in22]]                            \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vst2.8 {d8-d9}, [%[out]]                                 \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // Intrinsic path: de-interleave 2 channels, filter each
                // channel plane independently, re-interleave on store.
                uint16x8x2_t vLane0 = vld2q_u16(lidx0);
                uint16x8x2_t vLane1 = vld2q_u16(lidx1);
                uint16x8x2_t vLane2 = vld2q_u16(lane + x);
                uint16x8x2_t vLane3 = vld2q_u16(lidx3);
                uint16x8x2_t vLane4 = vld2q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);

                uint8x8x2_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);

                vst2_u8(dst + x, vRes);
#endif
            }
            break;
        case 3:
            for (; x <= colsn - 8*3; x += 8*3)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*3;
                u16* lidx1 = lane + x - 1*3;
                u16* lidx3 = lane + x + 1*3;
                u16* lidx4 = lane + x + 2*3;
// 32-bit ARM GCC takes the hand-scheduled asm; everything else uses intrinsics.
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld3.16 {d0, d2, d4}, [%[in0]]!                          \n\t"
                    "vld3.16 {d1, d3, d5}, [%[in0]]                           \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in4]]!                        \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in4]]                         \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vadd.i16 q2, q6                                          \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in1]]!                       \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in1]]                        \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in3]]!                        \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in3]]                         \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vadd.i16 q6, q10                                         \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in2]]                        \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in22]]                       \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i16 q2, q10, %q[c6]                                 \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vrshrn.u16 d10, q2, #8                                   \n\t"
                    "vst3.8 {d8-d10}, [%[out]]                                \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*3),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // Intrinsic path: 3-channel de-interleave, per-plane filter.
                uint16x8x3_t vLane0 = vld3q_u16(lidx0);
                uint16x8x3_t vLane1 = vld3q_u16(lidx1);
                uint16x8x3_t vLane2 = vld3q_u16(lane + x);
                uint16x8x3_t vLane3 = vld3q_u16(lidx3);
                uint16x8x3_t vLane4 = vld3q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]);
                uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]);
                uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]);
                uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16);

                uint8x8x3_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);

                vst3_u8(dst + x, vRes);
#endif
            }
            break;
        case 4:
            for (; x <= colsn - 8*4; x += 8*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                u16* lidx0 = lane + x - 2*4;
                u16* lidx1 = lane + x - 1*4;
                u16* lidx3 = lane + x + 1*4;
                u16* lidx4 = lane + x + 2*4;
// 32-bit ARM GCC takes the hand-scheduled asm; everything else uses intrinsics.
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld4.16 {d0, d2, d4, d6}, [%[in0]]!                      \n\t"
                    "vld4.16 {d1, d3, d5, d7}, [%[in0]]                       \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in4]]!                   \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in4]]                    \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vadd.i16 q2, q6                                          \n\t"
                    "vadd.i16 q3, q7                                          \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in1]]!                  \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in1]]                   \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in3]]!                   \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in3]]                    \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vadd.i16 q6, q10                                         \n\t"
                    "vadd.i16 q7, q11                                         \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in2],:256]              \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in22],:256]             \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i16 q3, q7, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i16 q2, q10, %q[c6]                                 \n\t"
                    "vmla.i16 q3, q11, %q[c6]                                 \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vrshrn.u16 d10, q2, #8                                   \n\t"
                    "vrshrn.u16 d11, q3, #8                                   \n\t"
                    "vst4.8 {d8-d11}, [%[out]]                                \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*4),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // Intrinsic path: 4-channel de-interleave, per-plane filter.
                uint16x8x4_t vLane0 = vld4q_u16(lidx0);
                uint16x8x4_t vLane2 = vld4q_u16(lidx4);
                uint16x8x4_t vLane4 = vld4q_u16(lidx1);
                uint16x8x4_t vLane6 = vld4q_u16(lidx3);
                uint16x8x4_t vLane8 = vld4q_u16(lane + x);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]);
                uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]);
                uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]);
                uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16);

                uint8x8x4_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);
                vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8);

                vst4_u8(dst + x, vRes);
#endif
            }
            break;
        }

        // Scalar tail of the horizontal pass, one channel plane at a time,
        // with round-to-nearest via +(1 << 7) before the >> 8.
        for (s32 h = 0; h < cn; ++h)
        {
            u16* ln = lane + h;
            u8* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn]
                         + u16(4) * (ln[k-cn] + ln[k+cn])
                         + u16(6) * ln[k] + (1 << 7)) >> 8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
// 5x5 Gaussian blur (u16 overload), same separable (1 4 6 4 1) kernel as the
// u8 version, accumulating in u32 and narrowing back with rounding >> 8.
// The horizontal pass is channel-agnostic here (no per-cn de-interleave):
// for cn > 1 the +-1/+-2 element offsets still address the correct neighbour
// only via the scalar tail; the vector loop treats the buffer as cn-interleaved
// but the guard cells keep the edges consistent.
// NOTE(review): unlike the u8 overload, the vector horizontal loop uses flat
// +-1/+-2 offsets for every cn — for cn > 1 this mixes channels; presumably
// isGaussianBlur5x5Supported callers only hit this with cn == 1. TODO confirm.
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u16 * srcBase, ptrdiff_t srcStride,
                     u16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    // Dummy row used for constant-border rows outside the image.
    std::vector<u16> _tmp;
    u16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Interpolated indices for the two out-of-image columns on each side.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // u32 accumulator line with 2*cn guard cells per side, 32-byte aligned.
    std::vector<u32> _buf(cn * (size.width + 4) + 32 / sizeof(u32));
    u32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    uint16x4_t vc6u16 = vmov_n_u16(6);
    uint32x4_t vc6u32 = vmovq_n_u32(6);
    uint32x4_t vc4u32 = vmovq_n_u32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // Rotating-offset source prefetch (heuristic).
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));

            uint16x4_t v0 = vld1_u16(ln0+x);
            uint16x4_t v1 = vld1_u16(ln1+x);
            uint16x4_t v2 = vld1_u16(ln2+x);
            uint16x4_t v3 = vld1_u16(ln3+x);
            uint16x4_t v4 = vld1_u16(ln4+x);

            // v0 + v4 + 6*v2 + 4*(v1 + v3), widened to u32.
            uint32x4_t v = vaddl_u16(v0, v4);
            uint32x4_t v13 = vaddl_u16(v1, v3);

            v = vmlal_u16(v, v2, vc6u16);
            v = vmlaq_u32(v, v13, vc4u32);

            vst1q_u32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            uint32x4_t lane0 = vld1q_u32(lane + x - 2);
            uint32x4_t lane4 = vld1q_u32(lane + x + 2);
            uint32x4_t lane1 = vld1q_u32(lane + x - 1);
            uint32x4_t lane3 = vld1q_u32(lane + x + 1);
            uint32x4_t lane2 = vld1q_u32(lane + x + 0);

            uint32x4_t ln04 = vaddq_u32(lane0, lane4);
            uint32x4_t ln13 = vaddq_u32(lane1, lane3);

            uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32);
            uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32);

            // Rounding narrow: (sum + 128) >> 8 back to u16.
            uint16x4_t ls = vrshrn_n_u32(lsw, 8);

            vst1_u16(dst + x, ls);
        }
        // Scalar tail with explicit round-to-nearest.
        for (s32 h = 0; h < cn; ++h)
        {
            u32* ln = lane + h;
            u16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
// 5x5 Gaussian blur (s16 overload), separable (1 4 6 4 1) kernel accumulated
// in s32 and narrowed back with rounding >> 8. The cn switch in the
// horizontal pass is currently vestigial: case 4's hand-written asm variant
// is commented out, so all channel counts run the same flat-offset NEON loop.
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s16 * srcBase, ptrdiff_t srcStride,
                     s16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    // Dummy row used for constant-border rows outside the image.
    std::vector<s16> _tmp;
    s16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Interpolated indices for the two out-of-image columns on each side.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // s32 accumulator line with 2*cn guard cells per side, 32-byte aligned.
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    int16x4_t vc6s16 = vmov_n_s16(6);
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // Rotating-offset source prefetch (heuristic).
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));

            int16x4_t v0 = vld1_s16(ln0+x);
            int16x4_t v1 = vld1_s16(ln1+x);
            int16x4_t v2 = vld1_s16(ln2+x);
            int16x4_t v3 = vld1_s16(ln3+x);
            int16x4_t v4 = vld1_s16(ln4+x);

            // v0 + v4 + 6*v2 + 4*(v1 + v3), widened to s32.
            int32x4_t v = vaddl_s16(v0, v4);
            int32x4_t v13 = vaddl_s16(v1, v3);

            v = vmlal_s16(v, v2, vc6s16);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        switch(cn)
        {
        case 1:
        case 2:
        case 3:
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                // Rounding narrow: (sum + 128) >> 8 back to s16.
                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        case 4:
            // Disabled hand-written asm variant for cn == 4 kept for reference.
/*            for (; x <= colsn - 4*4; x += 4*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                ptrdiff_t* lidx0 = lane + x - 2*4;
                ptrdiff_t* lidx1 = lane + x - 1*4;
                ptrdiff_t* lidx3 = lane + x + 1*4;
                ptrdiff_t* lidx4 = lane + x + 2*4;

                __asm__ __volatile__ (
                    "vld4.32 {d0, d2, d4, d6}, [%[in0]]!                      \n\t"
                    "vld4.32 {d1, d3, d5, d7}, [%[in0]]                       \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in4]]!                   \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in4]]                    \n\t"
                    "vadd.i32 q0, q4                                          \n\t"
                    "vadd.i32 q1, q5                                          \n\t"
                    "vadd.i32 q2, q6                                          \n\t"
                    "vadd.i32 q3, q7                                          \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in1]]!                  \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in1]]                   \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in3]]!                   \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in3]]                    \n\t"
                    "vadd.i32 q4, q8                                          \n\t"
                    "vadd.i32 q5, q9                                          \n\t"
                    "vadd.i32 q6, q10                                         \n\t"
                    "vadd.i32 q7, q11                                         \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in2],:256]              \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in22],:256]             \n\t"
                    "vmla.i32 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i32 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i32 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i32 q3, q7, %q[c4]                                  \n\t"
                    "vmla.i32 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i32 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i32 q2, q10, %q[c6]                                 \n\t"
                    "vmla.i32 q3, q11, %q[c6]                                 \n\t"
                    "vrshrn.i32 d8, q0, #8                                    \n\t"
                    "vrshrn.i32 d9, q1, #8                                    \n\t"
                    "vrshrn.i32 d10, q2, #8                                   \n\t"
                    "vrshrn.i32 d11, q3, #8                                   \n\t"
                    "vst4.16 {d8-d11}, [%[out]]                               \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4s32), [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
*/
            // Active path: same flat-offset loop as cases 1-3.
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        }
        // Scalar tail with explicit round-to-nearest.
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
// 5x5 Gaussian weighted-sum (s32 overload). Separable (1 4 6 4 1) kernel,
// but NOTE: unlike the u8/u16/s16 overloads this one performs NO final
// normalization — the stored result is the raw weighted sum (total kernel
// weight 256), as shown by the unshifted vst1q_s32 store and the scalar tail.
// Callers are expected to scale the output themselves.
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s32 * srcBase, ptrdiff_t srcStride,
                     s32 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    // Dummy row used for constant-border rows outside the image.
    std::vector<s32> _tmp;
    s32 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Interpolated indices for the two out-of-image columns on each side.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // s32 accumulator line with 2*cn guard cells per side, 32-byte aligned.
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s32* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // Rotating-offset source prefetch (heuristic).
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));

            int32x4_t v0 = vld1q_s32(ln0+x);
            int32x4_t v1 = vld1q_s32(ln1+x);
            int32x4_t v2 = vld1q_s32(ln2+x);
            int32x4_t v3 = vld1q_s32(ln3+x);
            int32x4_t v4 = vld1q_s32(ln4+x);

            // v0 + v4 + 6*v2 + 4*(v1 + v3), kept in s32 (no widening).
            int32x4_t v = vaddq_s32(v0, v4);
            int32x4_t v13 = vaddq_s32(v1, v3);

            v = vmlaq_s32(v, v2, vc6s32);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            int32x4_t lane0 = vld1q_s32(lane + x - 2);
            int32x4_t lane4 = vld1q_s32(lane + x + 2);
            int32x4_t lane1 = vld1q_s32(lane + x - 1);
            int32x4_t lane3 = vld1q_s32(lane + x + 1);
            int32x4_t lane2 = vld1q_s32(lane + x + 0);

            int32x4_t ln04 = vaddq_s32(lane0, lane4);
            int32x4_t ln13 = vaddq_s32(lane1, lane3);

            int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
            int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

            // Stored without any shift: raw weighted sum.
            vst1q_s32(dst + x, lsw);
        }
        // Scalar tail — likewise unnormalized.
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s32* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k];
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Narrowing stores used by inRangeCheck: write two 128-bit comparison-mask
// vectors out as one run of bytes. The u8 variant stores the 32 mask bytes
// as-is; the u16 and u32 variants narrow each lane to 8 bits first (each
// lane is all-ones or all-zeros, so truncation keeps the 0x00/0xFF result).
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
// Tail handler for inRangeCheck: vectorizes whatever the 32-byte main loop
// leaves over, parameterized on element size. This primary template (used
// for 4-byte elements, s32/f32) is a no-op: fewer than 8 elements remain,
// which is less than one 128-bit vector, so the scalar loop finishes them.
template <typename T, int elsize> struct vtail
{
    static inline void inRange(const T *, const T *, const T *,
                               u8 *, size_t &, size_t)
    {
        //do nothing since there couldn't be enough data
    }
};
// Tail handler for 2-byte elements (u16/s16): at most 15 elements remain
// after the main loop, so at most one extra 8-element (128-bit) step fits.
// Advances x past whatever it consumed; the caller's scalar loop does the rest.
template <typename T> struct vtail<T, 2>
{
    static inline void inRange(const T * src, const T * rng1, const T * rng2,
                               u8 * dst, size_t &x, size_t width)
    {
        typedef typename internal::VecTraits<T>::vec128 vec128;
        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
        //There no more than 15 elements in the tail, so we could handle 8 element vector only once
        if( x + 8 < width)
        {
            vec128 vs = internal::vld1q( src + x);
            vec128 vr1 = internal::vld1q(rng1 + x);
            vec128 vr2 = internal::vld1q(rng2 + x);
            // lane mask is all-ones iff rng1 <= src && src <= rng2
            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            internal::vst1(dst + x, internal::vmovn(vd));
            x+=8;
        }
    }
};
// Tail handler for 1-byte elements (u8/s8): at most 31 elements remain, so
// the possible vector steps are 16+8, 16 alone, or 8 alone. Advances x past
// whatever it consumed; the caller's scalar loop does the rest.
template <typename T> struct vtail<T, 1>
{
    static inline void inRange(const T * src, const T * rng1, const T * rng2,
                               u8 * dst, size_t &x, size_t width)
    {
        typedef typename internal::VecTraits<T>::vec128 vec128;
        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
        typedef typename internal::VecTraits<T>::vec64 vec64;
        typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
        //There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
        if( x + 16 < width)
        {
            // one full 128-bit step: 16 mask bytes stored directly
            vec128 vs = internal::vld1q( src + x);
            vec128 vr1 = internal::vld1q(rng1 + x);
            vec128 vr2 = internal::vld1q(rng2 + x);
            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            internal::vst1q(dst + x, vd);
            x+=16;
        }
        if( x + 8 < width)
        {
            // one 64-bit step for the remaining 8 elements
            vec64 vs = internal::vld1( src + x);
            vec64 vr1 = internal::vld1(rng1 + x);
            vec64 vr2 = internal::vld1(rng2 + x);
            uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
            internal::vst1(dst + x, vd);
            x+=8;
        }
    }
};
// Core inRange implementation: dst[i] = 0xFF where rng1[i] <= src[i] <= rng2[i],
// 0x00 otherwise. vcgeq produces an all-ones/all-zeros lane mask; two masks are
// AND-ed and vnst narrows them to one output byte per element.
template <typename T>
inline void inRangeCheck(const Size2D &_size,
                         const T * srcBase, ptrdiff_t srcStride,
                         const T * rng1Base, ptrdiff_t rng1Stride,
                         const T * rng2Base, ptrdiff_t rng2Stride,
                         u8 * dstBase, ptrdiff_t dstStride)
{
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
    Size2D size(_size);
    // When all four buffers are dense and the strides coincide with the row
    // width, fold the whole image into a single row to cut loop overhead.
    if (srcStride == dstStride &&
        srcStride == rng1Stride &&
        srcStride == rng2Stride &&
        srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // main loop consumes 32 bytes (two 128-bit vectors) of input per step
    const size_t width = size.width & ~( 32/sizeof(T) - 1 );
    for(size_t j = 0; j < size.height; ++j)
    {
        const T * src = internal::getRowPtr( srcBase, srcStride, j);
        const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
        const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
        u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
        size_t i = 0;
        for( ; i < width; i += 32/sizeof(T) )
        {
            internal::prefetch(src + i);
            internal::prefetch(rng1 + i);
            internal::prefetch(rng2 + i);
            vec128 vs = internal::vld1q( src + i);
            vec128 vr1 = internal::vld1q(rng1 + i);
            vec128 vr2 = internal::vld1q(rng2 + i);
            uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            vs = internal::vld1q( src + i + 16/sizeof(T));
            vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
            vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
            uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            vnst(dst + i, vd1, vd2);
        }
        // vectorized tail (element-size dependent), then scalar remainder
        vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
        for( ; i < size.width; i++ )
            dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
    }
}
}
// Public entry points: one inRange() overload per element type, generated
// by macro. With NEON the wrapper forwards to inRangeCheck; without NEON the
// stub only asserts (carotene requires a supported configuration at runtime).
#define INRANGEFUNC(T) \
void inRange(const Size2D &_size, \
             const T * srcBase, ptrdiff_t srcStride, \
             const T * rng1Base, ptrdiff_t rng1Stride, \
             const T * rng2Base, ptrdiff_t rng2Stride, \
             u8 * dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    inRangeCheck(_size, srcBase, srcStride, \
                 rng1Base, rng1Stride, rng2Base, rng2Stride, \
                 dstBase, dstStride); \
}
#else
// NEON unavailable: parameters unnamed because the stub never touches them.
#define INRANGEFUNC(T) \
void inRange(const Size2D &, \
             const T *, ptrdiff_t, \
             const T *, ptrdiff_t, \
             const T *, ptrdiff_t, \
             u8 *, ptrdiff_t) \
{ \
    internal::assertSupportedConfiguration(); \
}
#endif
// instantiate for every supported element type
INRANGEFUNC(u8)
INRANGEFUNC(s8)
INRANGEFUNC(u16)
INRANGEFUNC(s16)
INRANGEFUNC(s32)
INRANGEFUNC(f32)
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
// Inclusive integral image: sum(y, x) accumulates src over the rectangle
// [0..y] x [0..x]. The output has the same dimensions as the input (no extra
// zero row/column). Each row is an in-row prefix sum added to the already
// integrated row above it.
void integral(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u32 * sumBase, ptrdiff_t sumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    uint32x4_t v_zero = vmovq_n_u32(0u);
    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
    uint32x4_t prev = v_zero;   // running row total, broadcast across 4 lanes
    size_t j = 0u;
    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sum + j);
        internal::prefetch(src + j);
        // In-register prefix sum of 8 pixels: shifting the 64-bit lane left
        // by 8/16/24 bits moves the bytes up 1/2/3 positions (zero filled),
        // so after the widening adds el8[k] = x[k]+x[k-1]+x[k-2]+x[k-3].
        uint8x8_t el8shr0 = vld1_u8(src + j);
        uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
        uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
        uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
        uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
        uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
        uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
        // el4h[k] = el8[k] + el8[k+4] extends the prefix to lanes 4..7
        uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
        uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
        uint32x4_t vsumh = vaddw_u16(prev, el4h);
        vst1q_u32(sum + j, vsuml);
        vst1q_u32(sum + j + 4, vsumh);
        // carry the total of these 8 pixels into the next chunk
        prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
    }
    // scalar tail continues from the vector carry (lane 3 holds the row total)
    for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
        sum[j] = (v += src[j]);
    // the others
    for (size_t i = 1; i < size.height ; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
        sum = internal::getRowPtr(sumBase, sumStride, i);
        prev = v_zero;
        j = 0u;
        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sum + j);
            internal::prefetch(src + j);
            // the row above is already fully integrated: just add it in
            uint32x4_t vsuml = vld1q_u32(prevSum + j);
            uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
            uint8x8_t el8shr0 = vld1_u8(src + j);
            uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
            uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
            uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
            vsuml = vaddq_u32(vsuml, prev);
            vsumh = vaddq_u32(vsumh, prev);
            uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
            uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
            uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
            uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
            vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
            vsumh = vaddw_u16(vsumh, el4h);
            vst1q_u32(sum + j, vsuml);
            vst1q_u32(sum + j + 4, vsumh);
            prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
        }
        for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
            sum[j] = (v += src[j]) + prevSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sumBase;
    (void)sumStride;
#endif
}
// Squared integral image: sqsum(y, x) accumulates src^2 over the rectangle
// [0..y] x [0..x], in f64. Same dimensions as the input (no extra zero
// row/column). Per-chunk prefix sums of squares are built in u32 (8 * 255^2
// fits comfortably) and then folded into the f64 running sum.
void sqrIntegral(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 f64 * sqsumBase, ptrdiff_t sqsumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    uint16x8_t v_zero8 = vmovq_n_u16(0u);
    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
    double prev = 0.;   // running row total of squares
    size_t j = 0u;
    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sqsum + j);
        internal::prefetch(src + j);
        // squares of 8 pixels; el8shr1 is the same vector shifted up one lane.
        // The pairwise and half-vector adds below assemble the in-chunk
        // prefixes so that buf[k] = q[0] + ... + q[k] (q = squared pixels).
        uint8x8_t vsrc = vld1_u8(src + j);
        uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
        uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
        uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
        uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
        uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
        uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
        uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
        uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
        u32 buf[8];
        vst1_u32(buf, vget_low_u32(el8shr01l));
        vst1_u32(buf+2, el2l);
        vst1_u32(buf+4, el2hl);
        vst1_u32(buf+6, el2hh);
        // add the integer prefixes onto the f64 running row sum
        for(u32 k=0; k < 8; k++)
            sqsum[j+k] = prev + buf[k];
        prev += buf[7];
    }
    for (; j < size.width; ++j)
        sqsum[j] = (prev += src[j]*src[j]);
    // the others
    for (size_t i = 1; i < size.height ; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
        sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
        prev = 0.;
        j = 0u;
        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sqsum + j);
            internal::prefetch(src + j);
            uint8x8_t vsrc = vld1_u8(src + j);
            uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
            uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
            uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
            uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
            uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
            uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
            uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
            uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
            u32 buf[8];
            vst1_u32(buf, vget_low_u32(el8shr01l));
            vst1_u32(buf+2, el2l);
            vst1_u32(buf+4, el2hl);
            vst1_u32(buf+6, el2hh);
            // integrated row above + running row sum + in-chunk prefix
            for(u32 k=0; k < 8; k++)
                sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
            prev += buf[7];
        }
        for (; j < size.width; ++j)
            sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sqsumBase;
    (void)sqsumStride;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_INTRINSICS_HPP
#define CAROTENE_INTRINSICS_HPP
#include <carotene/definitions.hpp>
#include <arm_neon.h>
namespace CAROTENE_NS { namespace internal {
/////////////// Custom NEON intrinsics ///////////////////
// calculate reciprocal value
inline float32x4_t vrecpq_f32(float32x4_t val)
{
    // Hardware estimate refined by two Newton-Raphson iterations;
    // vrecpsq computes (2 - val * estimate), the NR correction factor.
    float32x4_t est = vrecpeq_f32(val);
    est = vmulq_f32(vrecpsq_f32(val, est), est);
    est = vmulq_f32(vrecpsq_f32(val, est), est);
    return est;
}
inline float32x2_t vrecp_f32(float32x2_t val)
{
    // 64-bit variant: estimate plus two Newton-Raphson refinement steps.
    float32x2_t est = vrecpe_f32(val);
    est = vmul_f32(vrecps_f32(val, est), est);
    est = vmul_f32(vrecps_f32(val, est), est);
    return est;
}
// caclulate sqrt value
inline float32x4_t vrsqrtq_f32(float32x4_t val)
{
    // Reciprocal square root: hardware estimate refined by two Newton-Raphson
    // steps; vrsqrtsq computes (3 - a * b) / 2, the NR correction factor.
    float32x4_t est = vrsqrteq_f32(val);
    est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(est, est), val), est);
    est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(est, est), val), est);
    return est;
}
inline float32x2_t vrsqrt_f32(float32x2_t val)
{
    // 64-bit variant: estimate plus two Newton-Raphson refinement steps.
    float32x2_t est = vrsqrte_f32(val);
    est = vmul_f32(vrsqrts_f32(vmul_f32(est, est), val), est);
    est = vmul_f32(vrsqrts_f32(vmul_f32(est, est), val), est);
    return est;
}
inline float32x4_t vsqrtq_f32(float32x4_t val)
{
    // sqrt(x) = 1 / rsqrt(x): refine the reciprocal-sqrt, then invert it.
    float32x4_t rsqrt = vrsqrtq_f32(val);
    return vrecpq_f32(rsqrt);
}
inline float32x2_t vsqrt_f32(float32x2_t val)
{
    // 64-bit variant of vsqrtq_f32: sqrt(x) = 1 / rsqrt(x).
    float32x2_t rsqrt = vrsqrt_f32(val);
    return vrecp_f32(rsqrt);
}
// table lookup with the table in a 128-bit register
inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
{
#ifdef __aarch64__
    // AArch64 supports this natively
    return ::vqtbl1_u8(a, b);
#else
    // AArch32 has no 128-bit-table lookup: reinterpret the q-register as a
    // pair of d-registers through a union and use the two-table form.
    // Out-of-range indices produce 0 in both forms, so behavior matches.
    union { uint8x16_t v; uint8x8x2_t w; } u = { a };
    return vtbl2_u8(u.w, b);
#endif
}
} }
#endif
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
namespace CAROTENE_NS {
// A 3x3 Laplacian is supported on a working configuration when the image is
// at least 8 pixels wide and the border mode is CONSTANT or REPLICATE.
bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 8)
        return false;
    return border == BORDER_MODE_CONSTANT ||
           border == BORDER_MODE_REPLICATE;
}
// 3x3 Laplacian with the all-ones kernel {1,1,1; 1,-8,1; 1,1,1}:
// dst = (sum of the 3x3 neighbourhood) - 9*center, saturated to u8.
// The vector path first builds per-column sums of three rows (tprev/tcurr/
// tnext, a sliding window of 8-pixel chunks), then combines three shifted
// copies horizontally and subtracts 9*center (center<<3 + center).
void Laplacian3x3(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase, ptrdiff_t dstStride,
                  BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
#ifdef CAROTENE_NEON
    const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);   // column sum of 3 border pixels
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);
    uint8x8_t vsub;   // center row of the chunk being written (for 9*center)
    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // NULL row pointer means "row of borderValue" (CONSTANT mode only)
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // stop the vector loop 8 pixels early near the bottom to avoid
        // over-reading past the last row's chunk
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);
            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
                currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
            }
            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }
            // and calculate next value: per-column sum of the three rows
            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x3;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
                vsub = x1;
                continue;
            }
            // combine 3 "shifted" vectors: left/center/right column sums
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);
            // and add them
            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
            // subtract 9*center (center*8 + center), then saturate to u8;
            // output lags one chunk behind the loads (hence drow + x - 8)
            int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
                                      vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
            uint8x8_t it0 = vqmovun_s16(tt0);
            vst1_u8(drow + x - 8, it0);
            vsub = x1;
        }
        x -= 8;
        if (x == width)
            --x;
        // scalar loop finishes the row using the prevx/currx seeds computed above
        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue * 3;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + srow1[x] + srow0[x];
            }
            else
            {
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        srow1[x + 1] +
                        (srow0 ? srow0[x + 1] : borderValue);
            }
            s32 val = (prevx + currx + nextx) - 9 * srow1[x];
            drow[x] = internal::saturate_cast<u8>((s32)val);
            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// OpenCV-compatible Laplacian variants need a working configuration, an
// image at least 8 pixels wide and 1 pixel tall, and one of the four
// supported border modes. All operands are side-effect-free predicates.
bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
{
    const bool borderOk = border == BORDER_MODE_CONSTANT ||
                          border == BORDER_MODE_REFLECT ||
                          border == BORDER_MODE_REFLECT101 ||
                          border == BORDER_MODE_REPLICATE;
    return isSupportedConfiguration() &&
           size.width >= 8 && size.height >= 1 &&
           borderOk;
}
// Laplacian with the 3x3 cross kernel {0,1,0; 1,-4,1; 0,1,0}, producing
// unscaled s16 output, with OpenCV's border conventions. The vector path
// computes the vertical part (top + bottom - 4*center) per 8-pixel chunk and
// adds the widened left/right center-row neighbours taken from a sliding
// window (xx0/xx1/xx2).
void Laplacian1OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    // CONSTANT mode: a reusable row filled with borderValue stands in for
    // rows above/below the image
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4,borderValue);
        tmp = &_tmp[2];
    }
    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;   // row above (border-resolved)
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v2 = 0;   // row below (border-resolved)
        // make border
        if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            // REPLICATE and REFLECT clamp to the edge row
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t t0, t2;
        uint8x8_t xx0 = vmov_n_u8(0x0);
        uint8x8_t xx1 = vmov_n_u8(0x0);
        uint8x8_t xx2 = vmov_n_u8(0x0);
        ptrdiff_t x = 0;
        // stop the vector loop 8 pixels early near the bottom to avoid
        // over-reading past the image
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);
            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            if(x) {
                xx0 = xx1;
                xx1 = xx2;
            } else {
                xx1 = x1;
                // make border: lane 7 of xx1 becomes the left neighbour of
                // pixel 0 (consumed via vext in the next iteration)
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    xx1 = vset_lane_u8(borderValue, x1, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
                }
            }
            xx2 = x1;
            if(x) {
                tcurr = tnext;
            }
            // vertical part: top + bottom - 4*center (vshll << 2)
            tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
                              vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
            if(!x) {
                tcurr = tnext;
                continue;
            }
            // left/right center-row neighbours, widened to s16;
            // output lags one chunk behind the loads (drow + x - 8)
            t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
            t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
            t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
            vst1q_s16(drow + x - 8, t0);
        }
        x -= 8;
        if(x == cols){
            x--;
        }
        // scalar loop finishes the row with explicit border handling
        for( ; x < cols; x++ )
        {
            s16 nextx;
            s16 prevx;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v1[0] : v1[x-1];
                nextx = x == cols-1 ? v1[x] : v1[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v1[1] : v1[x-1];
                nextx = x == cols-1 ? v1[x-1] : v1[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v1[x-1];
                nextx = x == cols-1 ? borderValue : v1[x+1];
            }
            *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// Laplacian with the 3x3 diagonal kernel {2,0,2; 0,-8,0; 2,0,2} (OpenCV's
// aperture-3 kernel), producing s16 output. The scalar path makes this
// explicit: 2 * (four diagonal neighbours - 4*center). The vector path keeps
// a sliding window of per-column (top + bottom) sums and shifts it to get
// the diagonal neighbours.
void Laplacian3OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    // CONSTANT mode: a reusable row filled with borderValue stands in for
    // rows above/below the image
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4,borderValue);
        tmp = &_tmp[2];
    }
    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;   // row above (border-resolved)
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v2 = 0;   // row below (border-resolved)
        // make border
        if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
        int16x8_t tprev = vmovq_n_s16(0x0);
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t tc = vmovq_n_s16(0x0);      // 4*center for the lagging chunk
        int16x8_t t0, t2, tcnext;
        ptrdiff_t x = 0;
        // stop the vector loop 8 pixels early near the bottom to avoid
        // over-reading past the image
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);
            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));   // 4*center
            if(x) {
                tprev = tcurr;
                tcurr = tnext;
            }
            // per-column sum of the rows above and below
            tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
            if(!x) {
                tcurr = tnext;
                tc = tcnext;
                // make border: lane 7 of tcurr becomes the left diagonal sum
                // for pixel 0 (consumed via vextq in the next iteration).
                // NOTE(review): for CONSTANT mode this seeds the lane with
                // borderValue, while the scalar path below uses
                // prevx + prevx2 = 2*borderValue for the same column — confirm
                // which is intended.
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
                }
                continue;
            }
            // diagonal sums left/right of each pixel, minus 4*center, all * 2;
            // output lags one chunk behind the loads (drow + x - 8)
            t0 = vextq_s16(tprev, tcurr, 7);
            t2 = vextq_s16(tcurr, tnext, 1);
            t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
            tc = tcnext;
            t0 = vshlq_n_s16(t0, 1);
            vst1q_s16(drow + x - 8, t0);
        }
        x -= 8;
        if(x == cols){
            x--;
        }
        // scalar loop finishes the row with explicit border handling
        for( ; x < cols; x++ )
        {
            s16 nextx, nextx2;
            s16 prevx, prevx2;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v0[0] : v0[x-1];
                prevx2 = x == 0 ? v2[0] : v2[x-1];
                nextx = x == cols-1 ? v0[x] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x] : v2[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v0[1] : v0[x-1];
                prevx2 = x == 0 ? v2[1] : v2[x-1];
                nextx = x == cols-1 ? v0[x-1] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v0[x-1];
                prevx2 = x == 0 ? borderValue : v2[x-1];
                nextx = x == cols-1 ? borderValue : v0[x+1];
                nextx2 = x == cols-1 ? borderValue : v2[x+1];
            }
            s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
            *(drow+x) = 2*res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
/*
 * 5x5 Laplacian producing OpenCV-compatible s16 output.
 *
 * The scalar tail below makes the effective kernel explicit: the result at x is
 *   2 * (pprevx + prevx + currx + nextx + nnextx)
 * with per-column vertical sums
 *   col x+-2 : v0 + 2*v1 + 2*v2 + 2*v3 + v4
 *   col x+-1 : 2*v0 - 4*v2 + 2*v4
 *   col x    : 2*v0 - 4*v1 - 12*v2 - 4*v3 + 2*v4
 * i.e. the separable-free 5x5 kernel OpenCV uses for Laplacian(ksize=5).
 *
 * border/borderValue select how rows/columns outside the image are synthesized.
 */
void Laplacian5OpenCV(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
ptrdiff_t rows = size.height, cols = size.width;
std::vector<u8> _tmp;
u8 *tmp = 0;
// For constant borders a synthetic row filled with borderValue is used in
// place of the missing rows above/below; it is 4 bytes wider than the image
// so x-2 / x+2 accesses from column 0 / cols-1 stay inside the buffer.
if (border == BORDER_MODE_CONSTANT)
{
_tmp.assign(cols + 4,borderValue);
tmp = &_tmp[2];
}
for( ptrdiff_t y = 0; y < rows; y++ )
{
// v0..v4 are the five source rows centered on y; rows outside the image
// are remapped (or pointed at tmp) according to the border mode.
const u8* v0 = 0;
const u8* v1 = 0;
const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
const u8* v3 = 0;
const u8* v4 = 0;
// make border
if (border == BORDER_MODE_REPLICATE) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
} else if (border == BORDER_MODE_REFLECT) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
} else if (border == BORDER_MODE_REFLECT101) {
// NOTE(review): the rows > 2-y guard looks unusual for REFLECT101 -- verify
// against very small images (rows <= 2).
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
// NOTE(review): 2*rows-(y+4) may be questionable when rows == 2, y == 1 -- confirm.
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);
} else if (border == BORDER_MODE_CONSTANT) {
v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
}
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
// Vector pipeline state. Each iteration computes three vertical column sums
// for 8 new columns; the *Old / *OldOld chains keep the results of the
// previous one and two iterations alive so that the output of columns
// [x-8, x-1] can combine column sums from x-10 up to x+1.
int16x8_t tnext, tc, t0;
int16x8_t tnext2, tnext3;
int16x8_t tnext1Old, tnext2Old, tnext3Old;
int16x8_t tnext4OldOldOld, tnext5OldOldOld;
int16x8_t tcurr1 = vmovq_n_s16(0x0);
int16x8_t tnext1 = vmovq_n_s16(0x0);
int16x8_t tprev1 = vmovq_n_s16(0x0);
int16x8_t tpprev1 = vmovq_n_s16(0x0);
int16x8_t tppprev1 = vmovq_n_s16(0x0);
int16x8_t tnext4Old = vmovq_n_s16(0x0);
int16x8_t tnext5Old = vmovq_n_s16(0x0);
int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
int16x8_t tnext5OldOld = vmovq_n_s16(0x0);
// do vertical convolution
ptrdiff_t x = 0;
// Presumably the vector span is shortened on the last 3 rows because v3/v4
// may point at tmp (only cols+4 bytes) -- a full-width load there could
// read past the buffer; TODO confirm.
const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
for( ; x <= bcols; x += 8 )
{
internal::prefetch(v0 + x);
internal::prefetch(v1 + x);
internal::prefetch(v2 + x);
internal::prefetch(v3 + x);
internal::prefetch(v4 + x);
uint8x8_t x0 = vld1_u8(v0 + x);
uint8x8_t x1 = vld1_u8(v1 + x);
uint8x8_t x2 = vld1_u8(v2 + x);
uint8x8_t x3 = vld1_u8(v3 + x);
uint8x8_t x4 = vld1_u8(v4 + x);
if(x) {
tcurr1 = tnext1;
}
// Shift the two-iteration history window forward.
tnext4OldOldOld = tnext4Old;
tnext5OldOldOld = tnext5Old;
tnext1Old = tnext1OldOld;
tnext2Old = tnext2OldOld;
tnext3Old = tnext3OldOld;
tnext4Old = tnext4OldOld;
tnext5Old = tnext5OldOld;
// tnext3 = 2*x1 + 4*x2 + 2*x3 (after the shift); the u8 subtractions below
// rely on u16 wraparound being reinterpreted as signed 16-bit.
tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
tnext3 = vshlq_n_s16(tnext3, 1);
tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
tnext2 = vsubq_s16(tc, tnext);
tnext1 = vaddq_s16(tnext3, tnext2);
// tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4   (column sum for x+-2 taps)
tnext2 = vshlq_n_s16(tnext2, 1);
// tnext2 = 2*x4 - 4*x2 + 2*x0             (column sum for x+-1 taps)
tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
// tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4  (center column sum)
tnext1OldOld = tnext1;
tnext2OldOld = tnext2;
tnext3OldOld = tnext3;
tnext4OldOld = tnext2;
tnext5OldOld = tnext1;
if(x) {
// Align lagged column sums with output columns [x-8, x-1]:
// tnext1  -> columns x-6..x+1 (nnext term), tcurr1 -> x-7..x (next term),
// tprev1  -> x-8..x-1 (center), tpprev1 -> x-9..x-2 (prev),
// tppprev1 -> x-10..x-3 (pprev).
tnext1 = vextq_s16(tnext1Old, tnext1, 2);
tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
tprev1 = tnext3Old;
if(x!=8) {
tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
}
}
if(!x) {
// First iteration only fills the pipeline; synthesize the lanes that
// fall left of column 0 according to the border mode, then continue.
// make border
if (border == BORDER_MODE_REPLICATE) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
} else if (border == BORDER_MODE_REFLECT) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
} else if (border == BORDER_MODE_REFLECT101) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
} else if (border == BORDER_MODE_CONSTANT) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
}
tppprev1 = tprev1;
continue;
}
// Horizontal combination of the five aligned column sums, then doubled.
t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
t0 = vaddq_s16(t0, t0);
vst1q_s16(drow + x - 8, t0);
}
// Scalar tail: redo the last (and any remaining) columns with explicit
// border handling at x == 0, 1, cols-2 and cols-1.
x -= 8;
if(x >= cols - 1)
x = cols-2;
s16 pprevx = 0;
s16 prevx = 0;
s16 nextx = 0;
s16 nnextx = 0;
for( ; x < cols; x++ )
{
if (x == 0) {
// make border
if (border == BORDER_MODE_REPLICATE) {
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else if (border == BORDER_MODE_REFLECT) {
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else if (border == BORDER_MODE_REFLECT101) {
pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
} else if (border == BORDER_MODE_CONSTANT) {
// Column of constants: (1+2+2+2+1) * borderValue; the x-1 column sum
// is zero because 2*c - 4*c + 2*c == 0.
pprevx = 8 * borderValue;
prevx = 0;
}
} else if (x == 1) {
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
} else if (border == BORDER_MODE_REFLECT101) {
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
} else if (border == BORDER_MODE_CONSTANT) {
pprevx = 8 * borderValue;
}
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else {
pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
}
s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
if (x == cols-1) {
// make border
if (border == BORDER_MODE_REPLICATE) {
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
} else if (border == BORDER_MODE_REFLECT) {
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
} else if (border == BORDER_MODE_REFLECT101) {
nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
} else if (border == BORDER_MODE_CONSTANT) {
nextx = 0;
nnextx = 8 * borderValue;
}
} else if (x == cols-2) {
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
} else if (border == BORDER_MODE_REFLECT101) {
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
} else if (border == BORDER_MODE_CONSTANT) {
nnextx = 8 * borderValue;
}
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
} else {
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
}
s16 res = pprevx + prevx + currx + nextx + nnextx;
*(drow+x) = 2*res;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cmath>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Functor for internal::vtransform: dst = saturate_cast<s16>(sqrt(a^2 + b^2))
// element-wise over two s16 sources. The squares are accumulated in f32
// (via widening s16*s16 -> s32 multiplies) to avoid overflow before the sqrt.
struct Magnitude
{
typedef s16 type;
// Full 8-lane vector path: process low and high halves separately.
void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
int16x8_t & v_dst) const
{
int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
v_src0_p = vget_high_s16(v_src0);
v_src1_p = vget_high_s16(v_src1);
float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));
// vqmovn saturates the s32 roots back into the s16 destination lanes.
v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
}
// Half-width 4-lane vector path (tail handling).
void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
int16x4_t & v_dst) const
{
float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
v_dst = vqmovn_s32(v_sqrt);
}
// Scalar path for the last few elements of a row.
void operator() (const short * src0, const short * src1, short * dst) const
{
f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
}
};
// Functor for internal::vtransform: dst = sqrt(a^2 + b^2) element-wise
// over two f32 sources (no saturation needed in the float variant).
struct MagnitudeF32
{
typedef f32 type;
// Full 4-lane vector path.
void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
float32x4_t & v_dst) const
{
v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1)));
}
// Half-width 2-lane vector path (tail handling).
void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
float32x2_t & v_dst) const
{
v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1)));
}
// Scalar path for the last few elements of a row.
void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
{
dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
}
};
} // namespace
#endif
// Element-wise magnitude of two s16 planes: dst = saturate(sqrt(a^2 + b^2)).
// Thin wrapper that delegates all per-row/per-vector work to
// internal::vtransform with the Magnitude functor defined above.
void magnitude(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
Magnitude());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
// Element-wise magnitude of two f32 planes: dst = sqrt(a^2 + b^2).
// Thin wrapper delegating to internal::vtransform with MagnitudeF32.
void magnitude(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
MagnitudeF32());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cmath>
namespace CAROTENE_NS {
// Mean and standard deviation of a u8 image.
// Delegates the accumulation to sqsum() and derives the statistics from
// the raw sum / sum-of-squares: stddev = sqrt(E[x^2] - E[x]^2).
void meanStdDev(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Single pass over the image: accumulate sum and sum of squares.
    f64 sum = 0.0, sumOfSquares = 0.0;
    sqsum(size, srcBase, srcStride, &sum, &sumOfSquares, 1);

    // Clamp the variance at zero to guard against floating-point rounding.
    const f64 invTotal  = 1.0 / size.total();
    const f64 meanValue = sum * invTotal;
    const f64 variance  = std::max(sumOfSquares * invTotal - meanValue * meanValue, 0.0);
    const f64 stdDevValue = sqrt(variance);

    if (pMean)
        *pMean = meanValue;
    if (pStdDev)
        *pStdDev = stdDevValue;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
// Mean and standard deviation of a u16 image.
// Vector path accumulates in u32 (sum) and f32 (sum of squares); the work is
// split into blocks of at most blockSize0 elements so the block accumulators
// can be flushed into f64 totals before f32 precision loss / u32 overflow
// becomes a concern.
void meanStdDev(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
f32 * pMean, f32 * pStdDev)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// roiw4: widest column index that still allows a full 4-lane load.
size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
f64 fsum = 0.0f, fsqsum = 0.0f;
// Scratch for the horizontal reduction: lanes 0-3 = sum, 4-7 = sqsum.
f32 arsum[8];
uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0u;
while (j < roiw4)
{
// End index of the current block (not its length).
size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
v_sum = v_zero;
v_sqsum = v_zero_f;
// Main unrolled loop: 16 elements (two q-registers) per iteration.
for ( ; j + 16 < blockSize ; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
// 0
uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
// 1
v_srclo = vmovl_u16(vget_low_u16(v_src1));
v_srchi = vmovl_u16(vget_high_u16(v_src1));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
v_srclo_f = vcvtq_f32_u32(v_srclo);
v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
}
// 4-element vector tail of the block.
for ( ; j < blockSize; j += 4)
{
uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
float32x4_t v_src_f = vcvtq_f32_u32(v_src);
v_sum = vaddq_u32(v_sum, v_src);
v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
}
// Flush the block accumulators into the f64 running totals.
vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
vst1q_f32(arsum + 4, v_sqsum);
fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
}
// collect a few last elements in the current row
for ( ; j < size.width; ++j)
{
f32 srcval = src[j];
fsum += srcval;
fsqsum += srcval * srcval;
}
}
// calc mean and stddev
f64 itotal = 1.0 / size.total();
f64 mean = fsum * itotal;
// max(...) guards against a slightly negative variance from rounding.
f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));
if (pMean)
*pMean = mean;
if (pStdDev)
*pStdDev = stddev;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)pMean;
(void)pStdDev;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
/*
* The code here is based on the code in
* <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in public domain.
* See also <http://ndevilla.free.fr/median/median/index.html>.
*/
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Build the "left neighbor" vector for the first pixels of a row when no real
// margin exists: spill the 16-lane vector to scratch shifted right by one
// pixel (cn channels) and replicate the first pixel into the vacated bytes.
uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
{
    u8 scratch[16 + 8];
    vst1q_u8(scratch + cn, r);
    u32 ch = 0;
    while (ch < cn)
    {
        scratch[ch] = scratch[cn + ch];
        ++ch;
    }
    return vld1q_u8(scratch);
}
// Build the "right neighbor" vector for the last pixels of a row when no real
// margin exists: spill the 8-lane vector to scratch, duplicate its last pixel
// (cn channels) just past the end, and reload from offset cn.
uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
{
    u8 scratch[8 + 8];
    vst1_u8(scratch, r);
    u32 ch = 0;
    while (ch < cn)
    {
        scratch[8 + ch] = scratch[8 - cn + ch];
        ++ch;
    }
    return vld1_u8(scratch + cn);
}
} // namespace
//o------^-------^-----------------------------o 0
// | |
//o--^---v---^---|-------^---------------------o 1
// | | | |
//o--v-------v---|-------|-^-------^-------^---o 2
// | | | | |
//o------^-------v-----^-|-|-------|-------|---o 3
// | | | | | |
//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
// | | | | | | |
//o--v-------v---^-|---|---v---|-------|-------o 5
// | | | | |
//o------^-------|-|---v-------|-------v-------o 6
// | | | |
//o--^---v---^---|-v-----------v---------------o 7
// | | |
//o--v-------v---v-----------------------------o 8
// ELT(num, level) names SSA-style temporaries v<num>_lv<level>: every min/max
// result gets a fresh "level" suffix so each vector register is written once.
#define ELT(num, level) v ## num ## _lv ## level
// PIX_SORT orders a pair: after it, ELT(a,newlvl) <= ELT(b,newlvl).
// PIX_MIN / PIX_MAX are defined at each expansion site (128-bit or 64-bit).
#define PIX_SORT(a, alvl, b, blvl, newlvl) \
PIX_MIN(a, alvl, b, blvl, newlvl); \
PIX_MAX(a, alvl, b, blvl, newlvl);
// 19-operation median-of-9 sorting network (see the wiring diagram above);
// the median ends up in ELT(4, 19). Based on N. Devillard's optmed code.
#define SORT9 \
PIX_SORT(1, 00, 2, 00, 01); \
PIX_SORT(4, 00, 5, 00, 02); \
PIX_SORT(7, 00, 8, 00, 03); \
PIX_SORT(0, 00, 1, 01, 04); \
PIX_SORT(3, 00, 4, 02, 05); \
PIX_SORT(6, 00, 7, 03, 06); \
PIX_SORT(1, 04, 2, 01, 07); \
PIX_SORT(4, 05, 5, 02, 08); \
PIX_SORT(7, 06, 8, 03, 09); \
PIX_MAX (0, 04, 3, 05, 10); \
PIX_MIN (5, 08, 8, 09, 11); \
PIX_SORT(4, 08, 7, 09, 12); \
PIX_MAX (3, 10, 6, 06, 13); \
PIX_MAX (1, 07, 4, 12, 14); \
PIX_MIN (2, 07, 5, 11, 15); \
PIX_MIN (4, 14, 7, 12, 16); \
PIX_SORT(4, 16, 2, 15, 17); \
PIX_MAX (6, 13, 4, 17, 18); \
PIX_MIN (4, 18, 2, 17, 19);
#endif
// The 3x3 median needs NEON support, at most 8 interleaved channels, and
// rows wide enough for a 16-byte vector plus one pixel of horizontal border.
bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 16 + numChannels)
        return false;
    return numChannels <= 8;
}
// 3x3 median filter over interleaved u8 channels.
// Each output pixel is the channel-wise median of its 3x3 neighborhood,
// computed lane-parallel with the SORT9 sorting network. srcMargin describes
// how many real pixels exist outside the ROI; where the margin is zero the
// edge rows/pixels are replicated instead.
//
// The `goto` pattern: the loads for the first (and last) position need special
// border handling, so they are done once before the loop and control jumps
// into the loop body to share the sorting-network code with the steady state.
void medianFilter3x3(const Size2D &size, u32 numChannels,
const u8 *srcBase, ptrdiff_t srcStride,
const Margin &srcMargin,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
#ifdef CAROTENE_NEON
u32 cn = numChannels;
size_t colsn = size.width * cn;
for (size_t i = 0; i < size.height; ++i) {
// Rows above/below: use real margin rows if present, else replicate.
const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
u8* pdst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
{
// Main pass: 16 pixels per iteration, left to right.
// v0..v2 = left column, v3..v5 = center column, v6..v8 = right column.
uint8x16_t v3_lv00 = vld1q_u8(psrc0);
uint8x16_t v4_lv00 = vld1q_u8(psrc1);
uint8x16_t v5_lv00 = vld1q_u8(psrc2);
uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);
uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);
// Enter the loop body with the specially-prepared first batch.
goto medianBlur3x3_mainBody;
for (; j < colsn - 16; j += 16) {
internal::prefetch(psrc0 + j);
internal::prefetch(psrc1 + j);
internal::prefetch(psrc2 + j);
v0_lv00 = vld1q_u8(psrc0 + j - cn);
v1_lv00 = vld1q_u8(psrc1 + j - cn);
v2_lv00 = vld1q_u8(psrc2 + j - cn);
v3_lv00 = vld1q_u8(psrc0 + j);
v4_lv00 = vld1q_u8(psrc1 + j);
v5_lv00 = vld1q_u8(psrc2 + j);
v6_lv00 = vld1q_u8(psrc0 + j + cn);
v7_lv00 = vld1q_u8(psrc1 + j + cn);
v8_lv00 = vld1q_u8(psrc2 + j + cn);
medianBlur3x3_mainBody:
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
// Median of the nine vectors lands in v4_lv19.
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1q_u8(pdst + j, v4_lv19);
}
}
{
// Tail pass: 8 pixels per iteration, right to left, starting at the
// row end (with right-border handling) and stopping once the region
// already covered by the main pass is reached. Overlapping stores are
// fine because the median of each pixel is position-independent.
size_t k = colsn - 8;
uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);
uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);
goto medianBlur3x3_tailBody;
for (; k >= j - 8; k -= 8) {
v0_lv00 = vld1_u8(psrc0 + k - cn);
v1_lv00 = vld1_u8(psrc1 + k - cn);
v2_lv00 = vld1_u8(psrc2 + k - cn);
v3_lv00 = vld1_u8(psrc0 + k);
v4_lv00 = vld1_u8(psrc1 + k);
v5_lv00 = vld1_u8(psrc2 + k);
v6_lv00 = vld1_u8(psrc0 + k + cn);
v7_lv00 = vld1_u8(psrc1 + k + cn);
v8_lv00 = vld1_u8(psrc2 + k + cn);
medianBlur3x3_tailBody:
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1_u8(pdst + k, v4_lv19);
}
}
}
#else
(void)size;
(void)numChannels;
(void)srcBase;
(void)srcStride;
(void)srcMargin;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Element-wise minimum functor for internal::vtransform: provides the
// 128-bit, 64-bit and scalar entry points the transform driver dispatches to.
template <typename T>
struct Min
{
typedef T type;
// Full-width vector lanes.
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vminq(v_src0, v_src1);
}
// Half-width vector lanes (row tails).
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmin(v_src0, v_src1);
}
// Scalar fallback for the last few elements.
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::min(src0[0], src1[0]);
}
};
// Element-wise maximum functor for internal::vtransform; mirror of Min.
template <typename T>
struct Max
{
typedef T type;
// Full-width vector lanes.
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vmaxq(v_src0, v_src1);
}
// Half-width vector lanes (row tails).
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmax(v_src0, v_src1);
}
// Scalar fallback for the last few elements.
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::max(src0[0], src1[0]);
}
};
} // namespace
// IMPL_OP stamps out one public min()/max() overload for a given element
// type, delegating the element-wise work to internal::vtransform with the
// matching functor (NEON build)...
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, op<type>()); \
}
#else
// ...or, without NEON, a stub that always reports an unsupported
// configuration (parameters unnamed to avoid unused warnings).
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
// One min/max overload pair per supported element type.
#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
IMPL_MINMAX(u8)
IMPL_MINMAX(s8)
IMPL_MINMAX(u16)
IMPL_MINMAX(s16)
IMPL_MINMAX(u32)
IMPL_MINMAX(s32)
IMPL_MINMAX(f32)
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <limits>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Generic min/max reduction over a 2D array of element type T.
// Per row: a 32-byte main loop (two full vector registers), then an 8-byte
// vector tail, then scalar leftovers; vector partials are reduced once at
// the end. Either output pointer may be null to skip that result.
template <typename T>
void minMaxVals(const Size2D &size,
const T * srcBase, ptrdiff_t srcStride,
T * pMinVal, T * pMaxVal)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
// step_base: elements per 32-byte iteration; step_tail: per 8-byte iteration.
u32 step_base = 32 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
// Scalar accumulators start at the identity values for min/max.
T maxVal = std::numeric_limits<T>::min();
T minVal = std::numeric_limits<T>::max();
vec128 v_min_base = vdupq_n(minVal), v_max_base = vdupq_n(maxVal);
vec64 v_min_tail = vdup_n(minVal), v_max_tail = vdup_n(maxVal);
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for (; j < roiw_base; j += step_base)
{
prefetch(src + j);
vec128 v_src0 = vld1q(src + j), v_src1 = vld1q(src + j + 16 / sizeof(T));
v_min_base = vminq(v_min_base, v_src0);
v_max_base = vmaxq(v_max_base, v_src0);
v_min_base = vminq(v_min_base, v_src1);
v_max_base = vmaxq(v_max_base, v_src1);
}
for (; j < roiw_tail; j += step_tail)
{
vec64 v_src0 = vld1(src + j);
v_min_tail = vmin(v_min_tail, v_src0);
v_max_tail = vmax(v_max_tail, v_src0);
}
// Scalar leftovers fold directly into minVal/maxVal.
for (; j < size.width; j++)
{
T srcval = src[j];
minVal = std::min(srcval, minVal);
maxVal = std::max(srcval, maxVal);
}
}
// collect min & max values
// Horizontal reduction: fold the 128-bit partials into the 64-bit ones,
// store mins in ar[0..] and maxes in ar[8/sizeof(T)..], then scan.
T ar[16 / sizeof(T)];
vst1q(ar, vcombine(vmin(v_min_tail, vmin(vget_low(v_min_base), vget_high(v_min_base))),
vmax(v_max_tail, vmax(vget_low(v_max_base), vget_high(v_max_base)))));
for (size_t x = 0; x < 8u / sizeof(T); ++x)
{
minVal = std::min(minVal, ar[x]);
maxVal = std::max(maxVal, ar[x + 8 / sizeof(T)]);
}
if (pMaxVal)
*pMaxVal = maxVal;
if (pMinVal)
*pMinVal = minVal;
}
} // namespace
#endif
// Min/max of a u8 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.  On builds without NEON the
// configuration assertion fails and the parameters are only silenced.
void minMaxVals(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * pMinVal, u8 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<u8>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Min/max of an s16 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.
void minMaxVals(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * pMinVal, s16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<s16>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Min/max of a u16 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.
void minMaxVals(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * pMinVal, u16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<u16>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Min/max of an s32 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.
void minMaxVals(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * pMinVal, s32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<s32>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Min/max of a u32 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.
void minMaxVals(const Size2D &size,
                const u32 * srcBase, ptrdiff_t srcStride,
                u32 * pMinVal, u32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<u32>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Find the minimum and maximum of an f32 region together with the (row, col)
// of their first occurrence in row-major scan order.  Rows of at least 16
// elements use a 4-lane NEON loop; narrower rows and tails fall back to a
// scalar scan.
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0) so all comparisons are well-defined.
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width >= 16)
        {
            // Per-lane column offsets of the current 4-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: walk the row in chunks of at most 0xffffFFFC
            // columns so the 32-bit lane indices cannot wrap; b is the base
            // column of the current chunk.
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFFC is a sentinel index meaning "no lane improved
                // the running extremum during this chunk".
                float32x4_t n_min = vdupq_n_f32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t n_max = vdupq_n_f32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
                for(; i < bound; i+=4)
                {
                    internal::prefetch(src + i);
                    float32x4_t line = vld1q_f32(src + i);
                    // Strict compares: later equal values do not displace an
                    // earlier occurrence within a lane.
                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);
                    n_min = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }
                // Horizontal reduction over the four lanes; on equal values
                // the smaller index wins so the first occurrence is kept.
                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];
                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update the location only if some lane actually improved on
                // the previous extremum (index below the sentinel).
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder (and whole rows narrower than 16 elements).
        for(; i < size.width; ++i )
        {
            float val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// Masked variant of f32 minMaxLoc: only elements whose mask byte is non-zero
// participate.  If nothing passes the mask, minCol/minRow (resp. maxCol/
// maxRow) stay at size.width/size.height and the values at their seeds.
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               const u8 * maskBase, ptrdiff_t maskStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed with "nothing found": extreme values and out-of-range locations.
    minVal = std::numeric_limits<f32>::max();
    minCol = size.width;
    minRow = size.height;
    maxVal = -std::numeric_limits<f32>::max();
    maxCol = size.width;
    maxRow = size.height;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr( srcBase, srcStride, l);
        const u8 * mask = internal::getRowPtr( maskBase, maskStride, l);
        if (size.width >= 16)
        {
            // Per-lane column offsets of the current 4-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t uOne = vdupq_n_u32(1);
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: chunk the row so 32-bit lane indices cannot wrap.
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFFC is a sentinel index: "no lane improved here".
                float32x4_t n_min = vdupq_n_f32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t n_max = vdupq_n_f32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
                for(; i < bound; i+=4)
                {
                    internal::prefetch(src + i);
                    internal::prefetch(mask + i);
                    float32x4_t line = vld1q_f32(src + i);
                    // NOTE(review): this loads 8 mask bytes although only the
                    // low 4 are consumed; the last iteration may read up to 4
                    // bytes past column bound+3 — confirm row padding covers it.
                    uint8x8_t maskLine = vld1_u8(mask + i);
                    // Widen 4 mask bytes to 32 bits and turn any non-zero
                    // value into an all-ones lane mask.
                    uint32x4_t maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine)));
                    maskLine4 = vcgeq_u32(maskLine4, uOne);
                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);
                    // Only masked-in lanes may update the running extrema.
                    minmask = vandq_u32(minmask, maskLine4);
                    maxmask = vandq_u32(maxmask, maskLine4);
                    n_min = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }
                // Horizontal reduction; smaller index wins on equal values.
                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];
                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update locations only when some lane beat the sentinel.
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder: skip masked-out pixels.
        for(; i < size.width; i++ )
        {
            if (!mask[i])
                continue;
            f32 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)maskBase;
    (void)maskStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// s32 variant of minMaxLoc: min/max values and (row, col) of their first
// occurrence in row-major scan order.  Same structure as the f32 version
// with integer compare/select intrinsics.
void minMaxLoc(const Size2D &size,
               const s32 * srcBase, ptrdiff_t srcStride,
               s32 &minVal, size_t &minCol, size_t &minRow,
               s32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const s32 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width >= 16)
        {
            // Per-lane column offsets of the current 4-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: chunk the row so 32-bit lane indices cannot wrap.
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFFC is a sentinel index: "no lane improved here".
                int32x4_t n_min = vdupq_n_s32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                int32x4_t n_max = vdupq_n_s32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
                for(; i < bound; i+=4 )
                {
                    internal::prefetch(src + i);
                    int32x4_t line = vld1q_s32(src + i);
                    // Strict compares keep the earliest occurrence per lane.
                    uint32x4_t minmask = vcltq_s32(line, n_min);
                    uint32x4_t maxmask = vcgtq_s32(line, n_max);
                    n_min = vbslq_s32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_s32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }
                // Horizontal reduction; smaller index wins on equal values.
                s32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];
                vst1q_s32(fmin, n_min);
                vst1q_s32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 4; ++j)
                {
                    s32 minval = fmin[j];
                    s32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update locations only when some lane beat the sentinel.
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder (and whole rows narrower than 16 elements).
        for(; i < size.width; ++i )
        {
            s32 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// s16 variant of minMaxLoc.  Processes 8 elements per NEON iteration; since
// lane indices are kept as 32-bit values, the 8-lane compare masks are split
// into low/high 4-lane halves with separate index vectors, and the high half
// is offset by +4 after the loop.
void minMaxLoc(const Size2D &size,
               const s16 * srcBase, ptrdiff_t srcStride,
               s16 &minVal, size_t &minCol, size_t &minRow,
               s16 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const s16 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width >= 32)
        {
            // Column offsets for one 4-lane half of the 8-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c8 = vdupq_n_u32(8);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: chunk the row so 32-bit lane indices cannot wrap.
            size_t boundAll = size.width - (8 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFF8 is a sentinel index: "no lane improved here".
                int16x8_t n_min = vdupq_n_s16(minVal);
                uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
                int16x8_t n_max = vdupq_n_s16(maxVal);
                uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);
                for(; i < bound; i+=8 )
                {
                    internal::prefetch(src + i);
                    int16x8_t line = vld1q_s16(src + i);
                    // Strict compares keep the earliest occurrence per lane.
                    uint16x8_t minmask = vcltq_s16(line, n_min);
                    uint16x8_t maxmask = vcgtq_s16(line, n_max);
                    n_min = vbslq_s16(minmask, line, n_min);
                    // Widen each 16-bit mask half to 32 bits; the saturating
                    // shift by 31 turns 0x0000ffff into an all-ones select
                    // mask as required by vbsl.
                    uint16x4_t minml = vget_low_u16(minmask);
                    uint16x4_t minmh = vget_high_u16(minmask);
                    uint32x4_t minml2 = vmovl_u16(minml);
                    uint32x4_t minmh2 = vmovl_u16(minmh);
                    minml2 = vqshlq_n_u32(minml2, 31);
                    minmh2 = vqshlq_n_u32(minmh2, 31);
                    n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
                    n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);
                    n_max = vbslq_s16(maxmask, line, n_max);
                    uint16x4_t maxml = vget_low_u16(maxmask);
                    uint16x4_t maxmh = vget_high_u16(maxmask);
                    uint32x4_t maxml2 = vmovl_u16(maxml);
                    uint32x4_t maxmh2 = vmovl_u16(maxmh);
                    maxml2 = vqshlq_n_u32(maxml2, 31);
                    maxmh2 = vqshlq_n_u32(maxmh2, 31);
                    n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
                    n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);
                    // idx[] +=8
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
                }
                // fix high part of indexes
                // (both halves shared lineIdxOffset, so lanes 4..7 are +4).
                uint32x4_t c4 = vdupq_n_u32((int32_t) 4);
                n_minIdxh = vaddq_u32(n_minIdxh, c4);
                n_maxIdxh = vaddq_u32(n_maxIdxh, c4);
                // Horizontal reduction; smaller index wins on equal values.
                s16 fmin[8], fmax[8];
                u32 fminIdx[8], fmaxIdx[8];
                vst1q_s16(fmin, n_min);
                vst1q_s16(fmax, n_max);
                vst1q_u32(fminIdx+0, n_minIdxl);
                vst1q_u32(fmaxIdx+0, n_maxIdxl);
                vst1q_u32(fminIdx+4, n_minIdxh);
                vst1q_u32(fmaxIdx+4, n_maxIdxh);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 8; ++j)
                {
                    s16 minval = fmin[j];
                    s16 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update locations only when some lane beat the sentinel.
                if(minIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder (and whole rows narrower than 32 elements).
        for(; i < size.width; ++i )
        {
            short val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// u16 variant of minMaxLoc.  Same split-halves index scheme as the s16
// version: 8 elements per iteration, 32-bit indices held in low/high 4-lane
// vectors, high half fixed up by +4 after the loop.
void minMaxLoc(const Size2D &size,
               const u16 * srcBase, ptrdiff_t srcStride,
               u16 &minVal, size_t &minCol, size_t &minRow,
               u16 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const u16 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width >= 32)
        {
            // Column offsets for one 4-lane half of the 8-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c8 = vdupq_n_u32(8);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: chunk the row so 32-bit lane indices cannot wrap.
            size_t boundAll = size.width - (8 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFF8 is a sentinel index: "no lane improved here".
                uint16x8_t n_min = vdupq_n_u16(minVal);
                uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
                uint16x8_t n_max = vdupq_n_u16(maxVal);
                uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);
                for(; i < bound; i+=8 )
                {
                    internal::prefetch(src + i);
                    uint16x8_t line = vld1q_u16(src + i);
                    // Strict compares keep the earliest occurrence per lane.
                    uint16x8_t minmask = vcltq_u16(line, n_min);
                    uint16x8_t maxmask = vcgtq_u16(line, n_max);
                    n_min = vbslq_u16(minmask, line, n_min);
                    // Widen mask halves to 32 bits; the saturating shift by
                    // 31 turns 0x0000ffff into an all-ones vbsl select mask.
                    uint16x4_t minml = vget_low_u16(minmask);
                    uint16x4_t minmh = vget_high_u16(minmask);
                    uint32x4_t minml2 = vmovl_u16(minml);
                    uint32x4_t minmh2 = vmovl_u16(minmh);
                    minml2 = vqshlq_n_u32(minml2, 31);
                    minmh2 = vqshlq_n_u32(minmh2, 31);
                    n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
                    n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);
                    n_max = vbslq_u16(maxmask, line, n_max);
                    uint16x4_t maxml = vget_low_u16(maxmask);
                    uint16x4_t maxmh = vget_high_u16(maxmask);
                    uint32x4_t maxml2 = vmovl_u16(maxml);
                    uint32x4_t maxmh2 = vmovl_u16(maxmh);
                    maxml2 = vqshlq_n_u32(maxml2, 31);
                    maxmh2 = vqshlq_n_u32(maxmh2, 31);
                    n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
                    n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);
                    // idx[] +=8
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
                }
                // fix high part of indexes
                // (both halves shared lineIdxOffset, so lanes 4..7 are +4).
                uint32x4_t c4 = vdupq_n_u32(4);
                n_minIdxh = vaddq_u32(n_minIdxh, c4);
                n_maxIdxh = vaddq_u32(n_maxIdxh, c4);
                // Horizontal reduction; smaller index wins on equal values.
                u16 fmin[8], fmax[8];
                u32 fminIdx[8], fmaxIdx[8];
                vst1q_u16(fmin, n_min);
                vst1q_u16(fmax, n_max);
                vst1q_u32(fminIdx+0, n_minIdxl);
                vst1q_u32(fmaxIdx+0, n_maxIdxl);
                vst1q_u32(fminIdx+4, n_minIdxh);
                vst1q_u32(fmaxIdx+4, n_maxIdxh);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 8; ++j)
                {
                    u16 minval = fmin[j];
                    u16 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update locations only when some lane beat the sentinel.
                if(minIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder (and whole rows narrower than 32 elements).
        for(; i < size.width; ++i )
        {
            u16 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Min/max with first-occurrence indices over one contiguous block of u8
// elements.  Callers pass len <= USHORT_BLOCK_MAX_SIZE so the indices fit in
// u16.  Processes 16 bytes per iteration with split low/high u16 index
// vectors (high half fixed up by +8 after the loop), then a scalar tail.
void minMaxLocBlock(const u8 * src, u32 len,
                    u8 &minVal, u16 &minIdx,
                    u8 &maxVal, u16 &maxIdx)
{
    // Index offsets for one 8-lane half of the 16-byte batch.
    u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    // Seed with element 0 (index vectors start at 0 accordingly).
    uint8x16_t n_min = vdupq_n_u8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    uint8x16_t n_max = vdupq_n_u8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16 = vdupq_n_u16(16);
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);
    s32 i = 0;
    // NOTE(review): len - 15 wraps for len < 15; the s32 result is negative
    // on the supported targets so the vector loop is skipped — confirm.
    s32 bound = len - (16 - 1);
    for(; i < bound; i+=16 )
    {
        internal::prefetch(src + i);
        uint8x16_t line = vld1q_u8(src + i);
        // Strict compares keep the earliest occurrence per lane.
        uint8x16_t minmask = vcltq_u8(line, n_min);
        uint8x16_t maxmask = vcgtq_u8(line, n_max);
        n_min = vbslq_u8(minmask, line, n_min);
        // Widen mask halves to 16 bits; the saturating shift by 15 turns
        // 0x00ff into an all-ones vbsl select mask.
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);
        n_max = vbslq_u8(maxmask, line, n_max);
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);
        // idx[] +=16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }
    // fix high part of indexes
    // (both halves shared lineIdxOffset, so lanes 8..15 are +8).
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);
    u8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];
    // Alternative pairwise reduction kept disabled by the original authors.
    /*{
        uint8x8_t min_low = vget_low_u8(n_min);
        uint8x8_t min_high = vget_high_u8(n_min);
        uint8x8_t max_low = vget_low_u8(n_max);
        uint8x8_t max_high = vget_high_u8(n_max);
        uint8x8_t minmask = vclt_u8(min_low, min_high);
        uint8x8_t maxmask = vcgt_u8(max_low, max_high);
        uint8x8_t min2 = vbsl_u8(minmask, min_low, min_high);
        uint8x8_t max2 = vbsl_u8(maxmask, max_low, max_high);
        uint16x8_t minidxmask = vmovl_u8(minmask);
        uint16x8_t maxidxmask = vmovl_u8(maxmask);
        minidxmask = vqshlq_n_u16(minidxmask, 15);
        maxidxmask = vqshlq_n_u16(maxidxmask, 15);
        uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh);
        uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh);
        vst1_u8((uint8_t*)fmin, min2);
        vst1_u8((uint8_t*)fmax, max2);
        vst1q_u16((uint16_t*)(fminIdx), n_minIdx);
        vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx);
    }*/
    vst1q_u8(fmin, n_min);
    vst1q_u8(fmax, n_max);
    vst1q_u16(fminIdx+0, n_minIdxl);
    vst1q_u16(fmaxIdx+0, n_maxIdxl);
    vst1q_u16(fminIdx+8, n_minIdxh);
    vst1q_u16(fmaxIdx+8, n_maxIdxh);
    // Horizontal reduction; smaller index wins on equal values so the first
    // occurrence is reported.
    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];
    for (s32 j = 1; j < 16; ++j)
    {
        u8 minval = fmin[j];
        u8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }
    // Scalar tail for the last len % 16 elements.
    for(; i < (s32)len; ++i )
    {
        u8 val = src[i];
        if( val < minVal )
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if( val > maxVal )
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}
// s8 counterpart of minMaxLocBlock: min/max with first-occurrence u16 indices
// over one contiguous block of at most USHORT_BLOCK_MAX_SIZE elements.
void minMaxLocBlock(const s8 * src, u32 len,
                    s8 &minVal, u16 &minIdx,
                    s8 &maxVal, u16 &maxIdx)
{
    // Only the first 8 entries are loaded below; the array is oversized.
    u16 tmp0123[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    // Seed with element 0 (index vectors start at 0 accordingly).
    int8x16_t n_min = vdupq_n_s8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    int8x16_t n_max = vdupq_n_s8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16 = vdupq_n_u16(16);
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);
    s32 i = 0;
    // NOTE(review): len - 15 wraps for len < 15; the s32 result is negative
    // on the supported targets so the vector loop is skipped — confirm.
    s32 bound = len - (16 - 1);
    for(; i < bound; i+=16 )
    {
        internal::prefetch(src + i);
        int8x16_t line = vld1q_s8(src + i);
        // Strict compares keep the earliest occurrence per lane.
        uint8x16_t minmask = vcltq_s8(line, n_min);
        uint8x16_t maxmask = vcgtq_s8(line, n_max);
        n_min = vbslq_s8(minmask, line, n_min);
        // Widen mask halves to 16 bits; the saturating shift by 15 turns
        // 0x00ff into an all-ones vbsl select mask.
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);
        n_max = vbslq_s8(maxmask, line, n_max);
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);
        // idx[] +=16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }
    // fix high part of indexes
    // (both halves shared lineIdxOffset, so lanes 8..15 are +8).
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);
    s8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];
    vst1q_s8(fmin, n_min);
    vst1q_s8(fmax, n_max);
    vst1q_u16(fminIdx+0, n_minIdxl);
    vst1q_u16(fmaxIdx+0, n_maxIdxl);
    vst1q_u16(fminIdx+8, n_minIdxh);
    vst1q_u16(fmaxIdx+8, n_maxIdxh);
    // Horizontal reduction; smaller index wins on equal values so the first
    // occurrence is reported.
    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];
    for (s32 j = 1; j < 16; ++j)
    {
        s8 minval = fmin[j];
        s8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }
    // Scalar tail for the last len % 16 elements.
    for(; i < (s32)len; ++i )
    {
        s8 val = src[i];
        if( val < minVal )
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if( val > maxVal )
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}
} // namespace
#endif // CAROTENE_NEON
#define USHORT_BLOCK_MAX_SIZE (1 << 16)
// u8 minMaxLoc: min/max values and their locations.  Wide rows (> 128) are
// split into blocks of USHORT_BLOCK_MAX_SIZE elements so minMaxLocBlock can
// track indices in u16; narrow rows use a plain scalar scan.
void minMaxLoc(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 &minVal, size_t &minCol, size_t &minRow,
               u8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0; l < size.height; ++l)
    {
        const u8 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width > 128)
        {
            for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                u8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);
                // Both u8 extremes reached: no later element can improve, so
                // stop scanning.
                // NOTE(review): this unconditionally takes this block's
                // locations even if an earlier row/block already held the
                // same extreme value — later-than-first occurrence may be
                // reported in that case; confirm callers accept this.
                if (locMinVal == 0 && locMaxVal == 255)
                {
                    minCol = blockStart + locMinIdx;
                    maxCol = blockStart + locMaxIdx;
                    minRow = l;
                    maxRow = l;
                    minVal = 0;
                    maxVal = 255;
                    return;
                }
                else
                {
                    // Strict compares keep the first occurrence across blocks.
                    if (locMinVal < minVal)
                    {
                        minCol = blockStart + locMinIdx;
                        minRow = l;
                        minVal = locMinVal;
                    }
                    if (locMaxVal > maxVal)
                    {
                        maxCol = blockStart + locMaxIdx;
                        maxRow = l;
                        maxVal = locMaxVal;
                    }
                }
            }
        }
        else
        {
            // Narrow rows: scalar scan, first occurrence kept by strict
            // comparisons.
            for(size_t i = 0; i < size.width; ++i )
            {
                u8 val = src[i];
                if( val < minVal )
                {
                    minVal = val;
                    minCol = i;
                    minRow = l;
                }
                else if( val > maxVal )
                {
                    maxVal = val;
                    maxCol = i;
                    maxRow = l;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// s8 minMaxLoc: same block strategy as the u8 version, with the signed
// saturation bounds -128/127 used for the early exit.
void minMaxLoc(const Size2D &size,
               const s8 * srcBase, ptrdiff_t srcStride,
               s8 &minVal, size_t &minCol, size_t &minRow,
               s8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0; l < size.height; ++l)
    {
        const s8 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width > 128)
        {
            for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                s8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);
                // Both s8 extremes reached: nothing later can improve.
                // NOTE(review): as in the u8 variant this may overwrite an
                // earlier first occurrence of the same extreme value.
                if (locMinVal == -128 && locMaxVal == 127)
                {
                    minCol = blockStart + locMinIdx;
                    maxCol = blockStart + locMaxIdx;
                    minRow = l;
                    maxRow = l;
                    minVal = -128;
                    maxVal = 127;
                    return;
                }
                else
                {
                    // Strict compares keep the first occurrence across blocks.
                    if (locMinVal < minVal)
                    {
                        minCol = blockStart + locMinIdx;
                        minRow = l;
                        minVal = locMinVal;
                    }
                    if (locMaxVal > maxVal)
                    {
                        maxCol = blockStart + locMaxIdx;
                        maxRow = l;
                        maxVal = locMaxVal;
                    }
                }
            }
        }
        else
        {
            // Narrow rows: scalar scan.
            for(size_t i = 0; i < size.width; ++i )
            {
                s8 val = src[i];
                if( val < minVal )
                {
                    minVal = val;
                    minRow = l;
                    minCol = i;
                }
                else if( val > maxVal )
                {
                    maxVal = val;
                    maxRow = l;
                    maxCol = i;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <algorithm>
#include <limits>
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
// The vectorized 3x3 morphology kernel requires rows of at least 16 pixels
// and supports only constant and replicate border handling.
bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 16)
        return false;
    return border == BORDER_MODE_CONSTANT || border == BORDER_MODE_REPLICATE;
}
#ifdef CAROTENE_NEON
namespace {
// Pixel-wise minimum operator plugged into morph3x3 to implement erosion.
// Under a replicate border the neutral element for min (u8 maximum) is used
// so off-image samples can never win the comparison.
struct ErodeVecOp
{
    ErodeVecOp() : borderValue(0) {}
    ErodeVecOp(BORDER_MODE border, u8 borderValue_)
        : borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::max()
                                                      : borderValue_)
    {
    }

    // 16-lane, 8-lane and scalar flavors of the same min operation.
    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vminq_u8(a, b);
    }
    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmin_u8(a, b);
    }
    inline u8 operator()(u8 a, u8 b) const
    {
        return std::min(a, b);
    }

    u8 borderValue;
};
// Pixel-wise maximum operator plugged into morph3x3 to implement dilation.
// Under a replicate border the neutral element for max (u8 minimum) is used
// so off-image samples can never win the comparison.
struct DilateVecOp
{
    DilateVecOp() : borderValue(0) {}
    DilateVecOp(BORDER_MODE border, u8 borderValue_)
        : borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::min()
                                                      : borderValue_)
    {
    }

    // 16-lane, 8-lane and scalar flavors of the same max operation.
    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vmaxq_u8(a, b);
    }
    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmax_u8(a, b);
    }
    inline u8 operator()(u8 a, u8 b) const
    {
        return std::max(a, b);
    }

    u8 borderValue;
};
// Generic separable 3x3 morphology kernel: VecOp supplies the associative
// combine (min for erode, max for dilate) in 16-lane, 8-lane and scalar
// flavors plus the effective border value.  Each row first combines the
// three source rows vertically into 16-byte vectors, then combines each
// vector with its one-pixel-shifted neighbors horizontally; the first/last
// columns and the row tail are extrapolated per the border mode.
template <typename VecOp>
void morph3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, const VecOp & vop)
{
    u8 borderValue = vop.borderValue;
    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
    const uint8x16_t v_zero = vdupq_n_u8(0);
    const uint8x16_t v_border = vdupq_n_u8(borderValue);
    // Sliding window of vertically-combined vectors (prev/curr/next batch).
    uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // NULL row pointer marks an off-image row under a constant border;
        // replicate borders clamp to the first/last row instead.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
        u8 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // For the last two rows stop the vector loop 16 pixels early so the
        // full 16-byte loads cannot run past the image.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);
        // perform vertical convolution
        for ( ; x <= bwidth; x += 16)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);
            // Off-image rows contribute the border value.
            uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
            uint8x16_t x1 = vld1q_u8(srow1 + x);
            uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);
            // calculate values for plain CPU part below if needed
            if (x + 16 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = vop(srow1[x4],
                                vop(srow2 ? srow2[x4] : borderValue,
                                    srow0 ? srow0[x4] : borderValue));
                currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
            }
            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }
            // and calculate next value
            tnext = vop(vop(x0, x1), x2);
            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));
                continue;
            }
            // combine 3 "shifted" vectors
            // (left/right neighbors obtained by re-aligning adjacent batches;
            // the store lags one iteration behind the loads).
            t0 = vextq_u8(tprev, tcurr, 15);
            t1 = tcurr;
            t2 = vextq_u8(tcurr, tnext, 1);
            // and add them
            t0 = vop(t0, vop(t1, t2));
            vst1q_u8(drow + x - 16, t0);
        }
        // Scalar epilogue: redo from the last fully stored column.
        x -= 16;
        if (x == width)
            --x;
        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
            }
            else
                nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
                                srow0 ? srow0[x + 1] : borderValue),
                            srow1[x + 1]);
            drow[x] = vop(prevx, vop(currx, nextx));
            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
}
} // namespace
#endif
// 3x3 erosion: sliding 3x3 minimum, delegated to the generic morph3x3
// kernel with a min operator.  Asserts that the configuration is supported
// (see isMorph3x3Supported).
void erode3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
    morph3x3(size, srcBase, srcStride,
             dstBase, dstStride,
             border, ErodeVecOp(border, borderValue));
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)dstBase; (void)dstStride;
    (void)border; (void)borderValue;
#endif
}
/**
 * Public entry point for 3x3 dilation of a single-channel u8 image.
 *
 * Validates that the (size, border) combination is handled by the
 * specialized 3x3 kernel, then dispatches to the shared morph3x3
 * engine with a DilateVecOp (maximum-selecting operator).
 *
 * @param size        image dimensions
 * @param srcBase     source image base pointer
 * @param srcStride   source row stride in bytes
 * @param dstBase     destination image base pointer
 * @param dstStride   destination row stride in bytes
 * @param border      extrapolation mode for out-of-image pixels
 * @param borderValue pixel value used when border == BORDER_MODE_CONSTANT
 */
void dilate3x3(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride,
               BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
    morph3x3(size,
             srcBase, srcStride,
             dstBase, dstStride,
             border, DilateVecOp(border, borderValue));
#else
    // Non-NEON build: the configuration assert above already rejects the
    // call; silence unused-parameter warnings.
    (void)size;     (void)srcBase;  (void)srcStride;
    (void)dstBase;  (void)dstStride;
    (void)border;   (void)borderValue;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Horizontal (row) pass of the separable morphology filter.
//
// For every output element dst[i] it reduces the ksize horizontally
// adjacent same-channel elements src[i], src[i+cn], ..., src[i+(ksize-1)*cn]
// with VecUpdate (min for erode, max for dilate). The caller supplies a
// source row that is already padded on both sides, so this routine may read
// up to (ksize-1)*cn elements past `width` pixels without bounds checks.
//
// src   : padded interleaved source row
// dst   : output row, width*cn elements
// width : row width in pixels (multiplied by cn below)
// cn    : number of interleaved channels
// ksize : horizontal kernel size in pixels
template<class VecUpdate>
void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
{
    size_t i, j, k;
    // Vector-loop bounds: width rounded down to 16 / 8 pixels, expressed in
    // interleaved u8 elements (computed before width is scaled below).
    size_t width16 = (width & -16) * cn;
    size_t width8 = (width & -8) * cn;
    width *= cn; // from here on, width counts u8 elements, not pixels
    if (ksize == 1)
    {
        // 1x1 kernel degenerates to a copy.
        for (i = 0; i < width; i++)
            dst[i] = src[i];
        return;
    }
    ksize = ksize*cn; // kernel extent in elements; inner loops step by cn
    VecUpdate updateOp;
    switch(cn)
    {
    case 1:
        // Single channel: neighbors are 1 element apart, so k steps by 1.
        for (i = 0; i < width16; i += 16)
        {
            const u8* sptr = src + i;
            uint8x16_t s = vld1q_u8(sptr);
            internal::prefetch(sptr);
            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1q_u8(sptr + k));
            vst1q_u8(dst + i, s);
        }
        // 8-lane cleanup for the remaining 8..15 pixels.
        for (; i < width8; i += 8)
        {
            const u8* sptr = src + i;
            uint8x8_t s = vld1_u8(sptr);
            internal::prefetch(sptr);
            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1_u8(sptr + k));
            vst1_u8(dst + i, s);
        }
        break;
    default:
        // Multi-channel: same-channel neighbors are cn elements apart.
        for (i = 0; i < width16; i += 16)
        {
            uint8x16_t s = vld1q_u8(src + i);
            internal::prefetch(src + i);
            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1q_u8(src + i + k));
            vst1q_u8(dst + i, s);
        }
        for (; i < width8; i += 8)
        {
            uint8x8_t s = vld1_u8(src + i);
            internal::prefetch(src + i);
            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1_u8(src + i + k));
            vst1_u8(dst + i, s);
        }
        break;
    }
    // Scalar tail starting at the first element the vector loops did not
    // produce; processed independently per channel.
    ptrdiff_t i0 = i;
    for( k = 0; k < (size_t)cn; k++, src++, dst++ )
    {
        // Two pixels per iteration: their windows [0..ksize-cn] and
        // [cn..ksize] share the interior reduction m over s[cn..ksize-cn].
        for( i = i0; i <= width - cn*2; i += cn*2 )
        {
            const u8* s = src + i;
            u8 m = s[cn];
            for( j = cn*2; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = updateOp(m, s[0]);
            // After the loop j == ksize, so s[j] is the last element of the
            // second pixel's window.
            dst[i+cn] = updateOp(m, s[j]);
        }
        // Last (possibly single) pixel: plain reduction of its full window.
        for( ; i < width; i += cn )
        {
            const u8* s = src + i;
            u8 m = s[0];
            for( j = cn; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = m;
        }
    }
}
// Vertical (column) pass of the separable morphology filter.
//
// src points to an array of row pointers (rows already processed by
// MorphRow); for each output row it reduces ksize consecutive source rows
// element-wise with VecUpdate. Output rows are produced two at a time where
// possible: rows dy and dy+1 share the reduction of the ksize-1 interior
// rows src[1..ksize-1], and differ only by src[0] vs src[ksize].
//
// src     : array of at least count + ksize - 1 row pointers
// dst     : destination base, advanced by dststep per output row
// dststep : destination row stride in bytes
// count   : number of output rows to produce
// width   : row length in u8 elements
// ksize   : vertical kernel size
template<class VecUpdate>
void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
{
    size_t i, k;
    size_t width32 = width & -32; // vector loop handles 32 elements per step
    VecUpdate updateOp;
    uint8x16_t x0,x1,s0,s1;
    if (ksize == 3)
    {
        // Fully unrolled 3-row kernel (the common 3x3 case).
        for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
        {
            for (i = 0; i < width32; i += 32)
            {
                // Shared middle rows: s = op(src[1], src[2]).
                const u8* sptr = src[1] + i;
                s0 = vld1q_u8(sptr);
                s1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                sptr = src[2] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);
                // First output row combines the shared result with src[0].
                sptr = src[0] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst+i, updateOp(s0, x0));
                vst1q_u8(dst+i+16, updateOp(s1, x1));
                // Second output row combines the shared result with src[3].
                sptr = src[3] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
            }
            // Scalar tail; after the inner loop k == ksize, so src[k] is the
            // extra row belonging to the second output row.
            for(; i < width; i++ )
            {
                u8 s = src[1][i];
                for( k = 2; k < ksize; k++ )
                    s = updateOp(s, src[k][i]);
                dst[i] = updateOp(s, src[0][i]);
                dst[i+dststep] = updateOp(s, src[k][i]);
            }
        }
    }
    else if (ksize > 1)
        // General kernel height, same two-rows-at-a-time scheme.
        for (; count > 1; count -= 2, dst += dststep*2, src += 2)
        {
            for (i = 0; i < width32; i += 32)
            {
                // Reduce the shared interior rows src[1..ksize-1].
                const u8* sptr = src[1] + i;
                s0 = vld1q_u8(sptr);
                s1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                for (k = 2; k < ksize; k++)
                {
                    sptr = src[k] + i;
                    x0 = vld1q_u8(sptr);
                    x1 = vld1q_u8(sptr + 16);
                    internal::prefetch(sptr);
                    s0 = updateOp(s0, x0);
                    s1 = updateOp(s1, x1);
                }
                // Row dy: shared result + src[0].
                sptr = src[0] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst+i, updateOp(s0, x0));
                vst1q_u8(dst+i+16, updateOp(s1, x1));
                // Row dy+1: shared result + src[ksize] (k == ksize here).
                sptr = src[k] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
            }
            for(; i < width; i++ )
            {
                u8 s = src[1][i];
                for( k = 2; k < ksize; k++ )
                    s = updateOp(s, src[k][i]);
                dst[i] = updateOp(s, src[0][i]);
                dst[i+dststep] = updateOp(s, src[k][i]);
            }
        }
    // Remaining single row (odd count), and the whole job when ksize == 1.
    for (; count > 0; count--, dst += dststep, src++)
    {
        for (i = 0; i < width32; i += 32)
        {
            const u8* sptr = src[0] + i;
            s0 = vld1q_u8(sptr);
            s1 = vld1q_u8(sptr + 16);
            internal::prefetch(sptr);
            for (k = 1; k < ksize; k++)
            {
                sptr = src[k] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);
            }
            vst1q_u8(dst + i, s0);
            vst1q_u8(dst + i + 16, s1);
        }
        for(; i < width; i++ )
        {
            u8 s = src[0][i];
            for( k = 1; k < ksize; k++ )
                s = updateOp(s, src[k][i]);
            dst[i] = s;
        }
    }
}
// Separable morphology engine for arbitrary kernel sizes and anchors.
//
// Runs MorphRow<Op> over each needed source row into a ring buffer of
// pre-filtered rows, then MorphColumn<Op> over windows of those rows to
// produce the output. Horizontal border extrapolation is resolved while
// copying each source row into `srcRow`; vertical extrapolation is resolved
// by internal::borderInterpolate when gathering row pointers.
//
// ssize            : processed ROI size
// cn               : channel count
// ksize            : kernel size; anchorX/anchorY locate the anchor in it
// rowBorderType /
// columnBorderType : horizontal / vertical extrapolation modes
// borderValues     : per-channel constant border (BORDER_MODE_CONSTANT only)
// borderMargin     : real image pixels available around the ROI, used before
//                    falling back to extrapolation
template <class Op>
inline void morphology(const Size2D &ssize, u32 cn,
                       const u8 * srcBase, ptrdiff_t srcStride,
                       u8 * dstBase, ptrdiff_t dstStride,
                       const Size2D &ksize,
                       size_t anchorX, size_t anchorY,
                       BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
                       const u8 * borderValues, Margin borderMargin)
{
    //Temporary buffers common for all iterations

    // One source row including left/right padding for the row filter.
    std::vector<u8> _srcRow(cn*(ssize.width + ksize.width - 1));
    u8* srcRow = &_srcRow[0];
    // Ring-buffer depth: enough rows for the column filter window plus slack.
    size_t bufRows = std::max<size_t>(ksize.height + 3, std::max<size_t>(anchorY, ksize.height-anchorY-1)*2+1);
    std::vector<u8*> _rows(bufRows);
    u8** rows = &_rows[0];
    // adjust swidthcn so that the used part of buffers stays compact in memory
    ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size)
    std::vector<u8> _ringBuf(swidthcn*bufRows+16);
    u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);
    // Index table mapping extrapolated left/right pixels to real positions.
    size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
    std::vector<ptrdiff_t> _borderTab(borderLength);
    ptrdiff_t * borderTab = &_borderTab[0];
    std::vector<u8> _constBorderValue;
    std::vector<u8> _constBorderRow;
    u8 * constBorderValue = NULL;
    u8 * constBorderRow = NULL;
    if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT )
    {
        // Replicate the per-channel border values across borderLength elements.
        _constBorderValue.resize(borderLength);
        constBorderValue = &_constBorderValue[0];
        size_t i;
        for(i = 0; i < cn; i++)
            constBorderValue[i] = borderValues[i];
        for(; i < borderLength; i++)
            constBorderValue[i] = constBorderValue[i-cn];
        if( columnBorderType == BORDER_MODE_CONSTANT )
        {
            // Pre-filter one all-border row; it stands in for any source row
            // that lies entirely outside the image vertically.
            _constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16));
            constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
            size_t N = (ssize.width + ksize.width - 1)*cn;
            for( i = 0; i < N; i += borderLength )
            {
                size_t n = std::min( borderLength, N - i );
                for(size_t j = 0; j < n; j++)
                    srcRow[i+j] = constBorderValue[j];
            }
            MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
        }
    }
    // Full image extent including the margin of real pixels around the ROI.
    Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
                     ssize.height + borderMargin.top + borderMargin.bottom);
    // dx1/dx2: how many left/right pixels must be synthesized after the
    // available margin has been used up.
    ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
    ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);
    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_MODE_CONSTANT )
        {
            // Constant mode: padding bytes never change, fill them once.
            memcpy( srcRow, &constBorderValue[0], dx1*cn );
            memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn );
        }
        else
        {
            // Other modes: build per-element source offsets via
            // borderInterpolate, shifted so they index the adjusted src.
            ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;
            ptrdiff_t wholeWidth = wholeSize.width;
            ptrdiff_t i, j;
            for( i = 0; i < dx1; i++ )
            {
                ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn;
                for( j = 0; j < (ptrdiff_t)cn; j++ )
                    borderTab[i*cn + j] = p0 + j;
            }
            for( i = 0; i < dx2; i++ )
            {
                ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn;
                for( j = 0; j < (ptrdiff_t)cn; j++ )
                    borderTab[(i + dx1)*cn + j] = p0 + j;
            }
        }
    }
    // Vertical range of source rows (in whole-image coordinates) that must
    // pass through the row filter.
    ptrdiff_t startY, startY0, endY, rowCount;
    startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
    endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);
    const u8* src = srcBase + (startY - borderMargin.top)*srcStride;
    u8* dst = dstBase;
    ptrdiff_t width = ssize.width, kwidth = ksize.width;
    ptrdiff_t kheight = ksize.height, ay = anchorY;
    ptrdiff_t width1 = ssize.width + kwidth - 1;
    ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
    bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
    ptrdiff_t dy = 0, i = 0;
    // Shift src left so the margin pixels usable on the left are included.
    src -= xofs1*cn;
    ptrdiff_t count = endY - startY;
    rowCount = 0;
    // Main loop: alternately (1) row-filter a batch of source rows into the
    // ring buffer, (2) column-filter every output row whose full vertical
    // window is now buffered.
    for(;; dst += dstStride*i, dy += i)
    {
        // Heuristic batch size for this iteration, clamped to what is left.
        // NOTE(review): first pass fills the buffer up to the anchor; later
        // passes refill bufRows - kheight + 1 rows — confirm against upstream.
        ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcStride )
        {
            // Ring-buffer slot for this source row.
            ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
            u8* brow = ringBuf + bi*swidthcn;
            if( (size_t)(++rowCount) > bufRows )
            {
                // Buffer full: the oldest buffered row is overwritten.
                --rowCount;
                ++startY;
            }
            // Copy the real pixels, then synthesize the padding.
            memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn );
            if( makeBorder )
            {
                for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ )
                    srcRow[i] = src[borderTab[i]];
                for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ )
                    srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]];
            }
            MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
        }
        // Gather row pointers for as many consecutive output rows as the
        // buffered data allows.
        ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
                                                         wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                rows[i] = constBorderRow;
            else
            {
                if( srcY >= startY + rowCount )
                    break; // row not filtered yet; wait for the next batch
                ptrdiff_t bi = (srcY - startY0) % bufRows;
                rows[i] = ringBuf + bi*swidthcn;
            }
        }
        if( i < kheight )
            break; // not even one complete vertical window remains: done
        i -= kheight - 1; // number of complete output rows available
        MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height);
    }
}
} // namespace
#endif // CAROTENE_NEON
/**
 * Public entry point for erosion with an arbitrary rectangular kernel.
 *
 * Checks the configuration (non-empty ROI, anchor inside the kernel) and
 * forwards to the generic separable morphology engine instantiated with
 * ErodeVecOp (minimum-selecting operator).
 *
 * @param ssize            processed ROI size
 * @param cn               channel count
 * @param srcBase/srcStride source image base pointer and row stride
 * @param dstBase/dstStride destination image base pointer and row stride
 * @param ksize            kernel size
 * @param anchorX/anchorY  kernel anchor position
 * @param rowBorderType    horizontal border extrapolation mode
 * @param columnBorderType vertical border extrapolation mode
 * @param borderValues     per-channel constant border values
 * @param borderMargin     real pixels available around the ROI
 */
void erode(const Size2D &ssize, u32 cn,
           const u8 * srcBase, ptrdiff_t srcStride,
           u8 * dstBase, ptrdiff_t dstStride,
           const Size2D &ksize,
           size_t anchorX, size_t anchorY,
           BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
           const u8 * borderValues, Margin borderMargin)
{
    bool configOk = ssize.width > 0 && ssize.height > 0 &&
                    anchorX < ksize.width && anchorY < ksize.height;
    internal::assertSupportedConfiguration(configOk);
#ifdef CAROTENE_NEON
    morphology<ErodeVecOp>(ssize, cn,
                           srcBase, srcStride,
                           dstBase, dstStride,
                           ksize, anchorX, anchorY,
                           rowBorderType, columnBorderType,
                           borderValues, borderMargin);
#else
    // Non-NEON build: nothing to do beyond the assert; silence warnings.
    (void)cn;
    (void)srcBase;  (void)srcStride;
    (void)dstBase;  (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
/**
 * Public entry point for dilation with an arbitrary rectangular kernel.
 *
 * Checks the configuration (non-empty ROI, anchor inside the kernel) and
 * forwards to the generic separable morphology engine instantiated with
 * DilateVecOp (maximum-selecting operator).
 *
 * @param ssize            processed ROI size
 * @param cn               channel count
 * @param srcBase/srcStride source image base pointer and row stride
 * @param dstBase/dstStride destination image base pointer and row stride
 * @param ksize            kernel size
 * @param anchorX/anchorY  kernel anchor position
 * @param rowBorderType    horizontal border extrapolation mode
 * @param columnBorderType vertical border extrapolation mode
 * @param borderValues     per-channel constant border values
 * @param borderMargin     real pixels available around the ROI
 */
void dilate(const Size2D &ssize, u32 cn,
            const u8 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            const Size2D &ksize,
            size_t anchorX, size_t anchorY,
            BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
            const u8 * borderValues, Margin borderMargin)
{
    bool configOk = ssize.width > 0 && ssize.height > 0 &&
                    anchorX < ksize.width && anchorY < ksize.height;
    internal::assertSupportedConfiguration(configOk);
#ifdef CAROTENE_NEON
    morphology<DilateVecOp>(ssize, cn,
                            srcBase, srcStride,
                            dstBase, dstStride,
                            ksize, anchorX, anchorY,
                            rowBorderType, columnBorderType,
                            borderValues, borderMargin);
#else
    // Non-NEON build: nothing to do beyond the assert; silence warnings.
    (void)cn;
    (void)srcBase;  (void)srcStride;
    (void)dstBase;  (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
} // namespace CAROTENE_NS
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment