// Commit fbd3199c authored by fengzch-das's avatar fengzch-das
// Browse files
//
// Initial commit
//
// parents
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
/*
 * CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)
 *
 * Expands to convertScale(): dst[i] = saturate_cast<T2>(src[i] * alpha + beta)
 * for distinct source/destination element types T1 -> T2.
 *  - When both strides equal the row width the image is collapsed into a
 *    single contiguous row (size.width *= size.height) so one long vector
 *    loop replaces many short ones.
 *  - CVTINIT runs once before the row loop (it typically materializes the
 *    vscale/vshift vectors); CVTROW is the vectorized per-row body covering
 *    the first w elements, where w = size.width rounded down to a multiple
 *    of SIMD_SIZE.
 *  - The trailing scalar loop finishes the remaining (size.width - w)
 *    elements of each row via internal::saturate_cast.
 * (No comments may appear inside the macro body: a // comment before a
 * trailing backslash would absorb the line continuation.)
 */
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
void convertScale(const Size2D &_size, \
const T1 * srcBase, ptrdiff_t srcStride, \
T2 * dstBase, ptrdiff_t dstStride, \
f64 alpha, f64 beta) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (srcStride == dstStride && \
srcStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
const ptrdiff_t sstep = srcStride / sizeof(T1); \
const ptrdiff_t dstep = dstStride / sizeof(T2); \
const size_t w = size.width & ~(SIMD_SIZE-1); \
if (size.width >= SIMD_SIZE) \
{ \
const T1* _src = srcBase; \
T2* _dst = dstBase; \
CVTINIT \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
CVTROW \
} \
if(w < size.width) \
{ \
const T1* _src = srcBase; \
T2* _dst = dstBase; \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
for(size_t i = w; i < size.width; i++ ) \
_dst[i] = internal::saturate_cast<T2>(_src[i]*alpha + beta); \
} \
}
/*
 * CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW)
 * Same expansion as CVTS_FUNC but for the same-type overload (T1 -> T1);
 * kept separate so convertScale(T1*, T1*) does not collide with the
 * two-type template above when T1 == T2.
 */
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \
void convertScale(const Size2D &_size, \
const T1 * srcBase, ptrdiff_t srcStride, \
T1 * dstBase, ptrdiff_t dstStride, \
f64 alpha, f64 beta) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (srcStride == dstStride && \
srcStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
const ptrdiff_t sstep = srcStride / sizeof(T1); \
const ptrdiff_t dstep = dstStride / sizeof(T1); \
const size_t w = size.width & ~(SIMD_SIZE-1); \
if (size.width >= SIMD_SIZE) \
{ \
const T1* _src = srcBase; \
T1* _dst = dstBase; \
CVTSINIT \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
CVTSROW \
} \
if(w < size.width) \
{ \
const T1* _src = srcBase; \
T1* _dst = dstBase; \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
for(size_t i = w; i < size.width; i++ ) \
_dst[i] = internal::saturate_cast<T1>(_src[i]*alpha + beta); \
} \
}
#else
/*
 * NEON unavailable: the macros expand to stubs whose only effect is
 * internal::assertSupportedConfiguration() (which rejects the call);
 * all parameters are intentionally unnamed to avoid unused warnings.
 */
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
void convertScale(const Size2D &, \
const T1 *, ptrdiff_t, \
T2 *, ptrdiff_t, \
f64, f64) \
{ \
internal::assertSupportedConfiguration(); \
}
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \
void convertScale(const Size2D &, \
const T1 *, ptrdiff_t, \
T1 *, ptrdiff_t, \
f64, f64) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> u8, 32-bit ARM GCC path: hand-scheduled inline asm. vscale/vshift are
// pinned to q0/q1 so the asm template can use them by name; per iteration the
// asm widens u8->u32, converts to f32, computes x*alpha + (beta+0.5), truncates
// back to s32, then saturating-narrows s32->u16->u8.
// NOTE(review): the +0.5f bias plus truncating vcvt gives round-half-up only
// for non-negative results; negative intermediates truncate toward zero —
// presumably matches the scalar tail's saturate_cast, TODO confirm.
CVTS_FUNC1(u8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovun.s32 d22, q7 \n\t"
"vqmovun.s32 d23, q8 \n\t"
"vqmovun.s32 d24, q9 \n\t"
"vqmovun.s32 d25, q10 \n\t"
"vqmovn.u16 d26, q11 \n\t"
"vqmovn.u16 d27, q12 \n\t"
"vst1.8 {d26-d27}, [%[dst1]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> u8, generic path (AArch64 / clang / non-GCC): NEON intrinsics
// implementing the same widen -> f32 -> scale+shift -> s32 -> saturating
// narrow pipeline as the asm above.
CVTS_FUNC1(u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> s8, 32-bit ARM GCC path: same pipeline as the u8->u8 kernel but the
// final narrowing is signed-saturating (vqmovn.s32 / vqmovn.s16) so results
// clamp to [-128, 127].
CVTS_FUNC(u8, s8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovn.s32 d22, q7 \n\t"
"vqmovn.s32 d23, q8 \n\t"
"vqmovn.s32 d24, q9 \n\t"
"vqmovn.s32 d25, q10 \n\t"
"vqmovn.s16 d26, q11 \n\t"
"vqmovn.s16 d27, q12 \n\t"
"vst1.8 {d26-d27}, [%[dst1]] \n\t"
: //no output
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> s8, generic path: NEON intrinsics, signed saturating narrow
// (vqmovn_s32 / vqmovn_s16) mirroring the asm variant above.
CVTS_FUNC(u8, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> u16, 32-bit ARM GCC path: narrows only once (s32 -> u16 via
// vqmovun.s32) and stores two 8-element u16 vectors per 16-pixel iteration.
CVTS_FUNC(u8, u16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovun.s32 d22, q7 \n\t"
"vqmovun.s32 d23, q8 \n\t"
"vqmovun.s32 d24, q9 \n\t"
"vqmovun.s32 d25, q10 \n\t"
"vst1.16 {d22-d23}, [%[dst1]] \n\t"
"vst1.16 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 8),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> u16, generic path: NEON intrinsics; unsigned saturating narrow
// (vqmovun_s32) and two 8-lane stores per 16-pixel iteration.
CVTS_FUNC(u8, u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> s16, 32-bit ARM GCC path: like u8->u16 but with signed saturating
// narrow (vqmovn.s32), clamping to [-32768, 32767].
CVTS_FUNC(u8, s16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovn.s32 d22, q7 \n\t"
"vqmovn.s32 d23, q8 \n\t"
"vqmovn.s32 d24, q9 \n\t"
"vqmovn.s32 d25, q10 \n\t"
"vst1.16 {d22-d23}, [%[dst1]] \n\t"
"vst1.16 {d24-d25}, [%[dst2]] \n\t"
: //no output
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 8),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> s16, generic path: NEON intrinsics; signed saturating narrow
// (vqmovn_s32), two 8-lane stores per 16-pixel iteration.
CVTS_FUNC(u8, s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u8 -> s32: no narrowing — the s32 results of vcvt are stored directly
// (four 4-lane stores per 16-pixel iteration). The asm variant is limited to
// GCC 4.x (< 4.7, non-clang); newer compilers take the intrinsics path below.
// NOTE(review): presumably the tighter guard works around a register
// allocation issue in later compilers — rationale not visible here.
CVTS_FUNC(u8, s32, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vst1.32 {d14-d15}, [%[dst1]] \n\t"
"vst1.32 {d16-d17}, [%[dst2]] \n\t"
"vst1.32 {d18-d19}, [%[dst3]] \n\t"
"vst1.32 {d20-d21}, [%[dst4]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
[dst3] "r" (_dst + i + 8),
[dst4] "r" (_dst + i + 12),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10",
"d11","d12","d13","d14","d15","d16","d17",
"d18","d19","d20","d21","d22","d23","d24",
"d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> s32, generic path: NEON intrinsics; converted s32 vectors are stored
// directly, no narrowing step.
CVTS_FUNC(u8, s32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
vst1q_s32(_dst + i + 8, vline3_s32);
vst1q_s32(_dst + i + 12, vline4_s32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u8 -> f32: the result stays in floating point, so vshift uses beta without
// the +0.5f rounding bias used by the integer-destination kernels, and there
// is no final vcvt/narrow — the f32 sums are stored directly.
CVTS_FUNC(u8, f32, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.u8 q3, d4 \n\t"
"vmovl.u8 q4, d5 \n\t"
"vmovl.u16 q5, d6 \n\t"
"vmovl.u16 q6, d7 \n\t"
"vmovl.u16 q7, d8 \n\t"
"vmovl.u16 q8, d9 \n\t"
"vcvt.f32.u32 q9, q5 \n\t"
"vcvt.f32.u32 q10, q6 \n\t"
"vcvt.f32.u32 q11, q7 \n\t"
"vcvt.f32.u32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
"vst1.32 {d10-d11}, [%[dst3]] \n\t"
"vst1.32 {d12-d13}, [%[dst4]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
[dst3] "r" (_dst + i + 8),
[dst4] "r" (_dst + i + 12),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10",
"d11","d12","d13","d14","d15","d16","d17",
"d18","d19","d20","d21","d22","d23","d24",
"d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// u8 -> f32, generic path: NEON intrinsics; stores the scaled f32 vectors
// directly (vshift = beta, no rounding bias needed for a float destination).
CVTS_FUNC(u8, f32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
uint8x16_t vline = vld1q_u8(_src + i);
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vst1q_f32(_dst + i + 0, vline1_f32);
vst1q_f32(_dst + i + 4, vline2_f32);
vst1q_f32(_dst + i + 8, vline3_f32);
vst1q_f32(_dst + i + 12, vline4_f32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// s8 -> u8, 32-bit ARM GCC path: signed widening (vmovl.s8/.s16,
// vcvt.f32.s32) then unsigned saturating narrow (vqmovun.s32 / vqmovn.u16),
// clamping results to [0, 255].
CVTS_FUNC(s8, u8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovun.s32 d22, q7 \n\t"
"vqmovun.s32 d23, q8 \n\t"
"vqmovun.s32 d24, q9 \n\t"
"vqmovun.s32 d25, q10 \n\t"
"vqmovn.u16 d26, q11 \n\t"
"vqmovn.u16 d27, q12 \n\t"
"vst1.8 {d26-d27}, [%[dst1]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// s8 -> u8, generic path: NEON intrinsics mirroring the asm variant; the
// int32 results are reused in place (vlineN_s32 reassigned after the f32
// math) before the unsigned saturating narrow.
CVTS_FUNC(s8, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// s8 -> s8 (same-type overload via CVTS_FUNC1), 32-bit ARM GCC path: signed
// widen, f32 scale+shift, then signed saturating narrow s32 -> s16 -> s8.
CVTS_FUNC1(s8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovn.s32 d22, q7 \n\t"
"vqmovn.s32 d23, q8 \n\t"
"vqmovn.s32 d24, q9 \n\t"
"vqmovn.s32 d25, q10 \n\t"
"vqmovn.s16 d26, q11 \n\t"
"vqmovn.s16 d27, q12 \n\t"
"vst1.8 {d26-d27}, [%[dst1]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// s8 -> s8, generic path: NEON intrinsics; signed saturating narrow
// (vqmovn_s32 / vqmovn_s16) back to s8.
CVTS_FUNC1(s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// s8 -> u16, 32-bit ARM GCC path: signed widen, f32 scale+shift, then a
// single unsigned saturating narrow (vqmovun.s32) to u16; negative results
// clamp to 0.
CVTS_FUNC(s8, u16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovun.s32 d22, q7 \n\t"
"vqmovun.s32 d23, q8 \n\t"
"vqmovun.s32 d24, q9 \n\t"
"vqmovun.s32 d25, q10 \n\t"
"vst1.16 {d22-d23}, [%[dst1]] \n\t"
"vst1.16 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 8),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// s8 -> u16, generic path: NEON intrinsics; vqmovun_s32 clamps negatives to
// zero, two 8-lane stores per 16-pixel iteration.
CVTS_FUNC(s8, u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u16(_dst + i + 0, vRes1_u16);
vst1q_u16(_dst + i + 8, vRes2_u16);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// s8 -> s16, 32-bit ARM GCC path: signed widen, f32 scale+shift, then signed
// saturating narrow (vqmovn.s32) to s16.
CVTS_FUNC(s8, s16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vqmovn.s32 d22, q7 \n\t"
"vqmovn.s32 d23, q8 \n\t"
"vqmovn.s32 d24, q9 \n\t"
"vqmovn.s32 d25, q10 \n\t"
"vst1.16 {d22-d23}, [%[dst1]] \n\t"
"vst1.16 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 8),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// s8 -> s16, generic path: NEON intrinsics; signed saturating narrow
// (vqmovn_s32), two 8-lane stores per 16-pixel iteration.
CVTS_FUNC(s8, s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s16(_dst + i + 0, vRes1_s16);
vst1q_s16(_dst + i + 8, vRes2_s16);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s8 -> s32 scaled conversion: dst = (s32)(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s8, s32, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
// Widen 16 s8 lanes to four s32 vectors, apply x*alpha+beta in f32,
// convert back to s32 and store four 4-lane results.
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vcvt.s32.f32 q7, q3 \n\t"
"vcvt.s32.f32 q8, q4 \n\t"
"vcvt.s32.f32 q9, q5 \n\t"
"vcvt.s32.f32 q10, q6 \n\t"
"vst1.32 {d14-d15}, [%[dst1]] \n\t"
"vst1.32 {d16-d17}, [%[dst2]] \n\t"
"vst1.32 {d18-d19}, [%[dst3]] \n\t"
"vst1.32 {d20-d21}, [%[dst4]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
[dst3] "r" (_dst + i + 8),
[dst4] "r" (_dst + i + 12),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10",
"d11","d12","d13","d14","d15","d16","d17",
"d18","d19","d20","d21","d22","d23","d24",
"d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s8, s32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
// Widen s8 -> s16 -> s32, transform in f32, convert back, store raw s32.
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
vst1q_s32(_dst + i + 8, vline3_s32);
vst1q_s32(_dst + i + 12, vline4_s32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s8 -> f32 scaled conversion: dst = src * alpha + beta.
// No 0.5 bias here — the destination stays in floating point.
CVTS_FUNC(s8, f32, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
// Widen 16 s8 lanes to four f32 vectors and store the affine result.
__asm__ (
"vld1.8 {d4-d5}, [%[src]] \n\t"
"vmovl.s8 q3, d4 \n\t"
"vmovl.s8 q4, d5 \n\t"
"vmovl.s16 q5, d6 \n\t"
"vmovl.s16 q6, d7 \n\t"
"vmovl.s16 q7, d8 \n\t"
"vmovl.s16 q8, d9 \n\t"
"vcvt.f32.s32 q9, q5 \n\t"
"vcvt.f32.s32 q10, q6 \n\t"
"vcvt.f32.s32 q11, q7 \n\t"
"vcvt.f32.s32 q12, q8 \n\t"
"vmul.f32 q13, q9, q0 \n\t"
"vmul.f32 q14, q10, q0 \n\t"
"vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t"
"vadd.f32 q3, q13, q1 \n\t"
"vadd.f32 q4, q14, q1 \n\t"
"vadd.f32 q5, q15, q1 \n\t"
"vadd.f32 q6, q2, q1 \n\t"
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
"vst1.32 {d10-d11}, [%[dst3]] \n\t"
"vst1.32 {d12-d13}, [%[dst4]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
[dst3] "r" (_dst + i + 8),
[dst4] "r" (_dst + i + 12),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10",
"d11","d12","d13","d14","d15","d16","d17",
"d18","d19","d20","d21","d22","d23","d24",
"d25","d26","d27","d28","d29","d30","d31"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s8, f32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
internal::prefetch(_src + i);
// Widen s8 -> s16 -> s32 -> f32, then apply x*alpha + beta and store.
int8x16_t vline = vld1q_s8(_src + i);
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline3_f32 = vmulq_f32(vline3_f32, vscale);
vline4_f32 = vmulq_f32(vline4_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vst1q_f32(_dst + i + 0, vline1_f32);
vst1q_f32(_dst + i + 4, vline2_f32);
vst1q_f32(_dst + i + 8, vline3_f32);
vst1q_f32(_dst + i + 12, vline4_f32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> u8 scaled conversion: dst = saturate_u8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(u16, u8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Load 8 u16 lanes (vld1.8 moves the same 16 bytes), widen to u32,
// transform in f32, then saturate-narrow s32 -> s16 -> u8.
__asm__ (
"vld1.8 {d4-d5}, [%[src1]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vqmovun.s16 d28, q13 \n\t"
"vst1.8 {d28}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Narrow with signed saturation to s16, then unsigned saturation to u8.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
vst1_u8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s8 scaled conversion: dst = saturate_s8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(u16, s8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Load 8 u16 lanes, widen to u32, transform in f32, then
// saturate-narrow s32 -> s16 -> s8 (vqmovn.s16, signed).
__asm__ (
"vld1.8 {d4-d5}, [%[src1]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vqmovn.s16 d28, q13 \n\t"
"vst1.8 {d28}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Narrow with signed saturation: s32 -> s16 -> s8.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
vst1_s8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> u16 scaled conversion (in-type): dst = saturate_u16(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC1(u16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 u16 lanes to u32, transform in f32, then unsigned
// saturate-narrow s32 -> u16.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovun.s32 d26, q11 \n\t"
"vqmovun.s32 d27, q12 \n\t"
"vst1.16 {d26-d27}, [%[dst]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vshift), "w" (vscale)
// BUGFIX: the vld1 above writes d4/d5 (q2); they were missing from the
// clobber list, letting the compiler assume q2 survives the asm.
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC1(u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Unsigned saturating narrow back to u16.
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s16 scaled conversion: dst = saturate_s16(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(u16, s16, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 u16 lanes to u32, transform in f32, then signed
// saturate-narrow s32 -> s16.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vst1.16 {d26-d27}, [%[dst]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vshift), "w" (vscale)
// BUGFIX: the vld1 above writes d4/d5 (q2); they were missing from the
// clobber list, letting the compiler assume q2 survives the asm.
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, s16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Signed saturating narrow to s16.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s32 scaled conversion: dst = (s32)(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(u16, s32, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 u16 lanes to u32, transform in f32, convert to s32 and
// store two 4-lane results.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vst1.32 {d22-d23}, [%[dst1]] \n\t"
"vst1.32 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i),
[dst2] "r" (_dst + i + 4),
"w" (vshift), "w" (vscale)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, s32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> f32 scaled conversion: dst = src * alpha + beta.
// No 0.5 bias here — the destination stays in floating point.
CVTS_FUNC(u16, f32, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 u16 lanes to two f32 vectors and store the affine result.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.u16 q3, d4 \n\t"
"vmovl.u16 q4, d5 \n\t"
"vcvt.f32.u32 q5, q3 \n\t"
"vcvt.f32.u32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vst1.32 {d18-d19}, [%[dst1]] \n\t"
"vst1.32 {d20-d21}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(u16, f32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
uint16x8_t vline = vld1q_u16(_src + i);
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vst1q_f32(_dst + i + 0, vline1_f32);
vst1q_f32(_dst + i + 4, vline2_f32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> u8 scaled conversion: dst = saturate_u8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s16, u8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, then saturate-narrow
// s32 -> s16 -> u8 (final narrow is unsigned-saturating).
__asm__ (
"vld1.8 {d4-d5}, [%[src1]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vqmovun.s16 d28, q13 \n\t"
"vst1.8 {d28}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Narrow with signed saturation to s16, then unsigned saturation to u8.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
vst1_u8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> s8 scaled conversion: dst = saturate_s8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s16, s8, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, then signed
// saturate-narrow s32 -> s16 -> s8.
__asm__ (
"vld1.8 {d4-d5}, [%[src1]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vqmovn.s16 d28, q13 \n\t"
"vst1.8 {d28}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Narrow with signed saturation: s32 -> s16 -> s8.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
vst1_s8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> u16 scaled conversion: dst = saturate_u16(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s16, u16, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, then unsigned
// saturate-narrow s32 -> u16 (vqmovun).
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovun.s32 d26, q11 \n\t"
"vqmovun.s32 d27, q12 \n\t"
"vst1.16 {d26-d27}, [%[dst]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, u16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Unsigned saturating narrow to u16 (negatives clamp to 0).
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> s16 scaled conversion (in-type): dst = saturate_s16(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC1(s16, 16,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, then signed
// saturate-narrow s32 -> s16.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vqmovn.s32 d26, q11 \n\t"
"vqmovn.s32 d27, q12 \n\t"
"vst1.16 {d26-d27}, [%[dst]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst] "r" (_dst + i + 0),
"w" (vshift), "w" (vscale)
// BUGFIX: the vld1 above writes d4/d5 (q2); they were missing from the
// clobber list, letting the compiler assume q2 survives the asm.
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC1(s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Signed saturating narrow back to s16.
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> s32 scaled conversion: dst = (s32)(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s16, s32, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to s32, transform in f32, convert to s32 and
// store two 4-lane results.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vcvt.s32.f32 q12, q10 \n\t"
"vst1.32 {d22-d23}, [%[dst1]] \n\t"
"vst1.32 {d24-d25}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, s32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> f32 scaled conversion: dst = src * alpha + beta.
// No 0.5 bias here — the destination stays in floating point.
CVTS_FUNC(s16, f32, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Widen 8 s16 lanes to two f32 vectors and store the affine result.
__asm__ (
"vld1.16 {d4-d5}, [%[src]] \n\t"
"vmovl.s16 q3, d4 \n\t"
"vmovl.s16 q4, d5 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vcvt.f32.s32 q6, q4 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vmul.f32 q8, q6, q0 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vadd.f32 q10, q8, q1 \n\t"
"vst1.32 {d18-d19}, [%[dst1]] \n\t"
"vst1.32 {d20-d21}, [%[dst2]] \n\t"
: /*no output*/
: [src] "r" (_src + i),
[dst1] "r" (_dst + i + 0),
[dst2] "r" (_dst + i + 4),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s16, f32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int16x8_t vline = vld1q_s16(_src + i);
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vst1q_f32(_dst + i + 0, vline1_f32);
vst1q_f32(_dst + i + 4, vline2_f32);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s32 -> u8 scaled conversion: dst = saturate_u8(src * alpha + beta).
// 0.5 in vshift biases the truncating f32->s32 convert toward
// round-to-nearest for non-negative results.
CVTS_FUNC(s32, u8, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
// Two s32 loads, transform in f32, then saturate-narrow
// s32 -> u16 (vqmovun) -> u8 (vqmovn.u16).
__asm__ (
"vld1.32 {d4-d5}, [%[src1]] \n\t"
"vld1.32 {d6-d7}, [%[src2]] \n\t"
"vcvt.f32.s32 q4, q2 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vmul.f32 q6, q4, q0 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vadd.f32 q8, q6, q1 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vcvt.s32.f32 q10, q8 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vqmovun.s32 d24, q10 \n\t"
"vqmovun.s32 d25, q11 \n\t"
"vqmovn.u16 d26, q12 \n\t"
"vst1.8 {d26}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i + 0),
[src2] "r" (_src + i + 4),
[dst] "r" (_dst + i),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
);
}
})
#else
// Intrinsics fallback — same pipeline as the asm path above.
CVTS_FUNC(s32, u8, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
vline1_f32 = vmulq_f32(vline1_f32, vscale);
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
// Unsigned-saturating narrow to u16, then plain u16 -> u8 saturation.
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
vst1_u8(_dst + i, vRes);
}
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, s8, 8,
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
for (size_t i = 0; i < w; i += 8)
{
internal::prefetch(_src + i);
__asm__ (
"vld1.32 {d4-d5}, [%[src1]] \n\t"
"vld1.32 {d6-d7}, [%[src2]] \n\t"
"vcvt.f32.s32 q4, q2 \n\t"
"vcvt.f32.s32 q5, q3 \n\t"
"vmul.f32 q6, q4, q0 \n\t"
"vmul.f32 q7, q5, q0 \n\t"
"vadd.f32 q8, q6, q1 \n\t"
"vadd.f32 q9, q7, q1 \n\t"
"vcvt.s32.f32 q10, q8 \n\t"
"vcvt.s32.f32 q11, q9 \n\t"
"vqmovn.s32 d24, q10 \n\t"
"vqmovn.s32 d25, q11 \n\t"
"vqmovn.s16 d26, q12 \n\t"
"vst1.8 {d26}, [%[dst]] \n\t"
: /*no output*/
: [src1] "r" (_src + i + 0),
[src2] "r" (_src + i + 4),
[dst] "r" (_dst + i),
"w" (vscale), "w" (vshift)
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
);
}
})
#else
// s32 -> s8 conversion: dst = saturate_cast<s8>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(s32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift);
        // back to s32, then narrow twice with signed saturation
        int16x4_t vnarrow_lo = vqmovn_s32(vcvtq_s32_f32(vf_lo));
        int16x4_t vnarrow_hi = vqmovn_s32(vcvtq_s32_f32(vf_hi));
        vst1_s8(_dst + i, vqmovn_s16(vcombine_s16(vnarrow_lo, vnarrow_hi)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s32 -> u16 conversion: dst = saturate_cast<u16>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7); vscale/vshift are
// pinned to q0/q1 for direct use inside the asm body.
CVTS_FUNC(s32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x s32, scale+shift in f32, convert back to s32, then
        // saturating-narrow to unsigned 16-bit (vqmovun) and store 8 x u16
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vcvt.s32.f32 q10, q8                                    \n\t"
            "vcvt.s32.f32 q11, q9                                    \n\t"
            "vqmovun.s32 d24, q10                                    \n\t"
            "vqmovun.s32 d25, q11                                    \n\t"
            "vst1.16 {d24-d25}, [%[dst]]                             \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
// s32 -> u16 conversion: dst = saturate_cast<u16>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(s32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift);
        // back to s32, then saturating-narrow to u16 and store all 8 lanes
        uint16x4_t vnarrow_lo = vqmovun_s32(vcvtq_s32_f32(vf_lo));
        uint16x4_t vnarrow_hi = vqmovun_s32(vcvtq_s32_f32(vf_hi));
        vst1q_u16(_dst + i, vcombine_u16(vnarrow_lo, vnarrow_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s32 -> s16 conversion: dst = saturate_cast<s16>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
// NOTE(review): the final store uses "vst1.8" while sibling blocks use
// "vst1.16" for 16-bit data; the store width (16 bytes) is the same and only
// the element size annotation differs -- verify intent against upstream.
CVTS_FUNC(s32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x s32, scale+shift in f32, convert back to s32, then
        // saturating-narrow to signed 16-bit and store 8 x s16
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vcvt.s32.f32 q10, q8                                    \n\t"
            "vcvt.s32.f32 q11, q9                                    \n\t"
            "vqmovn.s32 d24, q10                                     \n\t"
            "vqmovn.s32 d25, q11                                     \n\t"
            "vst1.8 {d24-d25}, [%[dst]]                              \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
// s32 -> s16 conversion: dst = saturate_cast<s16>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(s32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift);
        // back to s32, then saturating-narrow to s16 and store all 8 lanes
        int16x4_t vnarrow_lo = vqmovn_s32(vcvtq_s32_f32(vf_lo));
        int16x4_t vnarrow_hi = vqmovn_s32(vcvtq_s32_f32(vf_hi));
        vst1q_s16(_dst + i, vcombine_s16(vnarrow_lo, vnarrow_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// In-type s32 scale/shift: dst = saturate_cast<s32>(src * alpha + beta + 0.5).
// The computation is done in f32, so very large magnitudes lose precision.
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC1(s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x s32, scale+shift in f32, convert back and store 8 x s32
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vcvt.s32.f32 q10, q8                                    \n\t"
            "vcvt.s32.f32 q11, q9                                    \n\t"
            "vst1.32 {d20-d21}, [%[dst1]]                            \n\t"
            "vst1.32 {d22-d23}, [%[dst2]]                            \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
        );
    }
})
#else
// In-type s32 scale/shift: dst = src * alpha + beta computed in f32
// (with a +0.5 bias before the truncating conversion back to s32).
CVTS_FUNC1(s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float, apply the affine transform, convert back
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift);
        vst1q_s32(_dst + i,     vcvtq_s32_f32(vf_lo));
        vst1q_s32(_dst + i + 4, vcvtq_s32_f32(vf_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// s32 -> f32 conversion: dst = src * alpha + beta. No +0.5 bias here since
// the result stays in floating point (no truncating conversion follows).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(s32, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x s32, convert to f32, scale+shift, store 8 x f32
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vst1.32 {d16-d17}, [%[dst1]]                            \n\t"
            "vst1.32 {d18-d19}, [%[dst2]]                            \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// s32 -> f32 conversion: dst = src * alpha + beta. No rounding bias is
// needed because the destination type is floating point.
CVTS_FUNC(s32, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four s32 values
        int32x4_t vsrc_lo = vld1q_s32(_src + i);
        int32x4_t vsrc_hi = vld1q_s32(_src + i + 4);
        // convert to float, apply the affine transform, store as f32
        vst1q_f32(_dst + i,     vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_lo), vscale), vshift));
        vst1q_f32(_dst + i + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vsrc_hi), vscale), vshift));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> u8 conversion using 16.16 fixed point: alpha and beta are
// pre-multiplied by 2^16, the products are converted to u32, adjusted via a
// mask built with vbic against the constant 1<<16, then rounded down by 16
// bits with saturation (vqrshrn) and narrowed to u8.
// NOTE(review): the two vbic adjustments are asymmetric -- q6 holds the
// SECOND lane group before the shift is added, while q7 holds the FIRST lane
// group after the shift (mirrored by the intrinsics fallback below in the
// original file). Confirm against the upstream carotene sources before
// modifying.
CVTS_FUNC(f32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha));
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta));
    register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d6-d7}, [%[src1]]                              \n\t"
            "vld1.32 {d8-d9}, [%[src2]]                              \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vcvt.u32.f32 q9, q7                                     \n\t"
            "vcvt.u32.f32 q10, q8                                    \n\t"
            "vbic q11, q2, q6                                        \n\t"
            "vbic q12, q2, q7                                        \n\t"
            "vshr.u32 q13, q11, #16                                  \n\t"
            "vshr.u32 q14, q12, #16                                  \n\t"
            "vqsub.u32 q7, q9, q13                                   \n\t"
            "vqsub.u32 q8, q10, q14                                  \n\t"
            "vqrshrn.u32 d22, q7, #16                                \n\t"
            "vqrshrn.u32 d23, q8, #16                                \n\t"
            "vqmovn.u16 d30, q11                                     \n\t"
            "vst1.8 {d30}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift), "w" (vmask)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
        );
    }
})
#else
// f32 -> u8 conversion using 16.16 fixed point: alpha and beta are
// pre-multiplied by 2^16, the scaled values are converted to u32, adjusted
// via a bit-clear mask against the constant 1<<16, then rounded down by 16
// bits with saturation (vqrshrn_n_u32) and narrowed to u8.
// NOTE(review): the two mask computations are asymmetric -- vline1Mask is
// derived from vline2_f32 (second lane group, BEFORE the shift) while
// vline2Mask is derived from vline1Shifted_f32 (first lane group, AFTER the
// shift). This mirrors the inline-asm variant above, but looks suspicious;
// verify against the upstream carotene sources before changing.
CVTS_FUNC(f32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha));
    float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta));
    uint32x4_t vmask = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        // scale by alpha * 2^16, then add beta * 2^16
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift);
        float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift);
        // truncating conversion to unsigned 16.16 fixed point
        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32);
        // build correction masks from the raw float bit patterns
        uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32));
        uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32));
        vline1Mask = vshrq_n_u32(vline1Mask, 16);
        vline2Mask = vshrq_n_u32(vline2Mask, 16);
        // apply the correction with saturating subtraction
        vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask);
        vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask);
        // round-shift out the 16 fractional bits, then narrow to u8
        uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16);
        uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> s8 conversion: dst = saturate_cast<s8>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(f32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, convert to s32, then saturating-narrow
        // s32->s16 (into d14/d15 = q7) and s16->s8, storing 8 bytes
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q8, q6                                     \n\t"
            "vcvt.s32.f32 q9, q7                                     \n\t"
            "vqmovn.s32 d14, q8                                      \n\t"
            "vqmovn.s32 d15, q9                                      \n\t"
            "vqmovn.s16 d16, q7                                      \n\t"
            "vst1.8 {d16}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// f32 -> s8 conversion: dst = saturate_cast<s8>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(f32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift);
        // convert to s32, then narrow twice with signed saturation
        int16x4_t vnarrow_lo = vqmovn_s32(vcvtq_s32_f32(vf_lo));
        int16x4_t vnarrow_hi = vqmovn_s32(vcvtq_s32_f32(vf_hi));
        vst1_s8(_dst + i, vqmovn_s16(vcombine_s16(vnarrow_lo, vnarrow_hi)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> u16 conversion: dst = saturate_cast<u16>(src * alpha + beta + 0.5),
// going through an unsigned u32 conversion. Inline-asm variant for old
// 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(f32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, convert to u32, then saturating-narrow
        // to u16 and store 8 x u16
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.u32.f32 q8, q6                                     \n\t"
            "vcvt.u32.f32 q9, q7                                     \n\t"
            "vqmovn.u32 d8, q8                                       \n\t"
            "vqmovn.u32 d9, q9                                       \n\t"
            "vst1.16 {d8-d9}, [%[dst]]                               \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// f32 -> u16 conversion: dst = saturate_cast<u16>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->u32 conversion.
CVTS_FUNC(f32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift);
        // convert to u32, then saturating-narrow to u16 and store all 8 lanes
        uint16x4_t vnarrow_lo = vqmovn_u32(vcvtq_u32_f32(vf_lo));
        uint16x4_t vnarrow_hi = vqmovn_u32(vcvtq_u32_f32(vf_hi));
        vst1q_u16(_dst + i, vcombine_u16(vnarrow_lo, vnarrow_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> s16 conversion: dst = saturate_cast<s16>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(f32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, convert to s32, then saturating-narrow
        // to s16 and store 8 x s16
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q8, q6                                     \n\t"
            "vcvt.s32.f32 q9, q7                                     \n\t"
            "vqmovn.s32 d8, q8                                       \n\t"
            "vqmovn.s32 d9, q9                                       \n\t"
            "vst1.16 {d8-d9}, [%[dst]]                               \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// f32 -> s16 conversion: dst = saturate_cast<s16>(src * alpha + beta),
// with a +0.5 bias added before the truncating float->int conversion.
CVTS_FUNC(f32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift);
        // convert to s32, then saturating-narrow to s16 and store all 8 lanes
        int16x4_t vnarrow_lo = vqmovn_s32(vcvtq_s32_f32(vf_lo));
        int16x4_t vnarrow_hi = vqmovn_s32(vcvtq_s32_f32(vf_hi));
        vst1q_s16(_dst + i, vcombine_s16(vnarrow_lo, vnarrow_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> s32 conversion: dst = saturate_cast<s32>(src * alpha + beta + 0.5).
// Inline-asm variant for old 32-bit GCC 4.x (< 4.7).
CVTS_FUNC(f32, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, convert to s32 (reusing q4/q5) and
        // store 8 x s32
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q4, q6                                     \n\t"
            "vcvt.s32.f32 q5, q7                                     \n\t"
            "vst1.32 {d8-d9}, [%[dst1]]                              \n\t"
            "vst1.32 {d10-d11}, [%[dst2]]                            \n\t"
            : //no output
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
        );
    }
})
#else
// f32 -> s32 conversion: dst = src * alpha + beta, with a +0.5 bias added
// before the truncating float->int conversion.
CVTS_FUNC(f32, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values and apply the affine transform
        float32x4_t vf_lo = vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift);
        float32x4_t vf_hi = vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift);
        // truncating conversion back to s32 and store
        vst1q_s32(_dst + i,     vcvtq_s32_f32(vf_lo));
        vst1q_s32(_dst + i + 4, vcvtq_s32_f32(vf_hi));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// In-type f32 scale/shift: dst = src * alpha + beta. No +0.5 bias since no
// integer conversion follows. Inline-asm variant for old 32-bit GCC 4.x.
// NOTE(review): the clobber list includes d16-d19 although the highest
// register written here is q7 (d14/d15) -- harmless over-clobbering.
CVTS_FUNC1(f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load 8 x f32, scale+shift, store 8 x f32
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vst1.32 {d12-d13}, [%[dst1]]                            \n\t"
            "vst1.32 {d14-d15}, [%[dst2]]                            \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// In-type f32 scale/shift: dst = src * alpha + beta. No rounding bias is
// needed because no integer conversion is involved.
CVTS_FUNC1(f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // load two groups of four f32 values, transform, store
        vst1q_f32(_dst + i,     vaddq_f32(vmulq_f32(vld1q_f32(_src + i),     vscale), vshift));
        vst1q_f32(_dst + i + 4, vaddq_f32(vmulq_f32(vld1q_f32(_src + i + 4), vscale), vshift));
    }
})
#endif
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
namespace CAROTENE_NS {
// Reports whether convolution() below can handle the given configuration:
// NEON available, image at least 8 pixels wide, a 3x3 kernel, and a
// constant or replicate border mode.
bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
                            BORDER_MODE border)
{
    if (!isSupportedConfiguration() || size.width < 8)
        return false;
    const bool borderSupported = (border == BORDER_MODE_CONSTANT) ||
                                 (border == BORDER_MODE_REPLICATE);
    return borderSupported && (ksize.width == 3) && (ksize.height == 3);
}
#ifdef CAROTENE_NEON
namespace {

// Wrapper for vshrq_n_s32: the shift amount must be a compile-time
// immediate, so each possible shift gets its own instantiation; convolution()
// below selects one through a function-pointer table indexed by `scale`.
template <int shift>
int32x4_t vshrq_s32(int32x4_t value)
{
    return vshrq_n_s32(value, shift);
}

// Shift by 0 is not a valid vshrq_n_s32 immediate; pass the value through.
template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
    return value;
}

} // namespace

// Uniform signature so a run-time `scale` can pick the right instantiation.
typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
#endif
// 3x3 convolution of a u8 image with an s16 kernel and power-of-two
// normalization: dst(x, y) = saturate_cast<u8>(sum >> scale).
// The kernel is applied in flipped order (kernelBase[8] multiplies the
// top-left neighbour), i.e. a true convolution rather than a correlation.
// Only BORDER_MODE_CONSTANT and BORDER_MODE_REPLICATE are accepted -- see
// isConvolutionSupported(). `scale` must be in [0, 32] (it indexes the
// shift table below).
void convolution(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE border, u8 borderValue,
                 const Size2D & ksize, s16 * kernelBase, u32 scale)
{
    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
#ifdef CAROTENE_NEON
    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);
    const int32x4_t v_zero_s32 = vdupq_n_s32(0);

    // Sliding window of three 8-pixel column triples (top/mid/bottom rows);
    // tprev/tcurr/tnext hold the previous, current and next 8-pixel chunks.
    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;
    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    // vshrq_n_s32 needs an immediate shift, so pick the instantiation that
    // matches the run-time `scale` from this table (index 0 is a no-op).
    static const vshrq_s32_func vshrq_s32_a[33] =
    {
        vshrq_s32<0>,
        vshrq_s32<1>,
        vshrq_s32<2>,
        vshrq_s32<3>,
        vshrq_s32<4>,
        vshrq_s32<5>,
        vshrq_s32<6>,
        vshrq_s32<7>,
        vshrq_s32<8>,
        vshrq_s32<9>,
        vshrq_s32<10>,
        vshrq_s32<11>,
        vshrq_s32<12>,
        vshrq_s32<13>,
        vshrq_s32<14>,
        vshrq_s32<15>,
        vshrq_s32<16>,
        vshrq_s32<17>,
        vshrq_s32<18>,
        vshrq_s32<19>,
        vshrq_s32<20>,
        vshrq_s32<21>,
        vshrq_s32<22>,
        vshrq_s32<23>,
        vshrq_s32<24>,
        vshrq_s32<25>,
        vshrq_s32<26>,
        vshrq_s32<27>,
        vshrq_s32<28>,
        vshrq_s32<29>,
        vshrq_s32<30>,
        vshrq_s32<31>,
        vshrq_s32<32>
    };
    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // Row pointers for the rows above/at/below y. A NULL pointer marks
        // a constant-border row (outside the image); replicate mode clamps.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        // Scalar-tail state: previous/current/next column values per row.
        u8 prevx[3] = { 0, 0, 0 },
           currx[3] = { 0, 0, 0 },
           nextx[3] = { 0, 0, 0 };
        ptrdiff_t x = 0;
        // For the last rows, stop vector processing one chunk early so the
        // scalar tail below handles the right edge.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            // 8 pixels from each of the three rows; NULL row => border value
            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx[0] = prevx[1] = prevx[2] = borderValue;
                else
                {
                    prevx[0] = srow0 ? srow0[x4] : borderValue;
                    prevx[1] =         srow1[x4]              ;
                    prevx[2] = srow2 ? srow2[x4] : borderValue;
                }

                currx[0] = srow0 ? srow0[x3] : borderValue;
                currx[1] =         srow1[x3]              ;
                currx[2] = srow2 ? srow2[x3] : borderValue;
            }

            // make shift
            if (x)
            {
                tprev[0] = tcurr[0];
                tcurr[0] = tnext[0];

                tprev[1] = tcurr[1];
                tcurr[1] = tnext[1];

                tprev[2] = tcurr[2];
                tcurr[2] = tnext[2];
            }

            tnext[0] = x0;
            tnext[1] = x1;
            tnext[2] = x2;

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                {
                    // replicate the first pixel of each row across the chunk
                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
                }
                // first iteration only fills the pipeline; output starts at x=8
                continue;
            }

            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;

            // Top row: left/center/right neighbours hit kernel[8], [7], [6].
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[0], tcurr[0], 7);
                t1 = tcurr[0];
                t2 = vext_u8(tcurr[0], tnext[0], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
            }

            // Middle row: kernel[5], [4], [3].
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[1], tcurr[1], 7);
                t1 = tcurr[1];
                t2 = vext_u8(tcurr[1], tnext[1], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
            }

            // Bottom row: kernel[2], [1], [0].
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[2], tcurr[2], 7);
                t1 = tcurr[2];
                t2 = vext_u8(tcurr[2], tnext[2], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
            }

            // make scale
            v_dst0 = vshrq_s32_p(v_dst0);
            v_dst1 = vshrq_s32_p(v_dst1);

            // and add them: saturate s32 -> u16 -> u8 and store the 8 results
            // for the previous chunk (output lags input by one chunk)
            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
                                                          vqmovun_s32(v_dst1))));
        }

        // Scalar tail: redo the last columns (and the right edge) per pixel.
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                {
                    nextx[0] = borderValue;
                    nextx[1] = borderValue;
                    nextx[2] = borderValue;
                }
                else if (border == BORDER_MODE_REPLICATE)
                {
                    nextx[0] = srow0[x];
                    nextx[1] = srow1[x];
                    nextx[2] = srow2[x];
                }
            }
            else
            {
                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
                nextx[1] =         srow1[x + 1]              ;
                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
            }

            // accumulate the 3x3 neighbourhood with the flipped kernel
            s32 val = 0;
            for (s32 _y = 0; _y < 3; ++_y)
                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];

            drow[x] = internal::saturate_cast<u8>(val >> scale);

            // make shift
            prevx[0] = currx[0];
            currx[0] = nextx[0];

            prevx[1] = currx[1];
            currx[1] = nextx[1];

            prevx[2] = currx[2];
            currx[2] = nextx[2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
    (void)ksize;
    (void)kernelBase;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <limits>
namespace CAROTENE_NS {
// Counts non-zero u8 elements, returning INT32_MAX if the count overflows.
// Vector strategy: clamp each byte to {0,1} via vminq_u8 with 1 and sum the
// bytes for at most 255 iterations per block (so the u8 lanes cannot wrap),
// then widen and fold the block sum into the scalar total.
s32 countNonZero(const Size2D &_size,
                 const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Collapse to a single logical row when the rows are contiguous.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw16 = size.width & ~15u;  // width rounded down to multiple of 16

    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u8* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

        // 16 lanes x 255 iterations: largest block whose per-lane u8 sums
        // cannot overflow. NOTE(review): the macro is never #undef'ed.
#define COUNTNONZERO8U_BLOCK_SIZE (16*255)
        uint8x16_t vc1 = vmovq_n_u8(1);
        for (; i < roiw16;)
        {
            size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
            uint8x16_t vs = vmovq_n_u8(0);

            for (; i <= lim; i+= 16)
            {
                internal::prefetch(src + i);
                uint8x16_t vln = vld1q_u8(src + i);
                uint8x16_t vnz = vminq_u8(vln, vc1);  // 1 where element != 0
                vs = vaddq_u8(vs, vnz);
            }

            // widen 16 x u8 partial sums -> 4 x u32 -> 2 x u32
            uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
            s32 s[2];
            // stored through a u32 pointer; a negative s32 afterwards means
            // the unsigned sum crossed 2^31
            vst1_u32((u32*)s, vs2);

            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...
            {
                return 0x7fFFffFF;
            }
            result += (s[0] += s[1]);
            if (s[0] < 0 || result < 0)
            {
                return 0x7fFFffFF;
            }
        }
        // scalar tail for the remaining (width % 16) elements
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }

    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
// Counts non-zero u16 elements, returning INT32_MAX if the count overflows.
// Same blocked strategy as the u8 overload: clamp each lane to {0,1} and
// accumulate in u16 lanes for at most 65535 iterations per block so the
// lanes cannot wrap, then widen and fold into the scalar total.
s32 countNonZero(const Size2D &_size,
                 const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Collapse to a single logical row when the rows are contiguous.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;  // width rounded down to multiple of 8

    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u16* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

        // 8 lanes x (2^16 - 1) iterations: largest block whose per-lane u16
        // sums cannot overflow. NOTE(review): the macro is never #undef'ed.
#define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
        uint16x8_t vc1 = vmovq_n_u16(1);
        for (; i < roiw8;)
        {
            size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
            uint16x8_t vs = vmovq_n_u16(0);

            for (; i <= lim; i+= 8)
            {
                internal::prefetch(src + i);
                uint16x8_t vln = vld1q_u16(src + i);
                uint16x8_t vnz = vminq_u16(vln, vc1);  // 1 where element != 0
                vs = vaddq_u16(vs, vnz);
            }

            // widen 8 x u16 partial sums -> 4 x u32 -> 2 x u32
            uint32x4_t vs4 = vpaddlq_u16(vs);
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
            s32 s[2];
            // stored through a u32 pointer; a negative s32 afterwards means
            // the unsigned sum crossed 2^31
            vst1_u32((u32*)s, vs2);

            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
            {
                return 0x7fFFffFF;
            }
            result += (s[0] += s[1]);
            if (s[0] < 0 || result < 0)
            {
                return 0x7fFFffFF;
            }
        }
        // scalar tail for the remaining (width % 8) elements
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }

    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
// Counts non-zero s32 elements, returning INT32_MAX if the count overflows.
// The row is reinterpreted as u32 so the non-zero test can be done with an
// unsigned min against 1; the per-lane counts are accumulated with
// saturating adds (vqaddq_u32), so they clamp rather than wrap.
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Collapse to a single logical row when the rows are contiguous.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;  // width rounded down to multiple of 4

    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        // non-zero test works on the bit pattern, so u32 view is equivalent
        const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k);
        u32 i = 0;

        uint32x4_t vc1 = vmovq_n_u32(1);
        uint32x4_t vs = vmovq_n_u32(0);
        for (; i < roiw4; i += 4 )
        {
            internal::prefetch(src + i);
            uint32x4_t vln = vld1q_u32(src + i);
            uint32x4_t vnz = vminq_u32(vln, vc1);  // 1 where element != 0
            vs = vqaddq_u32(vs, vnz);
        }

        // fold 4 lanes to 2 with a saturating add
        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
        s32 s[2];
        // stored through a u32 pointer; a negative s32 afterwards means the
        // unsigned sum crossed 2^31
        vst1_u32((u32*)s, vs2);

        if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)
        {
            return 0x7fFFffFF;
        }
        // scalar tail for the remaining (width % 4) elements
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }

    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/// Count non-zero f32 elements in a 2D region.
/// @param _size     region size (width/height in elements)
/// @param srcBase   pointer to the first row
/// @param srcStride row stride in BYTES
/// @return number of non-zero elements, saturated to INT32_MAX on overflow
s32 countNonZero(const Size2D &_size,
                 const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // srcStride is in bytes: a fully packed f32 row has
    // stride == width * sizeof(f32) (the previous comparison against the
    // element count could never match a valid packed layout).
    if (srcStride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const f32* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;
        float32x4_t vc0 = vmovq_n_f32(0);
        int32x4_t vs = vmovq_n_s32(0);
        for (; i < roiw4; i += 4 )
        {
            internal::prefetch(src + i);
            float32x4_t vln = vld1q_f32(src + i);
            // Lane becomes all-ones (-1 as s32) when the element is non-zero.
            int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
            // Accumulate the -1s with saturation; negated into a count below.
            vs = vqaddq_s32(vs, vnz);
        }
        int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
        int s[2];
        vst1_s32(s, vs2);
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        // Scalar tail: values of magnitude below FLT_MIN (subnormals) count
        // as zero here — presumably mirroring NEON flush-to-zero behaviour
        // of the vector path.
        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;
        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
/// Count non-zero f64 elements in a 2D region.
/// Each double is inspected as two u32 halves: a double is non-zero iff
/// either half (after masking off the sign bit) is non-zero.
/// @param _size     region size (width/height in elements)
/// @param srcBase   pointer to the first row
/// @param srcStride row stride in BYTES
/// @return number of non-zero elements, saturated to INT32_MAX on overflow
s32 countNonZero(const Size2D &_size,
                 const f64 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // NOTE(review): srcStride is a byte stride but is compared to the
    // element count; a fully packed f64 row has stride == width * sizeof(f64),
    // so this single-row fast path looks like it can never trigger on a valid
    // packed layout — confirm intended.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;
    size_t roiw4 = size.width & ~3u;
    size_t roiw2 = size.width & ~1u;
    // Clearing the sign bit makes -0.0 look like +0.0 below; any other bit
    // pattern (including denormals) is counted as non-zero.
    uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
    uint32x4_t vc0 = vmovq_n_u32(0);
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const f64* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;
        // Four independent accumulators, each gathering -1 per non-zero double.
        int32x2_t vs1 = vmov_n_s32(0);
        int32x2_t vs2 = vmov_n_s32(0);
        int32x2_t vs3 = vmov_n_s32(0);
        int32x2_t vs4 = vmov_n_s32(0);
        // Main loop: 8 doubles per iteration. vceq/vmvn marks non-zero u32
        // halves; vpmax folds the two halves of each double into one
        // all-ones (-1) lane per non-zero double.
        for (; i < roiw8; i += 8 )
        {
            internal::prefetch(src + i + 6);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
            uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
            uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint64x2_t vm3 = vandq_u64(vln3, vmask1);
            uint64x2_t vm4 = vandq_u64(vln4, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
            uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            uint32x4_t vlx3 = vmvnq_u32(vequ3);
            uint32x4_t vlx4 = vmvnq_u32(vequ4);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
            int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            vs3 = vqadd_s32(vs3, vnz3);
            vs4 = vqadd_s32(vs4, vnz4);
        }
        // Tail of 4 doubles, same scheme.
        if (i < roiw4)
        {
            internal::prefetch(src + i + 2);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            i += 4;
        }
        // Tail of 2 doubles.
        if (i < roiw2)
        {
            internal::prefetch(src + i);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            vs1 = vqadd_s32(vs1, vnz1);
            i += 2;
        }
        // Merge the four accumulators (saturating) and negate the summed
        // -1s into a positive count.
        vs1 = vqadd_s32(vs1, vs2);
        vs3 = vqadd_s32(vs3, vs4);
        vs1 = vqadd_s32(vs1, vs3);
        int32x2_t vsneg = vqneg_s32(vs1);
        s32 s[2];
        vst1_s32(s, vsneg);
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        // NOTE(review): this scalar tail treats subnormals (|x| < DBL_MIN)
        // as zero, while the vector path above counts them as non-zero —
        // confirm which behaviour is intended.
        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min())?0:1;
        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>
namespace CAROTENE_NS {
namespace {
#ifdef CAROTENE_NEON
// Round each f32 lane to the nearest integer, halfway cases away from zero:
// ORs the lane's sign bit onto 0.5f so +0.5/-0.5 is added before a
// subsequent truncating float->int conversion.
inline float32x4_t vroundq(const float32x4_t& v)
{
    // Build the sign-bit mask in unsigned arithmetic: `1 << 31` on a signed
    // int shifts into the sign bit, which is UB in C++11 and only
    // implementation-defined before C++20.
    const int32x4_t signMask = vreinterpretq_s32_u32(vdupq_n_u32(1u << 31)),
                    half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
    return vaddq_f32(v, v_addition);
}
// Element-wise scaled division with saturation for packed integer vectors:
// per lane, saturate_cast<T>(round(scale * v1 / v2)).
// The generic version widens each half to 32-bit lanes, recurses down to the
// 32-bit specialisations below, then narrows back with saturation (vqmovn).
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                                            internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                                            internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: divide via f32 reciprocal approximation; vroundq adds
// +/-0.5 so the truncating conversion rounds to nearest, away from zero.
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
// 64-bit (2-lane) variant of vroundq: round to nearest, halfway away from
// zero, by adding +/-0.5 with the lane's sign before truncation.
inline float32x2_t vround(const float32x2_t& v)
{
    // Sign-bit mask built in unsigned arithmetic: `1 << 31` on a signed int
    // is UB in C++11 / implementation-defined before C++20.
    const int32x2_t signMask = vreinterpret_s32_u32(vdup_n_u32(1u << 31)),
                    half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
    float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
    return vadd_f32(v, v_addition);
}
// 64-bit-register counterpart of divSaturateQ: per lane,
// saturate_cast<T>(round(scale * v1 / v2)). The generic version widens to
// 32-bit lanes and narrows back with saturation.
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases: f32 reciprocal division, rounded to nearest via vround.
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
// Element-wise scaled division with wrap-around (modular) narrowing:
// per lane, (T)(scale * v1 / v2) truncated. Same recursion scheme as
// divSaturateQ, but narrows with vmovn (wrap) and skips the rounding step.
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                                       internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                                       internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: f32 reciprocal division, truncating conversion.
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
// 64-bit-register counterpart of divWrapQ: per lane, truncated
// (T)(scale * v1 / v2) with wrap-around narrowing.
template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases: f32 reciprocal division, truncating conversion.
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
// Overload sets dispatching to the NEON "test bits" intrinsics: each lane
// becomes all-ones when (v0 & v1) != 0, else all-zeros. Callers use
// vtstq(v, v) / vtst(v, v) to build a "lane is non-zero" mask. The signed
// variants reinterpret the unsigned mask so the result matches the operand
// type and can be ANDed with it directly.
inline uint8x16_t vtstq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t  vtstq(const int8x16_t   & v0, const int8x16_t   & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
inline int16x8_t  vtstq(const int16x8_t   & v0, const int16x8_t   & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t  vtstq(const int32x4_t   & v0, const int32x4_t   & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }
// 64-bit-register variants.
inline uint8x8_t  vtst(const uint8x8_t  & v0, const uint8x8_t  & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t   vtst(const int8x8_t   & v0, const int8x8_t   & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
inline int16x4_t  vtst(const int16x4_t  & v0, const int16x4_t  & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t  vtst(const int32x2_t  & v0, const int32x2_t  & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
#endif
// Element-wise scaled division for integer element types:
//   dst = cpolicy(scale * src0 / src1), with dst = 0 wherever src1 == 0.
// cpolicy selects saturating (CONVERT_POLICY_SATURATE) or wrap-around
// conversion of the quotient back to T. Strides are in bytes.
template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
    static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif
    // Fast path: when scale == 0, or |scale| is so small that even the
    // largest numerator divided by 1 still rounds to zero, every output
    // element is 0 — just clear the destination rows.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         (scale * std::numeric_limits<T>::max()) < 1.0f &&
         (scale * std::numeric_limits<T>::max()) > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }
    // Number of positions at which a full 128-bit (resp. 64-bit) load fits.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;
        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);
                // vtstq(x, x) is all-ones where the divisor is non-zero;
                // ANDing zeroes out the divide-by-zero lanes.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);
                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
            }
            // Scalar tail with the same zero-divisor convention.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);
                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap to T via s32.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
#ifdef CAROTENE_NEON
// Scaled reciprocal (scale / x) for packed integers with saturating
// narrowing. The generic version widens to 32-bit lanes and recurses to the
// specialisations below; note the result is truncated (no rounding step,
// unlike divSaturateQ).
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: f32 reciprocal approximation scaled by 'scale'.
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
// 64-bit-register counterpart.
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
    return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
// Scaled reciprocal (scale / x) for packed integers with wrap-around
// narrowing (vmovn instead of vqmovn); the 32-bit base cases are identical
// to the saturating variants since conversion saturation only matters when
// narrowing back to the element type.
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
// 64-bit-register counterpart.
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
    return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
#endif
// Element-wise scaled reciprocal for integer element types:
//   dst = cpolicy(scale / src1), with dst = 0 wherever src1 == 0.
// cpolicy selects saturating or wrap-around conversion. Strides in bytes.
template <typename T>
void recip(const Size2D &size,
           const T * src1Base, ptrdiff_t src1Stride,
           T * dstBase, ptrdiff_t dstStride,
           f32 scale,
           CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
    static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif
    // Fast path: |scale| < 1 means scale / x truncates to 0 for every
    // non-zero integer x (and zero divisors also produce 0), so just clear
    // the destination rows.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         scale <  1.0f &&
         scale > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }
    // Number of positions at which a full 128-bit (resp. 64-bit) load fits.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;
        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);
                // vtstq(x, x) is all-ones where the divisor is non-zero;
                // ANDing zeroes out the divide-by-zero lanes.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);
                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
            }
            // Scalar tail with the same zero-divisor convention.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);
                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap to T via s32.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
}
// Public entry points for integer element-wise division; each forwards to
// the internal div<T> template above with an explicit template argument
// (explicit, so overload resolution cannot pick these wrappers recursively).
void div(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise f32 division: dst = src0 * scale / src1.
// Unlike the integer overloads, zero divisors are NOT special-cased: lanes
// divide by the NEON reciprocal approximation, so behaviour for src1 == 0
// follows that approximation. Strides are in bytes.
void div(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride,
         f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 makes every quotient 0: just clear the destination.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }
    // Positions where a full 4-lane (resp. 2-lane) load still fits.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
    // Specialised loop for scale ~= 1 skips the extra multiply per vector.
    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;
            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                // Divide via refined reciprocal approximation.
                vst1q_f32(dst + j, vmulq_f32(v_src0, internal::vrecpq_f32(v_src1)));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, vmul_f32(v_src0, internal::vrecp_f32(v_src1)));
            }
            // Scalar tail uses an exact division.
            for (; j < size.width; j++)
            {
                dst[j] = src0[j] / src1[j];
            }
        }
    }
    else
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;
            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                vst1q_f32(dst + j, vmulq_f32(vmulq_n_f32(v_src0, scale),
                                             internal::vrecpq_f32(v_src1)));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, vmul_f32(vmul_n_f32(v_src0, scale),
                                           internal::vrecp_f32(v_src1)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src0[j] * scale / src1[j];
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
// Public entry points for integer element-wise reciprocal (scale / x);
// each forwards to the internal recip<T> template above.
void reciprocal(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
                const s8 * srcBase, ptrdiff_t srcStride,
                s8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise f32 reciprocal: dst = scale / src.
// Zero inputs are not special-cased; lanes use the NEON reciprocal
// approximation. Strides are in bytes.
void reciprocal(const Size2D &size,
                const f32 * srcBase, ptrdiff_t srcStride,
                f32 * dstBase, ptrdiff_t dstStride,
                f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 makes every output 0: just clear the destination.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }
    // Positions where a full 4-lane (resp. 2-lane) load still fits.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
    // Specialised loop for scale ~= 1 skips the extra multiply per vector.
    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;
            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                vst1q_f32(dst + j, internal::vrecpq_f32(v_src1));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, internal::vrecp_f32(v_src1));
            }
            // Scalar tail uses an exact division.
            for (; j < size.width; j++)
            {
                dst[j] = 1.0f / src1[j];
            }
        }
    }
    else
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;
            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                vst1q_f32(dst + j, vmulq_n_f32(internal::vrecpq_f32(v_src1), scale));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, vmul_n_f32(internal::vrecp_f32(v_src1), scale));
            }
            for (; j < size.width; j++)
            {
                dst[j] = scale / src1[j];
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
/// Dot product of two u8 images: sum over all pixels of src0[i] * src1[i],
/// accumulated exactly in integer arithmetic and returned as f64.
/// Strides are in bytes; rows are processed as one run when both images are
/// packed (stride == width, valid for 1-byte elements).
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
        size_t i = 0;
        // 64-bit accumulator for the whole row; u32 lanes are flushed into
        // it at every block boundary before they can overflow.
        uint64x2_t ws = vmovq_n_u64(0);
        while(i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;
            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);
            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);
                // Widening multiplies: 8-bit * 8-bit -> 16-bit products,
                // pairwise-accumulated into the u32 lanes.
                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));
                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }
            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }
        // One 8-element vector tail.
        if(i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);
            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }
        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);
        // Scalar tail for the last width % 8 elements.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
/// Dot product of two s8 images: sum over all pixels of src0[i] * src1[i],
/// accumulated exactly in integer arithmetic and returned as f64.
/// Strides are in bytes; rows are processed as one run when both images are
/// packed (stride == width, valid for 1-byte elements).
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
        size_t i = 0;
        // 64-bit accumulator for the whole row; s32 lanes are flushed into
        // it at every block boundary before they can overflow.
        int64x2_t ws = vmovq_n_s64(0);
        while(i + 16 <= size.width)
        {
            // Use the signed block size computed above (the original reused
            // the u8 overload's smaller DOT_UINT_BLOCKSIZE and left
            // DOT_INT_BLOCKSIZE unused).
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;
            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);
            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);
                // Widening multiplies: 8-bit * 8-bit -> 16-bit products,
                // pairwise-accumulated into the s32 lanes.
                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));
                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }
            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }
        // One 8-element vector tail.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);
            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }
        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);
        // Scalar tail for the last width % 8 elements.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Both inputs dense? Then fold the image into one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * row0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * row1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t idx = 0;
        while (idx + 4 <= size.width)
        {
            // Flush the f32 accumulator into the f64 total at least every
            // DOT_FLOAT_BLOCKSIZE elements to bound rounding error.
            size_t blockEnd = std::min(idx + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t vacc = vdupq_n_f32(0.0f);

            for ( ; idx <= blockEnd; idx += 4 )
            {
                internal::prefetch(row0 + idx);
                internal::prefetch(row1 + idx);
                vacc = vmlaq_f32(vacc, vld1q_f32(row0 + idx), vld1q_f32(row1 + idx));
            }

            // Horizontal reduction of the 4 accumulator lanes.
            float32x2_t vpair = vpadd_f32(vget_low_f32(vacc), vget_high_f32(vacc));
            result += vget_lane_f32(vpair, 0) + vget_lane_f32(vpair, 1);
        }

        if (idx + 2 <= size.width)
        {
            float32x2_t vprod = vmul_f32(vld1_f32(row0 + idx), vld1_f32(row1 + idx));
            result += vget_lane_f32(vprod, 0) + vget_lane_f32(vprod, 1);
            idx += 2;
        }

        // Scalar tail (at most one element here).
        for ( ; idx < size.width; ++idx)
            result += row0[idx] * row1[idx];
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
} // namespace CAROTENE_NS
// This file is needed for compilation on some platforms e.g. with XCode generator
// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
Below is the original copyright and the references */
/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
*Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
*Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
*Neither the name of the University of Cambridge nor the names of
its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
The references are:
* Machine learning for high-speed corner detection,
E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace
{
void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
{
    // Fills pixel[0..15] with the linearized offsets of the 16 pixels on the
    // Bresenham circle of radius 3, stored clockwise starting at (0, +3 rows).
    static const ptrdiff_t circle[16][2] =
    {
        { 0,  3}, { 1,  3}, { 2,  2}, { 3,  1},
        { 3,  0}, { 3, -1}, { 2, -2}, { 1, -3},
        { 0, -3}, {-1, -3}, {-2, -2}, {-3, -1},
        {-3,  0}, {-3,  1}, {-2,  2}, {-1,  3}
    };
    for (size_t k = 0; k < 16; ++k)
        pixel[k] = circle[k][0] + row_stride * circle[k][1];
}
// Computes the FAST corner score for the pixel at ptr using the circle
// offsets in pixel[]: the strongest arc response over all 16 window starts,
// evaluated with NEON by sliding 9-pixel min/max windows across the
// center-minus-circle differences.
u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
{
// N = 16 + K + 1 wraps the circle so every start position has a full
// 9-pixel arc available without modular indexing.
const s32 K = 8, N = 16 + K + 1;
s32 k, v = ptr[0];
// d[k] = center value minus circle pixel k; the size is rounded up to a
// multiple of 8 so the 8-lane loads below never read past the array.
s16 d[(N + 7) & ~7];
for( k = 0; k < N; k++ )
d[k] = (s16)(v - ptr[pixel[k]]);
// q0 accumulates max-over-arcs of (min over each arc), q1 the min of
// (max over each arc); seeded outside the reachable difference range.
int16x8_t q0 = vdupq_n_s16((s16)(-1000));
int16x8_t q1 = vdupq_n_s16((s16)(1000));
// The 25 differences viewed as four overlapping 8-lane vectors.
int16x8_t d0_7 = vld1q_s16(d + 0);
int16x8_t d8_15 = vld1q_s16(d + 8);
int16x8_t d16_23 = vld1q_s16(d + 16);
int16x8_t d24 = vld1q_s16(d + 24);
//k == 0
// Arcs starting at offsets 0..7: each vextq shift exposes the next element
// of every lane's sliding window; ak0/bk0 hold the running min/max of
// window elements 1..8 (element 0 / 9 are folded in afterwards).
int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
int16x8_t ak0 = vminq_s16(v0k0, v1k0);
int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 3);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 4);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 5);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 6);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 7);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
ak0 = vminq_s16(ak0, d8_15);
bk0 = vmaxq_s16(bk0, d8_15);
// Complete the 9-element windows with the first element (d0_7) and with
// the element one past the window (shift by 1), updating q0/q1.
q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));
v1k0 = vextq_s16(d8_15, d16_23, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));
//k == 8
// Same pattern for arcs starting at offsets 8..15.
int16x8_t v0k8 = v1k0;
int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
int16x8_t ak8 = vminq_s16(v0k8, v1k8);
int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 3);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 4);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 5);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 6);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 7);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
ak8 = vminq_s16(ak8, d16_23);
bk8 = vmaxq_s16(bk8, d16_23);
q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));
v1k8 = vextq_s16(d16_23, d24, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));
//fin
// Score is max(q0, -q1): the stronger of the "darker arc" and "brighter
// arc" responses; then reduce the 8 lanes horizontally to one maximum.
int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
int32x4_t q2w = vmovl_s16(q2);
int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));
// Minus 1 so that a strict ">" threshold test at this score still passes.
return (u8)(vget_lane_s32(q8, 0) - 1);
}
} //namespace
#endif
// FAST-9 corner detector (Rosten & Drummond). Scans the image with a NEON
// segment test, optionally computes per-corner scores and applies 3x3
// non-maximum suppression over a 3-row ring buffer before reporting
// keypoints through the KeypointStore callback.
void FAST(const Size2D &size,
u8 *srcBase, ptrdiff_t srcStride,
KeypointStore *keypoints,
u8 threshold, bool nonmax_suppression)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
//keypoints.clear();
// A corner needs more than K consecutive circle pixels all brighter or all
// darker than the center; pixel[] holds the 16 circle offsets, wrapped to
// length N so runs can cross the start of the circle.
const s32 K = 8, N = 16 + K + 1;
ptrdiff_t i, j, k, pixel[N];
makeOffsets(pixel, srcStride);
for(k = 16; k < N; k++)
pixel[k] = pixel[k - 16];
// delta (0x80) biases u8 values into s8 range so signed compares work.
uint8x16_t delta = vdupq_n_u8(128);
uint8x16_t t = vdupq_n_u8(threshold);
uint8x16_t K16 = vdupq_n_u8((u8)K);
// threshold_tab[d + 255]: 1 = much darker than center, 2 = much brighter,
// 0 = within threshold. Used by the scalar fallback path below.
u8 threshold_tab[512];
for( i = -255; i <= 255; i++ )
threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
// Ring buffers over three rows: buf[x] holds per-pixel corner scores,
// cpbuf[x] the corner x-positions (slot -1 stores the per-row count,
// hence the "+ 1" when carving cpbuf out of _buf).
std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128);
u8* buf[3];
buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
ptrdiff_t* cpbuf[3];
cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
cpbuf[1] = cpbuf[0] + size.width + 1;
cpbuf[2] = cpbuf[1] + size.width + 1;
memset(buf[0], 0, size.width*3);
// Rows 0-2 and the last two rows are skipped: the radius-3 circle must fit.
for(i = 3; i < (ptrdiff_t)size.height-2; i++)
{
const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
u8* curr = buf[(i - 3)%3];
ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
memset(curr, 0, size.width);
ptrdiff_t ncorners = 0;
if( i < (ptrdiff_t)size.height - 3 )
{
j = 3;
// Vectorized pass: test 16 candidate pixels at once.
for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
{
internal::prefetch(ptr);
internal::prefetch(ptr + pixel[0]);
internal::prefetch(ptr + pixel[2]);
uint8x16_t v0 = vld1q_u8(ptr);
// v1/v2: center-threshold and center+threshold, bias-shifted to s8.
int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));
// Quick rejection using the four compass points (offsets 0, 4, 8, 12):
// a 9-run must include two adjacent compass points that are both
// brighter (m0) or both darker (m1) than the center band.
int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
m0 = vorrq_u8(m0, m1);
u64 mask[2];
vst1q_u64(mask, vreinterpretq_u64_u8(m0));
if( mask[0] == 0 )
{
// Only the upper 8 candidates survived: step back so they are
// re-examined as the lower half of the next 16-pixel block.
if (mask[1] != 0)
{
j -= 8;
ptr -= 8;
}
continue;
}
// Full test: c0/c1 count consecutive brighter/darker circle pixels
// per lane (reset to 0 on mismatch); max0/max1 track the longest run.
uint8x16_t c0 = vmovq_n_u8(0);
uint8x16_t c1 = vmovq_n_u8(0);
uint8x16_t max0 = vmovq_n_u8(0);
uint8x16_t max1 = vmovq_n_u8(0);
for( k = 0; k < N; k++ )
{
int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
m0 = vcgtq_s8(x, v2);
m1 = vcgtq_s8(v1, x);
// c - m adds 1 where the mask is all-ones; AND clears broken runs.
c0 = vandq_u8(vsubq_u8(c0, m0), m0);
c1 = vandq_u8(vsubq_u8(c1, m1), m1);
max0 = vmaxq_u8(max0, c0);
max1 = vmaxq_u8(max1, c1);
}
max0 = vmaxq_u8(max0, max1);
u8 m[16];
// Lanes whose longest run exceeds K are corners.
vst1q_u8(m, vcgtq_u8(max0, K16));
for( k = 0; k < 16; ++k )
if(m[k])
{
cornerpos[ncorners++] = j+k;
if(nonmax_suppression)
curr[j+k] = cornerScore(ptr+k, pixel);
}
}
// Scalar fallback for the remaining (< 16 + border) columns, using the
// lookup table for fast bright/dark classification.
for( ; j < (s32)size.width - 3; j++, ptr++ )
{
s32 v = ptr[0];
const u8* tab = &threshold_tab[0] - v + 255;
// d accumulates which polarity (1 = darker, 2 = brighter) is still
// possible after checking opposite pairs of circle pixels.
s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
// Darker arc still possible: look for a run of more than K pixels.
if( d & 1 )
{
s32 vt = v - threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x < vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
// Brighter arc still possible.
if( d & 2 )
{
s32 vt = v + threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x > vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
}
}
// Stash this row's corner count one slot before the positions.
cornerpos[-1] = ncorners;
// Output lags one row so each candidate has a complete 3x3 score
// neighborhood (pprev / prev / curr) for non-maximum suppression.
if( i == 3 )
continue;
const u8* prev = buf[(i - 4 + 3)%3];
const u8* pprev = buf[(i - 5 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3];
ncorners = cornerpos[-1];
for( k = 0; k < ncorners; k++ )
{
j = cornerpos[k];
s32 score = prev[j];
if( !nonmax_suppression ||
(score > prev[j+1] && score > prev[j-1] &&
score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
{
// i-1 because the reported corner comes from the previous row.
keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score);
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)keypoints;
(void)threshold;
(void)nonmax_suppression;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Scans src[j0..j1) of row i and appends the (x, y) coordinates of every
// element equal to minVal / maxVal into the respective location arrays.
// Counts advance by 2 (one x slot, one y slot) per hit and keep growing past
// the capacity so the caller can detect truncation; stores are only performed
// while within capacity.
template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    for (size_t col = j0; col < j1; ++col)
    {
        const T v = src[col];
        const bool hitMax = (v == maxVal);
        const bool hitMin = (v == minVal);

        if (hitMax)
        {
            if (maxLocCount < maxLocCapacity)
            {
                maxLocPtr[maxLocCount]     = col;
                maxLocPtr[maxLocCount + 1] = i;
            }
            maxLocCount += 2;
        }
        if (hitMin)
        {
            if (minLocCount < minLocCapacity)
            {
                minLocPtr[minLocCount]     = col;
                minLocPtr[minLocCount + 1] = i;
            }
            minLocCount += 2;
        }
    }
}
} // namespace
#endif
// Collects the coordinates of all pixels equal to minVal / maxVal. NEON path
// compares 16 (then 8) pixels at a time and calls the scalar process() only
// for blocks that contain at least one hit. Counts are doubled internally
// (x, y slot pairs) and halved back before returning; process() lets them
// exceed capacity so callers can detect truncation.
void fillMinMaxLocs(const Size2D & size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Last column where a full 16- / 8-pixel vector load still fits in the row.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);
uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);
// Per-lane hit masks are stored here so 64-bit halves can be tested cheaply.
u64 mask[2] = { 0ul };
// Switch counts/capacities to slot units (2 slots per recorded location).
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
// Lane is all-ones if the pixel equals either extremum.
uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);
vst1q_u8((u8 *)&mask[0], v_mask);
// Only rescan halves that actually contain a hit.
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], v_mask);
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Scalar tail for the remaining (< 8) pixels.
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Back from slot units to location-pair counts.
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
// u16 variant: collects coordinates of pixels equal to minVal / maxVal.
// Compares 16 (then 8) elements per step; 16-bit hit masks are narrowed to
// one byte per lane before being tested as two 64-bit words. Counts use
// doubled (x, y) slot units internally, halved back before returning.
void fillMinMaxLocs(const Size2D & size,
const u16 * srcBase, ptrdiff_t srcStride,
u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Last column where a full 16- / 8-element vector step still fits.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
v_minval8 = vdupq_n_u16(minVal);
u64 mask[2] = { 0ul };
// Slot units: two entries (x, y) per recorded location.
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
// Lane is all-ones if the element equals either extremum.
uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));
// Narrow both masks to one byte per lane for the scalar tests below.
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
uint16x8_t v_src = vld1q_u16(src + j);
uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Scalar tail for the remaining (< 8) elements.
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Back from slot units to location-pair counts.
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
// s16 variant: collects coordinates of pixels equal to minVal / maxVal.
// Structure mirrors the u16 overload: 16- then 8-element NEON compares,
// masks narrowed to bytes, scalar process() only for blocks with hits.
// Counts use doubled (x, y) slot units internally.
void fillMinMaxLocs(const Size2D & size,
const s16 * srcBase, ptrdiff_t srcStride,
s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Last column where a full 16- / 8-element vector step still fits.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
v_minval8 = vdupq_n_s16(minVal);
u64 mask[2] = { 0ul };
// Slot units: two entries (x, y) per recorded location.
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
// Lane is all-ones if the element equals either extremum.
uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));
// Narrow both masks to one byte per lane for the scalar tests below.
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
int16x8_t v_src = vld1q_s16(src + j);
uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Scalar tail for the remaining (< 8) elements.
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Back from slot units to location-pair counts.
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
// s32 variant: collects coordinates of pixels equal to minVal / maxVal.
// Compares 8 elements per NEON step and falls back to the scalar process()
// only for blocks that contain at least one hit. Counts are kept in doubled
// (x, y) slot units internally and halved back before returning.
void fillMinMaxLocs(const Size2D & size,
                    const s32 * srcBase, ptrdiff_t srcStride,
                    s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Last column where a full 8-element vector step still fits in the row.
    size_t vecWidth = size.width >= 7 ? size.width - 7 : 0;
    int32x4_t v_max = vdupq_n_s32(maxVal),
              v_min = vdupq_n_s32(minVal);
    u64 hitBits = 0ul;

    // Slot units: two entries (x, y) per recorded location.
    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const s32 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        for ( ; col < vecWidth; col += 8)
        {
            internal::prefetch(src + col);
            int32x4_t chunk0 = vld1q_s32(src + col);
            int32x4_t chunk1 = vld1q_s32(src + col + 4);
            // Lane is all-ones if the element equals either extremum.
            uint32x4_t hit0 = vorrq_u32(vceqq_s32(chunk0, v_max), vceqq_s32(chunk0, v_min));
            uint32x4_t hit1 = vorrq_u32(vceqq_s32(chunk1, v_max), vceqq_s32(chunk1, v_min));
            // Narrow the two 32-bit masks down to one byte per lane.
            vst1_u8((u8 *)&hitBits, vmovn_u16(vcombine_u16(vmovn_u32(hit0), vmovn_u32(hit1))));
            if (hitBits)
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        // Scalar tail for the remaining (< 8) elements.
        process(src, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // Back from slot units to location-pair counts.
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
// u32 variant: collects coordinates of pixels equal to minVal / maxVal.
// Structure mirrors the s32 overload: 8 elements per NEON step, 32-bit hit
// masks narrowed to one byte per lane, scalar process() only for blocks with
// hits. Counts use doubled (x, y) slot units internally.
void fillMinMaxLocs(const Size2D & size,
const u32 * srcBase, ptrdiff_t srcStride,
u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Last column where a full 8-element vector step still fits in the row.
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),
v_minval4 = vdupq_n_u32(minVal);
u64 mask = 0ul;
// Slot units: two entries (x, y) per recorded location.
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u32 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);
// Lane is all-ones if the element equals either extremum.
uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));
uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));
// Narrow both 32-bit masks down to one byte per lane.
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
if (mask)
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Scalar tail for the remaining (< 8) elements.
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
// Back from slot units to location-pair counts.
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
// Reports whether flip() can handle the given mode / element size. Vertical
// flips are plain row copies, so any element size is accepted; horizontal
// and both-axis flips rely on vector reversal of 1-4 byte elements.
bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
{
    if (!isSupportedConfiguration())
        return false;

    if (flipMode == FLIP_VERTICAL_MODE)
        return true;

    const bool supportedElemSize = (elemSize >= 1) && (elemSize <= 4);
    return supportedElemSize &&
           ((flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE));
}
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void flip(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t js = 0, jd = size.width;
for (; js < roiw_base; js += step_base, jd -= step_base)
{
prefetch(src + js);
vec128 v_src = vld1q(src + js);
vec128 v_dst = vrev64q(v_src);
v_dst = vcombine(vget_high(v_dst), vget_low(v_dst));
vst1q(dst + jd - step_base, v_dst);
}
for (; js < roiw_tail; js += step_tail, jd -= step_tail)
{
vec64 v_src = vld1(src + js);
vst1(dst + jd - step_tail, vrev64(v_src));
}
for (--jd; js < size.width; ++js, --jd)
dst[jd] = src[js];
}
}
template <typename T>
void flip3(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
#ifndef __ANDROID__
typedef typename VecTraits<T, 3>::vec128 vec128;
#endif
typedef typename VecTraits<T, 3>::vec64 vec64;
#ifndef __ANDROID__
u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3;
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
#endif
u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t j = 0, js = 0, jd = size.width * 3;
#ifndef __ANDROID__
for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3)
{
prefetch(src + js);
vec128 v_src = vld3q(src + js), v_dst;
v_src.val[0] = vrev64q(v_src.val[0]);
v_src.val[1] = vrev64q(v_src.val[1]);
v_src.val[2] = vrev64q(v_src.val[2]);
v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0]));
v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1]));
v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2]));
vst3q(dst + jd - step_base3, v_dst);
}
#endif // __ANDROID__
for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3)
{
vec64 v_src = vld3(src + js), v_dst;
v_dst.val[0] = vrev64(v_src.val[0]);
v_dst.val[1] = vrev64(v_src.val[1]);
v_dst.val[2] = vrev64(v_src.val[2]);
vst3(dst + jd - step_tail3, v_dst);
}
for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3)
{
dst[jd] = src[js];
dst[jd + 1] = src[js + 1];
dst[jd + 2] = src[js + 2];
}
}
}
// Common signature of the flip<T> / flip3<T> instantiations so the public
// flip() dispatcher can select one by element size at runtime.
typedef void (* flipFunc)(const Size2D &size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode);
} // namespace
#endif
// Public flip entry point: handles the pure vertical case directly (row
// copies in reverse order) and dispatches horizontal / both-axis flips to
// the element-size-specific NEON implementation.
void flip(const Size2D &size,
          const u8 * srcBase, ptrdiff_t srcStride,
          u8 * dstBase, ptrdiff_t dstStride,
          FLIP_MODE flipMode, u32 elemSize)
{
    internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
#ifdef CAROTENE_NEON
    if (flipMode == FLIP_VERTICAL_MODE)
    {
        // Rows are copied unchanged, bottom-up.
        for (size_t y = 0; y < size.height; ++y)
        {
            const u8 * src_row = internal::getRowPtr(srcBase, srcStride, y);
            u8 * dst_row = internal::getRowPtr(dstBase, dstStride, size.height - y - 1);
            std::memcpy(dst_row, src_row, elemSize * size.width);
        }
        return;
    }

    // Pick the implementation matching the element size (1/2/4 bytes single
    // channel, or 3 x 1-byte channels).
    flipFunc func = NULL;
    switch (elemSize)
    {
    case sizeof(u8):
        func = &flip<u8>;
        break;
    case sizeof(u16):
        func = &flip<u16>;
        break;
    case sizeof(u32):
        func = &flip<u32>;
        break;
    case sizeof(u8) * 3:
        func = &flip3<u8>;
        break;
    default:
        break;
    }

    if (func == NULL)
        return;

    func(size,
         srcBase, srcStride,
         dstBase, dstStride,
         flipMode);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)flipMode;
    (void)elemSize;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include "separable_filter.hpp"
namespace CAROTENE_NS {
// Reports whether the NEON 3x3 Gaussian is available for this configuration:
// rows must be at least 8 pixels wide (one full NEON iteration) and only
// constant / replicate border extrapolation is implemented.
bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border)
{
    bool baseOk = isSupportedConfiguration() && size.width >= 8;
    bool borderOk = (border == BORDER_MODE_CONSTANT) ||
                    (border == BORDER_MODE_REPLICATE);
    return baseOk && borderOk;
}
// 3x3 Gaussian blur with separable kernel (1 2 1)/4 per axis (total /16).
// The vertical pass is vectorized with 8-lane NEON; the horizontal pass is
// fused into the same loop via a 3-vector sliding window (tprev/tcurr/tnext).
// Border handling: for BORDER_MODE_CONSTANT, out-of-image rows are replaced
// by borderValue (srow0/srow2 set to NULL); for REPLICATE the nearest row
// index is clamped.
void gaussianBlur3x3(const Size2D &size,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border));
#ifdef CAROTENE_NEON
    // borderValue * 4 == constant-border result of the vertical 1-2-1 pass.
    const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2);
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);

    // Sliding window of vertical-pass results (each lane = s0 + 2*s1 + s2).
    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // NULL marks a constant-border row; otherwise clamp to valid range.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        // Scalar vertical-pass values used by the tail loop below.
        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // On the last rows stop one vector early so the tail loop can handle
        // the right border without reading past the row end.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue);

                currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue);
            }

            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value: x0 + 2*x1 + x2, widened to u16.
            tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1));

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x4;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));

                continue;
            }

            // combine 3 "shifted" vectors: left/center/right neighbours of tcurr.
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);

            // and add them: t0 + 2*t1 + t2, then >>4 narrows to the final u8.
            t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2));

            vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4));
        }

        x -= 8;
        if (x == width)
            --x;

        // Scalar tail: finish the remaining pixels and the right border.
        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue << 2;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + (srow1[x] << 1) + srow0[x];
            }
            else
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        (srow1[x + 1] << 1) +
                        (srow0 ? srow0[x + 1] : borderValue);

            f32 val = (prevx + (currx << 1) + nextx) >> 4;
            drow[x] = internal::saturate_cast<u8>((s32)val);

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
// Support check for the margin-aware 3x3 Gaussian: it delegates entirely to
// the generic separable 3x3 filter check (the two zeros are the row/column
// filter shift arguments of that helper).
bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin)
{
    const bool supported = isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin);
    return supported;
}
// 3x3 Gaussian blur that honours a surrounding Margin (valid image data
// outside the processed ROI). Implemented via the generic separable-filter
// machinery with 1-2-1 row and column kernels.
void gaussianBlur3x3Margin(const Size2D &size,
                           const u8 * srcBase, ptrdiff_t srcStride,
                           u8 * dstBase, ptrdiff_t dstStride,
                           BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin));
#ifdef CAROTENE_NEON
    // RowFilter3x3S16_121 / ColFilter3x3U8_121 implement the separable
    // (1 2 1) passes; the two zeros are the filter shift parameters.
    internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3U8_121>::process(
        size, srcBase, srcStride, dstBase, dstStride,
        0, 0, border, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
// Reports whether the NEON 5x5 Gaussian can run: 1..4 channels, a region
// large enough for the vector kernels (width >= 8, height >= 2), and one of
// the five implemented border-extrapolation modes.
bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border)
{
    bool borderOk;
    switch (border)
    {
    case BORDER_MODE_CONSTANT:
    case BORDER_MODE_REFLECT101:
    case BORDER_MODE_REFLECT:
    case BORDER_MODE_REPLICATE:
    case BORDER_MODE_WRAP:
        borderOk = true;
        break;
    default:
        borderOk = false;
        break;
    }

    return isSupportedConfiguration() &&
           cn > 0 && cn <= 4 &&
           size.width >= 8 && size.height >= 2 &&
           borderOk;
}
// 5x5 Gaussian blur (u8), separable kernel (1 4 6 4 1) per axis; the combined
// weight is 256, so results are rounded with +(1<<7) and shifted right by 8.
// The vertical pass writes u16 sums into a single reusable line buffer
// ("lane"), then the horizontal pass reads that buffer with +-1/+-2 element
// offsets and narrows back to u8. Per-channel interleaving is handled either
// by vld2/vld3/vld4 de-interleaving loads or, on 32-bit ARM GCC, by inline
// NEON assembly.
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    // For constant borders, rows outside the image are replaced by a dummy
    // row pre-filled with borderValue.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Interpolated element indices for the two out-of-image columns on each
    // side, used to fill the lane buffer's guard cells for non-constant modes.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // 2*cn guard cells on each side + 32-byte alignment slack; "lane" points
    // at the first in-image element, so lane[-2*cn .. colsn+2*cn) is valid.
    std::vector<u16> _buf(cn * (size.width + 4) + 32 / sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // Constant-border guard cells never change, so fill them once up front.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // Rows whose interpolated index falls outside the accessible area
        // (image + top margin) use the borderValue dummy row.
        const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 8; x += 8)
        {
            // Prefetch a nearby source row; the row offset rotates with x
            // (heuristic, exact choice presumably tuned — not load-bearing).
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));

            uint8x8_t v0 = vld1_u8(ln0+x);
            uint8x8_t v1 = vld1_u8(ln1+x);
            uint8x8_t v2 = vld1_u8(ln2+x);
            uint8x8_t v3 = vld1_u8(ln3+x);
            uint8x8_t v4 = vld1_u8(ln4+x);

            // v0 + v4 + 6*v2 + 4*(v1 + v3), accumulated in u16.
            uint16x8_t v = vaddl_u8(v0, v4);
            uint16x8_t v13 = vaddl_u8(v1, v3);

            v = vmlal_u8(v, v2, vc6u8);
            v = vmlaq_u16(v, v13, vc4u16);

            vst1q_u16(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x];

        //left&right borders
        // Non-constant modes refresh the guard cells from the interpolated
        // in-row positions for every line.
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x <= colsn - 8; x += 8)
            {
                internal::prefetch(lane + x);

                uint16x8_t lane0 = vld1q_u16(lane + x - 2);
                uint16x8_t lane4 = vld1q_u16(lane + x + 2);
                uint16x8_t lane1 = vld1q_u16(lane + x - 1);
                uint16x8_t lane3 = vld1q_u16(lane + x + 1);
                uint16x8_t lane2 = vld1q_u16(lane + x + 0);

                uint16x8_t ln04 = vaddq_u16(lane0, lane4);
                uint16x8_t ln13 = vaddq_u16(lane1, lane3);

                uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16);
                uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16);

                // Rounding narrow: (sum + 128) >> 8 back to u8.
                uint8x8_t ls = vrshrn_n_u16(lsw, 8);

                vst1_u8(dst + x, ls);
            }
            break;
        case 2:
            for (; x <= colsn - 8*2; x += 8*2)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*2;
                u16* lidx1 = lane + x - 1*2;
                u16* lidx3 = lane + x + 1*2;
                u16* lidx4 = lane + x + 2*2;
// Hand-written NEON asm only for old 32-bit GCC (< 4.7); newer compilers use
// the intrinsic path below.
// NOTE(review): the asm writes through [out] without a "memory" clobber —
// appears to rely on __volatile__; verify if the toolchain guard is relaxed.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
                __asm__ __volatile__ (
                    "vld2.16 {d0, d2}, [%[in0]]!                              \n\t"
                    "vld2.16 {d1, d3}, [%[in0]]                               \n\t"
                    "vld2.16 {d8, d10}, [%[in4]]!                             \n\t"
                    "vld2.16 {d9, d11}, [%[in4]]                              \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vld2.16 {d16, d18}, [%[in1]]!                            \n\t"
                    "vld2.16 {d17, d19}, [%[in1]]                             \n\t"
                    "vld2.16 {d8, d10}, [%[in3]]!                             \n\t"
                    "vld2.16 {d9, d11}, [%[in3]]                              \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vld2.16 {d16, d18}, [%[in2]]                             \n\t"
                    "vld2.16 {d17, d19}, [%[in22]]                            \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vst2.8 {d8-d9}, [%[out]]                                 \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // Intrinsic path: de-interleave 2 channels, filter each
                // channel plane independently, re-interleave on store.
                uint16x8x2_t vLane0 = vld2q_u16(lidx0);
                uint16x8x2_t vLane1 = vld2q_u16(lidx1);
                uint16x8x2_t vLane2 = vld2q_u16(lane + x);
                uint16x8x2_t vLane3 = vld2q_u16(lidx3);
                uint16x8x2_t vLane4 = vld2q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);

                uint8x8x2_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);

                vst2_u8(dst + x, vRes);
#endif
            }
            break;
        case 3:
            for (; x <= colsn - 8*3; x += 8*3)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*3;
                u16* lidx1 = lane + x - 1*3;
                u16* lidx3 = lane + x + 1*3;
                u16* lidx4 = lane + x + 2*3;
// 32-bit ARM GCC takes the hand-scheduled asm; everything else uses intrinsics.
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld3.16 {d0, d2, d4}, [%[in0]]!                          \n\t"
                    "vld3.16 {d1, d3, d5}, [%[in0]]                           \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in4]]!                        \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in4]]                         \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vadd.i16 q2, q6                                          \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in1]]!                       \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in1]]                        \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in3]]!                        \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in3]]                         \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vadd.i16 q6, q10                                         \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in2]]                        \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in22]]                       \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i16 q2, q10, %q[c6]                                 \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vrshrn.u16 d10, q2, #8                                   \n\t"
                    "vst3.8 {d8-d10}, [%[out]]                                \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*3),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // Intrinsic path: 3-channel de-interleave, per-plane filter.
                uint16x8x3_t vLane0 = vld3q_u16(lidx0);
                uint16x8x3_t vLane1 = vld3q_u16(lidx1);
                uint16x8x3_t vLane2 = vld3q_u16(lane + x);
                uint16x8x3_t vLane3 = vld3q_u16(lidx3);
                uint16x8x3_t vLane4 = vld3q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]);
                uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]);
                uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]);
                uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16);

                uint8x8x3_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);

                vst3_u8(dst + x, vRes);
#endif
            }
            break;
        case 4:
            for (; x <= colsn - 8*4; x += 8*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                u16* lidx0 = lane + x - 2*4;
                u16* lidx1 = lane + x - 1*4;
                u16* lidx3 = lane + x + 1*4;
                u16* lidx4 = lane + x + 2*4;
// 32-bit ARM GCC takes the hand-scheduled asm; everything else uses intrinsics.
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld4.16 {d0, d2, d4, d6}, [%[in0]]!                      \n\t"
                    "vld4.16 {d1, d3, d5, d7}, [%[in0]]                       \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in4]]!                   \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in4]]                    \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vadd.i16 q2, q6                                          \n\t"
                    "vadd.i16 q3, q7                                          \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in1]]!                  \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in1]]                   \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in3]]!                   \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in3]]                    \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vadd.i16 q6, q10                                         \n\t"
                    "vadd.i16 q7, q11                                         \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in2],:256]              \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in22],:256]             \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i16 q3, q7, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i16 q2, q10, %q[c6]                                 \n\t"
                    "vmla.i16 q3, q11, %q[c6]                                 \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vrshrn.u16 d10, q2, #8                                   \n\t"
                    "vrshrn.u16 d11, q3, #8                                   \n\t"
                    "vst4.8 {d8-d11}, [%[out]]                                \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*4),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // Intrinsic path: 4-channel de-interleave, per-plane filter.
                uint16x8x4_t vLane0 = vld4q_u16(lidx0);
                uint16x8x4_t vLane2 = vld4q_u16(lidx4);
                uint16x8x4_t vLane4 = vld4q_u16(lidx1);
                uint16x8x4_t vLane6 = vld4q_u16(lidx3);
                uint16x8x4_t vLane8 = vld4q_u16(lane + x);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]);
                uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]);
                uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]);
                uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16);

                uint8x8x4_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);
                vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8);

                vst4_u8(dst + x, vRes);
#endif
            }
            break;
        }

        // Scalar tail of the horizontal pass, one channel plane at a time,
        // with round-to-nearest via +(1 << 7) before the >> 8.
        for (s32 h = 0; h < cn; ++h)
        {
            u16* ln = lane + h;
            u8* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn]
                         + u16(4) * (ln[k-cn] + ln[k+cn])
                         + u16(6) * ln[k] + (1 << 7)) >> 8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
// 5x5 Gaussian blur (u16 overload), same separable (1 4 6 4 1) kernel as the
// u8 version, accumulating in u32 and narrowing back with rounding >> 8.
// The horizontal pass is channel-agnostic here (no per-cn de-interleave):
// for cn > 1 the +-1/+-2 element offsets still address the correct neighbour
// only via the scalar tail; the vector loop treats the buffer as cn-interleaved
// but the guard cells keep the edges consistent.
// NOTE(review): unlike the u8 overload, the vector horizontal loop uses flat
// +-1/+-2 offsets for every cn — for cn > 1 this mixes channels; presumably
// isGaussianBlur5x5Supported callers only hit this with cn == 1. TODO confirm.
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u16 * srcBase, ptrdiff_t srcStride,
                     u16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    // Dummy row used for constant-border rows outside the image.
    std::vector<u16> _tmp;
    u16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Interpolated indices for the two out-of-image columns on each side.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // u32 accumulator line with 2*cn guard cells per side, 32-byte aligned.
    std::vector<u32> _buf(cn * (size.width + 4) + 32 / sizeof(u32));
    u32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    uint16x4_t vc6u16 = vmov_n_u16(6);
    uint32x4_t vc6u32 = vmovq_n_u32(6);
    uint32x4_t vc4u32 = vmovq_n_u32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // Rotating-offset source prefetch (heuristic).
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));

            uint16x4_t v0 = vld1_u16(ln0+x);
            uint16x4_t v1 = vld1_u16(ln1+x);
            uint16x4_t v2 = vld1_u16(ln2+x);
            uint16x4_t v3 = vld1_u16(ln3+x);
            uint16x4_t v4 = vld1_u16(ln4+x);

            // v0 + v4 + 6*v2 + 4*(v1 + v3), widened to u32.
            uint32x4_t v = vaddl_u16(v0, v4);
            uint32x4_t v13 = vaddl_u16(v1, v3);

            v = vmlal_u16(v, v2, vc6u16);
            v = vmlaq_u32(v, v13, vc4u32);

            vst1q_u32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            uint32x4_t lane0 = vld1q_u32(lane + x - 2);
            uint32x4_t lane4 = vld1q_u32(lane + x + 2);
            uint32x4_t lane1 = vld1q_u32(lane + x - 1);
            uint32x4_t lane3 = vld1q_u32(lane + x + 1);
            uint32x4_t lane2 = vld1q_u32(lane + x + 0);

            uint32x4_t ln04 = vaddq_u32(lane0, lane4);
            uint32x4_t ln13 = vaddq_u32(lane1, lane3);

            uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32);
            uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32);

            // Rounding narrow: (sum + 128) >> 8 back to u16.
            uint16x4_t ls = vrshrn_n_u32(lsw, 8);

            vst1_u16(dst + x, ls);
        }
        // Scalar tail with explicit round-to-nearest.
        for (s32 h = 0; h < cn; ++h)
        {
            u32* ln = lane + h;
            u16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
// 5x5 Gaussian blur (s16 overload), separable (1 4 6 4 1) kernel accumulated
// in s32 and narrowed back with rounding >> 8. The cn switch in the
// horizontal pass is currently vestigial: case 4's hand-written asm variant
// is commented out, so all channel counts run the same flat-offset NEON loop.
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s16 * srcBase, ptrdiff_t srcStride,
                     s16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    // Dummy row used for constant-border rows outside the image.
    std::vector<s16> _tmp;
    s16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Interpolated indices for the two out-of-image columns on each side.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // s32 accumulator line with 2*cn guard cells per side, 32-byte aligned.
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    int16x4_t vc6s16 = vmov_n_s16(6);
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // Rotating-offset source prefetch (heuristic).
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));

            int16x4_t v0 = vld1_s16(ln0+x);
            int16x4_t v1 = vld1_s16(ln1+x);
            int16x4_t v2 = vld1_s16(ln2+x);
            int16x4_t v3 = vld1_s16(ln3+x);
            int16x4_t v4 = vld1_s16(ln4+x);

            // v0 + v4 + 6*v2 + 4*(v1 + v3), widened to s32.
            int32x4_t v = vaddl_s16(v0, v4);
            int32x4_t v13 = vaddl_s16(v1, v3);

            v = vmlal_s16(v, v2, vc6s16);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        switch(cn)
        {
        case 1:
        case 2:
        case 3:
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                // Rounding narrow: (sum + 128) >> 8 back to s16.
                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        case 4:
            // Disabled hand-written asm variant for cn == 4 kept for reference.
/*            for (; x <= colsn - 4*4; x += 4*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                ptrdiff_t* lidx0 = lane + x - 2*4;
                ptrdiff_t* lidx1 = lane + x - 1*4;
                ptrdiff_t* lidx3 = lane + x + 1*4;
                ptrdiff_t* lidx4 = lane + x + 2*4;

                __asm__ __volatile__ (
                    "vld4.32 {d0, d2, d4, d6}, [%[in0]]!                      \n\t"
                    "vld4.32 {d1, d3, d5, d7}, [%[in0]]                       \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in4]]!                   \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in4]]                    \n\t"
                    "vadd.i32 q0, q4                                          \n\t"
                    "vadd.i32 q1, q5                                          \n\t"
                    "vadd.i32 q2, q6                                          \n\t"
                    "vadd.i32 q3, q7                                          \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in1]]!                  \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in1]]                   \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in3]]!                   \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in3]]                    \n\t"
                    "vadd.i32 q4, q8                                          \n\t"
                    "vadd.i32 q5, q9                                          \n\t"
                    "vadd.i32 q6, q10                                         \n\t"
                    "vadd.i32 q7, q11                                         \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in2],:256]              \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in22],:256]             \n\t"
                    "vmla.i32 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i32 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i32 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i32 q3, q7, %q[c4]                                  \n\t"
                    "vmla.i32 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i32 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i32 q2, q10, %q[c6]                                 \n\t"
                    "vmla.i32 q3, q11, %q[c6]                                 \n\t"
                    "vrshrn.i32 d8, q0, #8                                    \n\t"
                    "vrshrn.i32 d9, q1, #8                                    \n\t"
                    "vrshrn.i32 d10, q2, #8                                   \n\t"
                    "vrshrn.i32 d11, q3, #8                                   \n\t"
                    "vst4.16 {d8-d11}, [%[out]]                               \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4s32), [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
*/
            // Active path: same flat-offset loop as cases 1-3.
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        }
        // Scalar tail with explicit round-to-nearest.
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
// 5x5 Gaussian weighted-sum (s32 overload). Separable (1 4 6 4 1) kernel,
// but NOTE: unlike the u8/u16/s16 overloads this one performs NO final
// normalization — the stored result is the raw weighted sum (total kernel
// weight 256), as shown by the unshifted vst1q_s32 store and the scalar tail.
// Callers are expected to scale the output themselves.
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s32 * srcBase, ptrdiff_t srcStride,
                     s32 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    // Dummy row used for constant-border rows outside the image.
    std::vector<s32> _tmp;
    s32 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Interpolated indices for the two out-of-image columns on each side.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // s32 accumulator line with 2*cn guard cells per side, 32-byte aligned.
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s32* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // Rotating-offset source prefetch (heuristic).
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));

            int32x4_t v0 = vld1q_s32(ln0+x);
            int32x4_t v1 = vld1q_s32(ln1+x);
            int32x4_t v2 = vld1q_s32(ln2+x);
            int32x4_t v3 = vld1q_s32(ln3+x);
            int32x4_t v4 = vld1q_s32(ln4+x);

            // v0 + v4 + 6*v2 + 4*(v1 + v3), kept in s32 (no widening).
            int32x4_t v = vaddq_s32(v0, v4);
            int32x4_t v13 = vaddq_s32(v1, v3);

            v = vmlaq_s32(v, v2, vc6s32);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            int32x4_t lane0 = vld1q_s32(lane + x - 2);
            int32x4_t lane4 = vld1q_s32(lane + x + 2);
            int32x4_t lane1 = vld1q_s32(lane + x - 1);
            int32x4_t lane3 = vld1q_s32(lane + x + 1);
            int32x4_t lane2 = vld1q_s32(lane + x + 0);

            int32x4_t ln04 = vaddq_s32(lane0, lane4);
            int32x4_t ln13 = vaddq_s32(lane1, lane3);

            int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
            int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

            // Stored without any shift: raw weighted sum.
            vst1q_s32(dst + x, lsw);
        }
        // Scalar tail — likewise unnormalized.
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s32* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k];
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Narrowing stores used by inRangeCheck: write two 128-bit comparison-mask
// vectors out as one run of bytes. The u8 variant stores the 32 mask bytes
// as-is; the u16 and u32 variants narrow each lane to 8 bits first (each
// lane is all-ones or all-zeros, so truncation keeps the 0x00/0xFF result).
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
// Tail handler for inRangeCheck: vectorizes whatever the 32-byte main loop
// leaves over, parameterized on element size. This primary template (used
// for 4-byte elements, s32/f32) is a no-op: fewer than 8 elements remain,
// which is less than one 128-bit vector, so the scalar loop finishes them.
template <typename T, int elsize> struct vtail
{
    static inline void inRange(const T *, const T *, const T *,
                               u8 *, size_t &, size_t)
    {
        //do nothing since there couldn't be enough data
    }
};
// Tail handler for 2-byte elements (u16/s16): at most 15 elements remain
// after the main loop, so at most one extra 8-element (128-bit) step fits.
// Advances x past whatever it consumed; the caller's scalar loop does the rest.
template <typename T> struct vtail<T, 2>
{
    static inline void inRange(const T * src, const T * rng1, const T * rng2,
                               u8 * dst, size_t &x, size_t width)
    {
        typedef typename internal::VecTraits<T>::vec128 vec128;
        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
        //There no more than 15 elements in the tail, so we could handle 8 element vector only once
        if( x + 8 < width)
        {
            vec128 vs = internal::vld1q( src + x);
            vec128 vr1 = internal::vld1q(rng1 + x);
            vec128 vr2 = internal::vld1q(rng2 + x);
            // lane mask is all-ones iff rng1 <= src && src <= rng2
            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            internal::vst1(dst + x, internal::vmovn(vd));
            x+=8;
        }
    }
};
// Tail handler for 1-byte elements (u8/s8): at most 31 elements remain, so
// the possible vector steps are 16+8, 16 alone, or 8 alone. Advances x past
// whatever it consumed; the caller's scalar loop does the rest.
template <typename T> struct vtail<T, 1>
{
    static inline void inRange(const T * src, const T * rng1, const T * rng2,
                               u8 * dst, size_t &x, size_t width)
    {
        typedef typename internal::VecTraits<T>::vec128 vec128;
        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
        typedef typename internal::VecTraits<T>::vec64 vec64;
        typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
        //There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
        if( x + 16 < width)
        {
            // one full 128-bit step: 16 mask bytes stored directly
            vec128 vs = internal::vld1q( src + x);
            vec128 vr1 = internal::vld1q(rng1 + x);
            vec128 vr2 = internal::vld1q(rng2 + x);
            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            internal::vst1q(dst + x, vd);
            x+=16;
        }
        if( x + 8 < width)
        {
            // one 64-bit step for the remaining 8 elements
            vec64 vs = internal::vld1( src + x);
            vec64 vr1 = internal::vld1(rng1 + x);
            vec64 vr2 = internal::vld1(rng2 + x);
            uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
            internal::vst1(dst + x, vd);
            x+=8;
        }
    }
};
// Core inRange implementation: dst[i] = 0xFF where rng1[i] <= src[i] <= rng2[i],
// 0x00 otherwise. vcgeq produces an all-ones/all-zeros lane mask; two masks are
// AND-ed and vnst narrows them to one output byte per element.
template <typename T>
inline void inRangeCheck(const Size2D &_size,
                         const T * srcBase, ptrdiff_t srcStride,
                         const T * rng1Base, ptrdiff_t rng1Stride,
                         const T * rng2Base, ptrdiff_t rng2Stride,
                         u8 * dstBase, ptrdiff_t dstStride)
{
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
    Size2D size(_size);
    // When all four buffers are dense and the strides coincide with the row
    // width, fold the whole image into a single row to cut loop overhead.
    if (srcStride == dstStride &&
        srcStride == rng1Stride &&
        srcStride == rng2Stride &&
        srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // main loop consumes 32 bytes (two 128-bit vectors) of input per step
    const size_t width = size.width & ~( 32/sizeof(T) - 1 );
    for(size_t j = 0; j < size.height; ++j)
    {
        const T * src = internal::getRowPtr( srcBase, srcStride, j);
        const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
        const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
        u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
        size_t i = 0;
        for( ; i < width; i += 32/sizeof(T) )
        {
            internal::prefetch(src + i);
            internal::prefetch(rng1 + i);
            internal::prefetch(rng2 + i);
            vec128 vs = internal::vld1q( src + i);
            vec128 vr1 = internal::vld1q(rng1 + i);
            vec128 vr2 = internal::vld1q(rng2 + i);
            uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            vs = internal::vld1q( src + i + 16/sizeof(T));
            vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
            vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
            uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            vnst(dst + i, vd1, vd2);
        }
        // vectorized tail (element-size dependent), then scalar remainder
        vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
        for( ; i < size.width; i++ )
            dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
    }
}
}
// Public entry points: one inRange() overload per element type, generated
// by macro. With NEON the wrapper forwards to inRangeCheck; without NEON the
// stub only asserts (carotene requires a supported configuration at runtime).
#define INRANGEFUNC(T) \
void inRange(const Size2D &_size, \
             const T * srcBase, ptrdiff_t srcStride, \
             const T * rng1Base, ptrdiff_t rng1Stride, \
             const T * rng2Base, ptrdiff_t rng2Stride, \
             u8 * dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    inRangeCheck(_size, srcBase, srcStride, \
                 rng1Base, rng1Stride, rng2Base, rng2Stride, \
                 dstBase, dstStride); \
}
#else
// NEON unavailable: parameters unnamed because the stub never touches them.
#define INRANGEFUNC(T) \
void inRange(const Size2D &, \
             const T *, ptrdiff_t, \
             const T *, ptrdiff_t, \
             const T *, ptrdiff_t, \
             u8 *, ptrdiff_t) \
{ \
    internal::assertSupportedConfiguration(); \
}
#endif
// instantiate for every supported element type
INRANGEFUNC(u8)
INRANGEFUNC(s8)
INRANGEFUNC(u16)
INRANGEFUNC(s16)
INRANGEFUNC(s32)
INRANGEFUNC(f32)
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
// Inclusive integral image: sum(y, x) accumulates src over the rectangle
// [0..y] x [0..x]. The output has the same dimensions as the input (no extra
// zero row/column). Each row is an in-row prefix sum added to the already
// integrated row above it.
void integral(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u32 * sumBase, ptrdiff_t sumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    uint32x4_t v_zero = vmovq_n_u32(0u);
    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
    uint32x4_t prev = v_zero;   // running row total, broadcast across 4 lanes
    size_t j = 0u;
    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sum + j);
        internal::prefetch(src + j);
        // In-register prefix sum of 8 pixels: shifting the 64-bit lane left
        // by 8/16/24 bits moves the bytes up 1/2/3 positions (zero filled),
        // so after the widening adds el8[k] = x[k]+x[k-1]+x[k-2]+x[k-3].
        uint8x8_t el8shr0 = vld1_u8(src + j);
        uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
        uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
        uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
        uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
        uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
        uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
        // el4h[k] = el8[k] + el8[k+4] extends the prefix to lanes 4..7
        uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
        uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
        uint32x4_t vsumh = vaddw_u16(prev, el4h);
        vst1q_u32(sum + j, vsuml);
        vst1q_u32(sum + j + 4, vsumh);
        // carry the total of these 8 pixels into the next chunk
        prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
    }
    // scalar tail continues from the vector carry (lane 3 holds the row total)
    for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
        sum[j] = (v += src[j]);
    // the others
    for (size_t i = 1; i < size.height ; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
        sum = internal::getRowPtr(sumBase, sumStride, i);
        prev = v_zero;
        j = 0u;
        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sum + j);
            internal::prefetch(src + j);
            // the row above is already fully integrated: just add it in
            uint32x4_t vsuml = vld1q_u32(prevSum + j);
            uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
            uint8x8_t el8shr0 = vld1_u8(src + j);
            uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
            uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
            uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
            vsuml = vaddq_u32(vsuml, prev);
            vsumh = vaddq_u32(vsumh, prev);
            uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
            uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
            uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
            uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
            vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
            vsumh = vaddw_u16(vsumh, el4h);
            vst1q_u32(sum + j, vsuml);
            vst1q_u32(sum + j + 4, vsumh);
            prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
        }
        for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
            sum[j] = (v += src[j]) + prevSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sumBase;
    (void)sumStride;
#endif
}
// Squared integral image: sqsum(y, x) accumulates src^2 over the rectangle
// [0..y] x [0..x], in f64. Same dimensions as the input (no extra zero
// row/column). Per-chunk prefix sums of squares are built in u32 (8 * 255^2
// fits comfortably) and then folded into the f64 running sum.
void sqrIntegral(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 f64 * sqsumBase, ptrdiff_t sqsumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    uint16x8_t v_zero8 = vmovq_n_u16(0u);
    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
    double prev = 0.;   // running row total of squares
    size_t j = 0u;
    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sqsum + j);
        internal::prefetch(src + j);
        // squares of 8 pixels; el8shr1 is the same vector shifted up one lane.
        // The pairwise and half-vector adds below assemble the in-chunk
        // prefixes so that buf[k] = q[0] + ... + q[k] (q = squared pixels).
        uint8x8_t vsrc = vld1_u8(src + j);
        uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
        uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
        uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
        uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
        uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
        uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
        uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
        uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
        u32 buf[8];
        vst1_u32(buf, vget_low_u32(el8shr01l));
        vst1_u32(buf+2, el2l);
        vst1_u32(buf+4, el2hl);
        vst1_u32(buf+6, el2hh);
        // add the integer prefixes onto the f64 running row sum
        for(u32 k=0; k < 8; k++)
            sqsum[j+k] = prev + buf[k];
        prev += buf[7];
    }
    for (; j < size.width; ++j)
        sqsum[j] = (prev += src[j]*src[j]);
    // the others
    for (size_t i = 1; i < size.height ; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
        sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
        prev = 0.;
        j = 0u;
        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sqsum + j);
            internal::prefetch(src + j);
            uint8x8_t vsrc = vld1_u8(src + j);
            uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
            uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
            uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
            uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
            uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
            uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
            uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
            uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
            u32 buf[8];
            vst1_u32(buf, vget_low_u32(el8shr01l));
            vst1_u32(buf+2, el2l);
            vst1_u32(buf+4, el2hl);
            vst1_u32(buf+6, el2hh);
            // integrated row above + running row sum + in-chunk prefix
            for(u32 k=0; k < 8; k++)
                sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
            prev += buf[7];
        }
        for (; j < size.width; ++j)
            sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sqsumBase;
    (void)sqsumStride;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_INTRINSICS_HPP
#define CAROTENE_INTRINSICS_HPP
#include <carotene/definitions.hpp>
#include <arm_neon.h>
namespace CAROTENE_NS { namespace internal {
/////////////// Custom NEON intrinsics ///////////////////
// calculate reciprocal value
inline float32x4_t vrecpq_f32(float32x4_t val)
{
    // Hardware estimate refined by two Newton-Raphson iterations;
    // vrecpsq computes (2 - val * estimate), the NR correction factor.
    float32x4_t est = vrecpeq_f32(val);
    est = vmulq_f32(vrecpsq_f32(val, est), est);
    est = vmulq_f32(vrecpsq_f32(val, est), est);
    return est;
}
inline float32x2_t vrecp_f32(float32x2_t val)
{
    // 64-bit variant: estimate plus two Newton-Raphson refinement steps.
    float32x2_t est = vrecpe_f32(val);
    est = vmul_f32(vrecps_f32(val, est), est);
    est = vmul_f32(vrecps_f32(val, est), est);
    return est;
}
// caclulate sqrt value
inline float32x4_t vrsqrtq_f32(float32x4_t val)
{
    // Reciprocal square root: hardware estimate refined by two Newton-Raphson
    // steps; vrsqrtsq computes (3 - a * b) / 2, the NR correction factor.
    float32x4_t est = vrsqrteq_f32(val);
    est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(est, est), val), est);
    est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(est, est), val), est);
    return est;
}
inline float32x2_t vrsqrt_f32(float32x2_t val)
{
    // 64-bit variant: estimate plus two Newton-Raphson refinement steps.
    float32x2_t est = vrsqrte_f32(val);
    est = vmul_f32(vrsqrts_f32(vmul_f32(est, est), val), est);
    est = vmul_f32(vrsqrts_f32(vmul_f32(est, est), val), est);
    return est;
}
inline float32x4_t vsqrtq_f32(float32x4_t val)
{
    // sqrt(x) = 1 / rsqrt(x): refine the reciprocal-sqrt, then invert it.
    float32x4_t rsqrt = vrsqrtq_f32(val);
    return vrecpq_f32(rsqrt);
}
inline float32x2_t vsqrt_f32(float32x2_t val)
{
    // 64-bit variant of vsqrtq_f32: sqrt(x) = 1 / rsqrt(x).
    float32x2_t rsqrt = vrsqrt_f32(val);
    return vrecp_f32(rsqrt);
}
// table lookup with the table in a 128-bit register
inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
{
#ifdef __aarch64__
    // AArch64 supports this natively
    return ::vqtbl1_u8(a, b);
#else
    // AArch32 has no 128-bit-table lookup: reinterpret the q-register as a
    // pair of d-registers through a union and use the two-table form.
    // Out-of-range indices produce 0 in both forms, so behavior matches.
    union { uint8x16_t v; uint8x8x2_t w; } u = { a };
    return vtbl2_u8(u.w, b);
#endif
}
} }
#endif
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
namespace CAROTENE_NS {
// A 3x3 Laplacian is supported on a working configuration when the image is
// at least 8 pixels wide and the border mode is CONSTANT or REPLICATE.
bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 8)
        return false;
    return border == BORDER_MODE_CONSTANT ||
           border == BORDER_MODE_REPLICATE;
}
// 3x3 Laplacian with the all-ones kernel {1,1,1; 1,-8,1; 1,1,1}:
// dst = (sum of the 3x3 neighbourhood) - 9*center, saturated to u8.
// The vector path first builds per-column sums of three rows (tprev/tcurr/
// tnext, a sliding window of 8-pixel chunks), then combines three shifted
// copies horizontally and subtracts 9*center (center<<3 + center).
void Laplacian3x3(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase, ptrdiff_t dstStride,
                  BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
#ifdef CAROTENE_NEON
    const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);   // column sum of 3 border pixels
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);
    uint8x8_t vsub;   // center row of the chunk being written (for 9*center)
    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // NULL row pointer means "row of borderValue" (CONSTANT mode only)
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // stop the vector loop 8 pixels early near the bottom to avoid
        // over-reading past the last row's chunk
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);
            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
                currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
            }
            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }
            // and calculate next value: per-column sum of the three rows
            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x3;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
                vsub = x1;
                continue;
            }
            // combine 3 "shifted" vectors: left/center/right column sums
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);
            // and add them
            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
            // subtract 9*center (center*8 + center), then saturate to u8;
            // output lags one chunk behind the loads (hence drow + x - 8)
            int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
                                      vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
            uint8x8_t it0 = vqmovun_s16(tt0);
            vst1_u8(drow + x - 8, it0);
            vsub = x1;
        }
        x -= 8;
        if (x == width)
            --x;
        // scalar loop finishes the row using the prevx/currx seeds computed above
        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue * 3;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + srow1[x] + srow0[x];
            }
            else
            {
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        srow1[x + 1] +
                        (srow0 ? srow0[x + 1] : borderValue);
            }
            s32 val = (prevx + currx + nextx) - 9 * srow1[x];
            drow[x] = internal::saturate_cast<u8>((s32)val);
            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// OpenCV-compatible Laplacian variants need a working configuration, an
// image at least 8 pixels wide and 1 pixel tall, and one of the four
// supported border modes. All operands are side-effect-free predicates.
bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
{
    const bool borderOk = border == BORDER_MODE_CONSTANT ||
                          border == BORDER_MODE_REFLECT ||
                          border == BORDER_MODE_REFLECT101 ||
                          border == BORDER_MODE_REPLICATE;
    return isSupportedConfiguration() &&
           size.width >= 8 && size.height >= 1 &&
           borderOk;
}
// Laplacian with the 3x3 cross kernel {0,1,0; 1,-4,1; 0,1,0}, producing
// unscaled s16 output, with OpenCV's border conventions. The vector path
// computes the vertical part (top + bottom - 4*center) per 8-pixel chunk and
// adds the widened left/right center-row neighbours taken from a sliding
// window (xx0/xx1/xx2).
void Laplacian1OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    // CONSTANT mode: a reusable row filled with borderValue stands in for
    // rows above/below the image
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4,borderValue);
        tmp = &_tmp[2];
    }
    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;   // row above (border-resolved)
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v2 = 0;   // row below (border-resolved)
        // make border
        if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            // REPLICATE and REFLECT clamp to the edge row
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t t0, t2;
        uint8x8_t xx0 = vmov_n_u8(0x0);
        uint8x8_t xx1 = vmov_n_u8(0x0);
        uint8x8_t xx2 = vmov_n_u8(0x0);
        ptrdiff_t x = 0;
        // stop the vector loop 8 pixels early near the bottom to avoid
        // over-reading past the image
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);
            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            if(x) {
                xx0 = xx1;
                xx1 = xx2;
            } else {
                xx1 = x1;
                // make border: lane 7 of xx1 becomes the left neighbour of
                // pixel 0 (consumed via vext in the next iteration)
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    xx1 = vset_lane_u8(borderValue, x1, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
                }
            }
            xx2 = x1;
            if(x) {
                tcurr = tnext;
            }
            // vertical part: top + bottom - 4*center (vshll << 2)
            tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
                              vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
            if(!x) {
                tcurr = tnext;
                continue;
            }
            // left/right center-row neighbours, widened to s16;
            // output lags one chunk behind the loads (drow + x - 8)
            t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
            t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
            t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
            vst1q_s16(drow + x - 8, t0);
        }
        x -= 8;
        if(x == cols){
            x--;
        }
        // scalar loop finishes the row with explicit border handling
        for( ; x < cols; x++ )
        {
            s16 nextx;
            s16 prevx;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v1[0] : v1[x-1];
                nextx = x == cols-1 ? v1[x] : v1[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v1[1] : v1[x-1];
                nextx = x == cols-1 ? v1[x-1] : v1[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v1[x-1];
                nextx = x == cols-1 ? borderValue : v1[x+1];
            }
            *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// Laplacian with the 3x3 diagonal kernel {2,0,2; 0,-8,0; 2,0,2} (OpenCV's
// aperture-3 kernel), producing s16 output. The scalar path makes this
// explicit: 2 * (four diagonal neighbours - 4*center). The vector path keeps
// a sliding window of per-column (top + bottom) sums and shifts it to get
// the diagonal neighbours.
void Laplacian3OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    // CONSTANT mode: a reusable row filled with borderValue stands in for
    // rows above/below the image
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4,borderValue);
        tmp = &_tmp[2];
    }
    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;   // row above (border-resolved)
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v2 = 0;   // row below (border-resolved)
        // make border
        if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
        int16x8_t tprev = vmovq_n_s16(0x0);
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t tc = vmovq_n_s16(0x0);      // 4*center for the lagging chunk
        int16x8_t t0, t2, tcnext;
        ptrdiff_t x = 0;
        // stop the vector loop 8 pixels early near the bottom to avoid
        // over-reading past the image
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);
            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));   // 4*center
            if(x) {
                tprev = tcurr;
                tcurr = tnext;
            }
            // per-column sum of the rows above and below
            tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
            if(!x) {
                tcurr = tnext;
                tc = tcnext;
                // make border: lane 7 of tcurr becomes the left diagonal sum
                // for pixel 0 (consumed via vextq in the next iteration).
                // NOTE(review): for CONSTANT mode this seeds the lane with
                // borderValue, while the scalar path below uses
                // prevx + prevx2 = 2*borderValue for the same column — confirm
                // which is intended.
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
                }
                continue;
            }
            // diagonal sums left/right of each pixel, minus 4*center, all * 2;
            // output lags one chunk behind the loads (drow + x - 8)
            t0 = vextq_s16(tprev, tcurr, 7);
            t2 = vextq_s16(tcurr, tnext, 1);
            t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
            tc = tcnext;
            t0 = vshlq_n_s16(t0, 1);
            vst1q_s16(drow + x - 8, t0);
        }
        x -= 8;
        if(x == cols){
            x--;
        }
        // scalar loop finishes the row with explicit border handling
        for( ; x < cols; x++ )
        {
            s16 nextx, nextx2;
            s16 prevx, prevx2;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v0[0] : v0[x-1];
                prevx2 = x == 0 ? v2[0] : v2[x-1];
                nextx = x == cols-1 ? v0[x] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x] : v2[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v0[1] : v0[x-1];
                prevx2 = x == 0 ? v2[1] : v2[x-1];
                nextx = x == cols-1 ? v0[x-1] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v0[x-1];
                prevx2 = x == 0 ? borderValue : v2[x-1];
                nextx = x == cols-1 ? borderValue : v0[x+1];
                nextx2 = x == cols-1 ? borderValue : v2[x+1];
            }
            s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
            *(drow+x) = 2*res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
/*
 * 5x5 Laplacian producing OpenCV-compatible s16 output.
 *
 * The scalar tail below makes the effective kernel explicit: the result at x is
 *   2 * (pprevx + prevx + currx + nextx + nnextx)
 * with per-column vertical sums
 *   col x+-2 : v0 + 2*v1 + 2*v2 + 2*v3 + v4
 *   col x+-1 : 2*v0 - 4*v2 + 2*v4
 *   col x    : 2*v0 - 4*v1 - 12*v2 - 4*v3 + 2*v4
 * i.e. the separable-free 5x5 kernel OpenCV uses for Laplacian(ksize=5).
 *
 * border/borderValue select how rows/columns outside the image are synthesized.
 */
void Laplacian5OpenCV(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
ptrdiff_t rows = size.height, cols = size.width;
std::vector<u8> _tmp;
u8 *tmp = 0;
// For constant borders a synthetic row filled with borderValue is used in
// place of the missing rows above/below; it is 4 bytes wider than the image
// so x-2 / x+2 accesses from column 0 / cols-1 stay inside the buffer.
if (border == BORDER_MODE_CONSTANT)
{
_tmp.assign(cols + 4,borderValue);
tmp = &_tmp[2];
}
for( ptrdiff_t y = 0; y < rows; y++ )
{
// v0..v4 are the five source rows centered on y; rows outside the image
// are remapped (or pointed at tmp) according to the border mode.
const u8* v0 = 0;
const u8* v1 = 0;
const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
const u8* v3 = 0;
const u8* v4 = 0;
// make border
if (border == BORDER_MODE_REPLICATE) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
} else if (border == BORDER_MODE_REFLECT) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
} else if (border == BORDER_MODE_REFLECT101) {
// NOTE(review): the rows > 2-y guard looks unusual for REFLECT101 -- verify
// against very small images (rows <= 2).
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
// NOTE(review): 2*rows-(y+4) may be questionable when rows == 2, y == 1 -- confirm.
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);
} else if (border == BORDER_MODE_CONSTANT) {
v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
}
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
// Vector pipeline state. Each iteration computes three vertical column sums
// for 8 new columns; the *Old / *OldOld chains keep the results of the
// previous one and two iterations alive so that the output of columns
// [x-8, x-1] can combine column sums from x-10 up to x+1.
int16x8_t tnext, tc, t0;
int16x8_t tnext2, tnext3;
int16x8_t tnext1Old, tnext2Old, tnext3Old;
int16x8_t tnext4OldOldOld, tnext5OldOldOld;
int16x8_t tcurr1 = vmovq_n_s16(0x0);
int16x8_t tnext1 = vmovq_n_s16(0x0);
int16x8_t tprev1 = vmovq_n_s16(0x0);
int16x8_t tpprev1 = vmovq_n_s16(0x0);
int16x8_t tppprev1 = vmovq_n_s16(0x0);
int16x8_t tnext4Old = vmovq_n_s16(0x0);
int16x8_t tnext5Old = vmovq_n_s16(0x0);
int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
int16x8_t tnext5OldOld = vmovq_n_s16(0x0);
// do vertical convolution
ptrdiff_t x = 0;
// Presumably the vector span is shortened on the last 3 rows because v3/v4
// may point at tmp (only cols+4 bytes) -- a full-width load there could
// read past the buffer; TODO confirm.
const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
for( ; x <= bcols; x += 8 )
{
internal::prefetch(v0 + x);
internal::prefetch(v1 + x);
internal::prefetch(v2 + x);
internal::prefetch(v3 + x);
internal::prefetch(v4 + x);
uint8x8_t x0 = vld1_u8(v0 + x);
uint8x8_t x1 = vld1_u8(v1 + x);
uint8x8_t x2 = vld1_u8(v2 + x);
uint8x8_t x3 = vld1_u8(v3 + x);
uint8x8_t x4 = vld1_u8(v4 + x);
if(x) {
tcurr1 = tnext1;
}
// Shift the two-iteration history window forward.
tnext4OldOldOld = tnext4Old;
tnext5OldOldOld = tnext5Old;
tnext1Old = tnext1OldOld;
tnext2Old = tnext2OldOld;
tnext3Old = tnext3OldOld;
tnext4Old = tnext4OldOld;
tnext5Old = tnext5OldOld;
// tnext3 = 2*x1 + 4*x2 + 2*x3 (after the shift); the u8 subtractions below
// rely on u16 wraparound being reinterpreted as signed 16-bit.
tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
tnext3 = vshlq_n_s16(tnext3, 1);
tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
tnext2 = vsubq_s16(tc, tnext);
tnext1 = vaddq_s16(tnext3, tnext2);
// tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4   (column sum for x+-2 taps)
tnext2 = vshlq_n_s16(tnext2, 1);
// tnext2 = 2*x4 - 4*x2 + 2*x0             (column sum for x+-1 taps)
tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
// tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4  (center column sum)
tnext1OldOld = tnext1;
tnext2OldOld = tnext2;
tnext3OldOld = tnext3;
tnext4OldOld = tnext2;
tnext5OldOld = tnext1;
if(x) {
// Align lagged column sums with output columns [x-8, x-1]:
// tnext1  -> columns x-6..x+1 (nnext term), tcurr1 -> x-7..x (next term),
// tprev1  -> x-8..x-1 (center), tpprev1 -> x-9..x-2 (prev),
// tppprev1 -> x-10..x-3 (pprev).
tnext1 = vextq_s16(tnext1Old, tnext1, 2);
tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
tprev1 = tnext3Old;
if(x!=8) {
tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
}
}
if(!x) {
// First iteration only fills the pipeline; synthesize the lanes that
// fall left of column 0 according to the border mode, then continue.
// make border
if (border == BORDER_MODE_REPLICATE) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
} else if (border == BORDER_MODE_REFLECT) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
} else if (border == BORDER_MODE_REFLECT101) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
} else if (border == BORDER_MODE_CONSTANT) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
}
tppprev1 = tprev1;
continue;
}
// Horizontal combination of the five aligned column sums, then doubled.
t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
t0 = vaddq_s16(t0, t0);
vst1q_s16(drow + x - 8, t0);
}
// Scalar tail: redo the last (and any remaining) columns with explicit
// border handling at x == 0, 1, cols-2 and cols-1.
x -= 8;
if(x >= cols - 1)
x = cols-2;
s16 pprevx = 0;
s16 prevx = 0;
s16 nextx = 0;
s16 nnextx = 0;
for( ; x < cols; x++ )
{
if (x == 0) {
// make border
if (border == BORDER_MODE_REPLICATE) {
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else if (border == BORDER_MODE_REFLECT) {
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else if (border == BORDER_MODE_REFLECT101) {
pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
} else if (border == BORDER_MODE_CONSTANT) {
// Column of constants: (1+2+2+2+1) * borderValue; the x-1 column sum
// is zero because 2*c - 4*c + 2*c == 0.
pprevx = 8 * borderValue;
prevx = 0;
}
} else if (x == 1) {
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
} else if (border == BORDER_MODE_REFLECT101) {
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
} else if (border == BORDER_MODE_CONSTANT) {
pprevx = 8 * borderValue;
}
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else {
pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
}
s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
if (x == cols-1) {
// make border
if (border == BORDER_MODE_REPLICATE) {
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
} else if (border == BORDER_MODE_REFLECT) {
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
} else if (border == BORDER_MODE_REFLECT101) {
nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
} else if (border == BORDER_MODE_CONSTANT) {
nextx = 0;
nnextx = 8 * borderValue;
}
} else if (x == cols-2) {
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
} else if (border == BORDER_MODE_REFLECT101) {
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
} else if (border == BORDER_MODE_CONSTANT) {
nnextx = 8 * borderValue;
}
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
} else {
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
}
s16 res = pprevx + prevx + currx + nextx + nnextx;
*(drow+x) = 2*res;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cmath>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Functor for internal::vtransform: dst = saturate_cast<s16>(sqrt(a^2 + b^2))
// element-wise over two s16 sources. The squares are accumulated in f32
// (via widening s16*s16 -> s32 multiplies) to avoid overflow before the sqrt.
struct Magnitude
{
typedef s16 type;
// Full 8-lane vector path: process low and high halves separately.
void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
int16x8_t & v_dst) const
{
int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
v_src0_p = vget_high_s16(v_src0);
v_src1_p = vget_high_s16(v_src1);
float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));
// vqmovn saturates the s32 roots back into the s16 destination lanes.
v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
}
// Half-width 4-lane vector path (tail handling).
void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
int16x4_t & v_dst) const
{
float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
v_dst = vqmovn_s32(v_sqrt);
}
// Scalar path for the last few elements of a row.
void operator() (const short * src0, const short * src1, short * dst) const
{
f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
}
};
// Functor for internal::vtransform: dst = sqrt(a^2 + b^2) element-wise
// over two f32 sources (no saturation needed in the float variant).
struct MagnitudeF32
{
typedef f32 type;
// Full 4-lane vector path.
void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
float32x4_t & v_dst) const
{
v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1)));
}
// Half-width 2-lane vector path (tail handling).
void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
float32x2_t & v_dst) const
{
v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1)));
}
// Scalar path for the last few elements of a row.
void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
{
dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
}
};
} // namespace
#endif
// Element-wise magnitude of two s16 planes: dst = saturate(sqrt(a^2 + b^2)).
// Thin wrapper that delegates all per-row/per-vector work to
// internal::vtransform with the Magnitude functor defined above.
void magnitude(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
Magnitude());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
// Element-wise magnitude of two f32 planes: dst = sqrt(a^2 + b^2).
// Thin wrapper delegating to internal::vtransform with MagnitudeF32.
void magnitude(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
MagnitudeF32());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cmath>
namespace CAROTENE_NS {
// Mean and standard deviation of a u8 image.
// Delegates the accumulation to sqsum() and derives the statistics from
// the raw sum / sum-of-squares: stddev = sqrt(E[x^2] - E[x]^2).
void meanStdDev(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Single pass over the image: accumulate sum and sum of squares.
    f64 sum = 0.0, sumOfSquares = 0.0;
    sqsum(size, srcBase, srcStride, &sum, &sumOfSquares, 1);

    // Clamp the variance at zero to guard against floating-point rounding.
    const f64 invTotal  = 1.0 / size.total();
    const f64 meanValue = sum * invTotal;
    const f64 variance  = std::max(sumOfSquares * invTotal - meanValue * meanValue, 0.0);
    const f64 stdDevValue = sqrt(variance);

    if (pMean)
        *pMean = meanValue;
    if (pStdDev)
        *pStdDev = stdDevValue;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
// Mean and standard deviation of a u16 image.
// Vector path accumulates in u32 (sum) and f32 (sum of squares); the work is
// split into blocks of at most blockSize0 elements so the block accumulators
// can be flushed into f64 totals before f32 precision loss / u32 overflow
// becomes a concern.
void meanStdDev(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
f32 * pMean, f32 * pStdDev)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// roiw4: widest column index that still allows a full 4-lane load.
size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
f64 fsum = 0.0f, fsqsum = 0.0f;
// Scratch for the horizontal reduction: lanes 0-3 = sum, 4-7 = sqsum.
f32 arsum[8];
uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0u;
while (j < roiw4)
{
// End index of the current block (not its length).
size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
v_sum = v_zero;
v_sqsum = v_zero_f;
// Main unrolled loop: 16 elements (two q-registers) per iteration.
for ( ; j + 16 < blockSize ; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
// 0
uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
// 1
v_srclo = vmovl_u16(vget_low_u16(v_src1));
v_srchi = vmovl_u16(vget_high_u16(v_src1));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
v_srclo_f = vcvtq_f32_u32(v_srclo);
v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
}
// 4-element vector tail of the block.
for ( ; j < blockSize; j += 4)
{
uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
float32x4_t v_src_f = vcvtq_f32_u32(v_src);
v_sum = vaddq_u32(v_sum, v_src);
v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
}
// Flush the block accumulators into the f64 running totals.
vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
vst1q_f32(arsum + 4, v_sqsum);
fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
}
// collect a few last elements in the current row
for ( ; j < size.width; ++j)
{
f32 srcval = src[j];
fsum += srcval;
fsqsum += srcval * srcval;
}
}
// calc mean and stddev
f64 itotal = 1.0 / size.total();
f64 mean = fsum * itotal;
// max(...) guards against a slightly negative variance from rounding.
f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));
if (pMean)
*pMean = mean;
if (pStdDev)
*pStdDev = stddev;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)pMean;
(void)pStdDev;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
/*
* The code here is based on the code in
* <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in public domain.
* See also <http://ndevilla.free.fr/median/median/index.html>.
*/
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Build the "left neighbor" vector for the first pixels of a row when no real
// margin exists: spill the 16-lane vector to scratch shifted right by one
// pixel (cn channels) and replicate the first pixel into the vacated bytes.
uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
{
    u8 scratch[16 + 8];
    vst1q_u8(scratch + cn, r);
    u32 ch = 0;
    while (ch < cn)
    {
        scratch[ch] = scratch[cn + ch];
        ++ch;
    }
    return vld1q_u8(scratch);
}
// Build the "right neighbor" vector for the last pixels of a row when no real
// margin exists: spill the 8-lane vector to scratch, duplicate its last pixel
// (cn channels) just past the end, and reload from offset cn.
uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
{
    u8 scratch[8 + 8];
    vst1_u8(scratch, r);
    u32 ch = 0;
    while (ch < cn)
    {
        scratch[8 + ch] = scratch[8 - cn + ch];
        ++ch;
    }
    return vld1_u8(scratch + cn);
}
} // namespace
//o------^-------^-----------------------------o 0
// | |
//o--^---v---^---|-------^---------------------o 1
// | | | |
//o--v-------v---|-------|-^-------^-------^---o 2
// | | | | |
//o------^-------v-----^-|-|-------|-------|---o 3
// | | | | | |
//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
// | | | | | | |
//o--v-------v---^-|---|---v---|-------|-------o 5
// | | | | |
//o------^-------|-|---v-------|-------v-------o 6
// | | | |
//o--^---v---^---|-v-----------v---------------o 7
// | | |
//o--v-------v---v-----------------------------o 8
// ELT(num, level) names SSA-style temporaries v<num>_lv<level>: every min/max
// result gets a fresh "level" suffix so each vector register is written once.
#define ELT(num, level) v ## num ## _lv ## level
// PIX_SORT orders a pair: after it, ELT(a,newlvl) <= ELT(b,newlvl).
// PIX_MIN / PIX_MAX are defined at each expansion site (128-bit or 64-bit).
#define PIX_SORT(a, alvl, b, blvl, newlvl) \
PIX_MIN(a, alvl, b, blvl, newlvl); \
PIX_MAX(a, alvl, b, blvl, newlvl);
// 19-operation median-of-9 sorting network (see the wiring diagram above);
// the median ends up in ELT(4, 19). Based on N. Devillard's optmed code.
#define SORT9 \
PIX_SORT(1, 00, 2, 00, 01); \
PIX_SORT(4, 00, 5, 00, 02); \
PIX_SORT(7, 00, 8, 00, 03); \
PIX_SORT(0, 00, 1, 01, 04); \
PIX_SORT(3, 00, 4, 02, 05); \
PIX_SORT(6, 00, 7, 03, 06); \
PIX_SORT(1, 04, 2, 01, 07); \
PIX_SORT(4, 05, 5, 02, 08); \
PIX_SORT(7, 06, 8, 03, 09); \
PIX_MAX (0, 04, 3, 05, 10); \
PIX_MIN (5, 08, 8, 09, 11); \
PIX_SORT(4, 08, 7, 09, 12); \
PIX_MAX (3, 10, 6, 06, 13); \
PIX_MAX (1, 07, 4, 12, 14); \
PIX_MIN (2, 07, 5, 11, 15); \
PIX_MIN (4, 14, 7, 12, 16); \
PIX_SORT(4, 16, 2, 15, 17); \
PIX_MAX (6, 13, 4, 17, 18); \
PIX_MIN (4, 18, 2, 17, 19);
#endif
// The 3x3 median needs NEON support, at most 8 interleaved channels, and
// rows wide enough for a 16-byte vector plus one pixel of horizontal border.
bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 16 + numChannels)
        return false;
    return numChannels <= 8;
}
// 3x3 median filter over interleaved u8 channels.
// Each output pixel is the channel-wise median of its 3x3 neighborhood,
// computed lane-parallel with the SORT9 sorting network. srcMargin describes
// how many real pixels exist outside the ROI; where the margin is zero the
// edge rows/pixels are replicated instead.
//
// The `goto` pattern: the loads for the first (and last) position need special
// border handling, so they are done once before the loop and control jumps
// into the loop body to share the sorting-network code with the steady state.
void medianFilter3x3(const Size2D &size, u32 numChannels,
const u8 *srcBase, ptrdiff_t srcStride,
const Margin &srcMargin,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
#ifdef CAROTENE_NEON
u32 cn = numChannels;
size_t colsn = size.width * cn;
for (size_t i = 0; i < size.height; ++i) {
// Rows above/below: use real margin rows if present, else replicate.
const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
u8* pdst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
{
// Main pass: 16 pixels per iteration, left to right.
// v0..v2 = left column, v3..v5 = center column, v6..v8 = right column.
uint8x16_t v3_lv00 = vld1q_u8(psrc0);
uint8x16_t v4_lv00 = vld1q_u8(psrc1);
uint8x16_t v5_lv00 = vld1q_u8(psrc2);
uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);
uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);
// Enter the loop body with the specially-prepared first batch.
goto medianBlur3x3_mainBody;
for (; j < colsn - 16; j += 16) {
internal::prefetch(psrc0 + j);
internal::prefetch(psrc1 + j);
internal::prefetch(psrc2 + j);
v0_lv00 = vld1q_u8(psrc0 + j - cn);
v1_lv00 = vld1q_u8(psrc1 + j - cn);
v2_lv00 = vld1q_u8(psrc2 + j - cn);
v3_lv00 = vld1q_u8(psrc0 + j);
v4_lv00 = vld1q_u8(psrc1 + j);
v5_lv00 = vld1q_u8(psrc2 + j);
v6_lv00 = vld1q_u8(psrc0 + j + cn);
v7_lv00 = vld1q_u8(psrc1 + j + cn);
v8_lv00 = vld1q_u8(psrc2 + j + cn);
medianBlur3x3_mainBody:
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
// Median of the nine vectors lands in v4_lv19.
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1q_u8(pdst + j, v4_lv19);
}
}
{
// Tail pass: 8 pixels per iteration, right to left, starting at the
// row end (with right-border handling) and stopping once the region
// already covered by the main pass is reached. Overlapping stores are
// fine because the median of each pixel is position-independent.
size_t k = colsn - 8;
uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);
uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);
goto medianBlur3x3_tailBody;
for (; k >= j - 8; k -= 8) {
v0_lv00 = vld1_u8(psrc0 + k - cn);
v1_lv00 = vld1_u8(psrc1 + k - cn);
v2_lv00 = vld1_u8(psrc2 + k - cn);
v3_lv00 = vld1_u8(psrc0 + k);
v4_lv00 = vld1_u8(psrc1 + k);
v5_lv00 = vld1_u8(psrc2 + k);
v6_lv00 = vld1_u8(psrc0 + k + cn);
v7_lv00 = vld1_u8(psrc1 + k + cn);
v8_lv00 = vld1_u8(psrc2 + k + cn);
medianBlur3x3_tailBody:
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1_u8(pdst + k, v4_lv19);
}
}
}
#else
(void)size;
(void)numChannels;
(void)srcBase;
(void)srcStride;
(void)srcMargin;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Element-wise minimum functor for internal::vtransform: provides the
// 128-bit, 64-bit and scalar entry points the transform driver dispatches to.
template <typename T>
struct Min
{
typedef T type;
// Full-width vector lanes.
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vminq(v_src0, v_src1);
}
// Half-width vector lanes (row tails).
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmin(v_src0, v_src1);
}
// Scalar fallback for the last few elements.
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::min(src0[0], src1[0]);
}
};
// Element-wise maximum functor for internal::vtransform; mirror of Min.
template <typename T>
struct Max
{
typedef T type;
// Full-width vector lanes.
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vmaxq(v_src0, v_src1);
}
// Half-width vector lanes (row tails).
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmax(v_src0, v_src1);
}
// Scalar fallback for the last few elements.
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::max(src0[0], src1[0]);
}
};
} // namespace
// IMPL_OP stamps out one public min()/max() overload for a given element
// type, delegating the element-wise work to internal::vtransform with the
// matching functor (NEON build)...
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, op<type>()); \
}
#else
// ...or, without NEON, a stub that always reports an unsupported
// configuration (parameters unnamed to avoid unused warnings).
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
// One min/max overload pair per supported element type.
#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
IMPL_MINMAX(u8)
IMPL_MINMAX(s8)
IMPL_MINMAX(u16)
IMPL_MINMAX(s16)
IMPL_MINMAX(u32)
IMPL_MINMAX(s32)
IMPL_MINMAX(f32)
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <limits>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Generic min/max reduction over a 2D array of element type T.
// Per row: a 32-byte main loop (two full vector registers), then an 8-byte
// vector tail, then scalar leftovers; vector partials are reduced once at
// the end. Either output pointer may be null to skip that result.
template <typename T>
void minMaxVals(const Size2D &size,
const T * srcBase, ptrdiff_t srcStride,
T * pMinVal, T * pMaxVal)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
// step_base: elements per 32-byte iteration; step_tail: per 8-byte iteration.
u32 step_base = 32 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
// Scalar accumulators start at the identity values for min/max.
T maxVal = std::numeric_limits<T>::min();
T minVal = std::numeric_limits<T>::max();
vec128 v_min_base = vdupq_n(minVal), v_max_base = vdupq_n(maxVal);
vec64 v_min_tail = vdup_n(minVal), v_max_tail = vdup_n(maxVal);
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for (; j < roiw_base; j += step_base)
{
prefetch(src + j);
vec128 v_src0 = vld1q(src + j), v_src1 = vld1q(src + j + 16 / sizeof(T));
v_min_base = vminq(v_min_base, v_src0);
v_max_base = vmaxq(v_max_base, v_src0);
v_min_base = vminq(v_min_base, v_src1);
v_max_base = vmaxq(v_max_base, v_src1);
}
for (; j < roiw_tail; j += step_tail)
{
vec64 v_src0 = vld1(src + j);
v_min_tail = vmin(v_min_tail, v_src0);
v_max_tail = vmax(v_max_tail, v_src0);
}
// Scalar leftovers fold directly into minVal/maxVal.
for (; j < size.width; j++)
{
T srcval = src[j];
minVal = std::min(srcval, minVal);
maxVal = std::max(srcval, maxVal);
}
}
// collect min & max values
// Horizontal reduction: fold the 128-bit partials into the 64-bit ones,
// store mins in ar[0..] and maxes in ar[8/sizeof(T)..], then scan.
T ar[16 / sizeof(T)];
vst1q(ar, vcombine(vmin(v_min_tail, vmin(vget_low(v_min_base), vget_high(v_min_base))),
vmax(v_max_tail, vmax(vget_low(v_max_base), vget_high(v_max_base)))));
for (size_t x = 0; x < 8u / sizeof(T); ++x)
{
minVal = std::min(minVal, ar[x]);
maxVal = std::max(maxVal, ar[x + 8 / sizeof(T)]);
}
if (pMaxVal)
*pMaxVal = maxVal;
if (pMinVal)
*pMinVal = minVal;
}
} // namespace
#endif
// Min/max of a u8 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.  On builds without NEON the
// configuration assertion fails and the parameters are only silenced.
void minMaxVals(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * pMinVal, u8 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<u8>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Min/max of an s16 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.
void minMaxVals(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * pMinVal, s16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<s16>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Min/max of a u16 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.
void minMaxVals(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * pMinVal, u16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<u16>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Min/max of an s32 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.
void minMaxVals(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * pMinVal, s32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<s32>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Min/max of a u32 region of interest; thin dispatcher to the shared NEON
// template.  Either output pointer may be NULL.
void minMaxVals(const Size2D &size,
                const u32 * srcBase, ptrdiff_t srcStride,
                u32 * pMinVal, u32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<u32>(size, srcBase, srcStride, pMinVal, pMaxVal);
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)pMinVal; (void)pMaxVal;
#endif
}
// Find the minimum and maximum of an f32 region together with the (row, col)
// of their first occurrence in row-major scan order.  Rows of at least 16
// elements use a 4-lane NEON loop; narrower rows and tails fall back to a
// scalar scan.
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0) so all comparisons are well-defined.
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width >= 16)
        {
            // Per-lane column offsets of the current 4-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: walk the row in chunks of at most 0xffffFFFC
            // columns so the 32-bit lane indices cannot wrap; b is the base
            // column of the current chunk.
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFFC is a sentinel index meaning "no lane improved
                // the running extremum during this chunk".
                float32x4_t n_min = vdupq_n_f32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t n_max = vdupq_n_f32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
                for(; i < bound; i+=4)
                {
                    internal::prefetch(src + i);
                    float32x4_t line = vld1q_f32(src + i);
                    // Strict compares: later equal values do not displace an
                    // earlier occurrence within a lane.
                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);
                    n_min = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }
                // Horizontal reduction over the four lanes; on equal values
                // the smaller index wins so the first occurrence is kept.
                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];
                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update the location only if some lane actually improved on
                // the previous extremum (index below the sentinel).
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder (and whole rows narrower than 16 elements).
        for(; i < size.width; ++i )
        {
            float val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// Masked variant of f32 minMaxLoc: only elements whose mask byte is non-zero
// participate.  If nothing passes the mask, minCol/minRow (resp. maxCol/
// maxRow) stay at size.width/size.height and the values at their seeds.
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               const u8 * maskBase, ptrdiff_t maskStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed with "nothing found": extreme values and out-of-range locations.
    minVal = std::numeric_limits<f32>::max();
    minCol = size.width;
    minRow = size.height;
    maxVal = -std::numeric_limits<f32>::max();
    maxCol = size.width;
    maxRow = size.height;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr( srcBase, srcStride, l);
        const u8 * mask = internal::getRowPtr( maskBase, maskStride, l);
        if (size.width >= 16)
        {
            // Per-lane column offsets of the current 4-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t uOne = vdupq_n_u32(1);
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: chunk the row so 32-bit lane indices cannot wrap.
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFFC is a sentinel index: "no lane improved here".
                float32x4_t n_min = vdupq_n_f32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t n_max = vdupq_n_f32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
                for(; i < bound; i+=4)
                {
                    internal::prefetch(src + i);
                    internal::prefetch(mask + i);
                    float32x4_t line = vld1q_f32(src + i);
                    // NOTE(review): this loads 8 mask bytes although only the
                    // low 4 are consumed; the last iteration may read up to 4
                    // bytes past column bound+3 — confirm row padding covers it.
                    uint8x8_t maskLine = vld1_u8(mask + i);
                    // Widen 4 mask bytes to 32 bits and turn any non-zero
                    // value into an all-ones lane mask.
                    uint32x4_t maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine)));
                    maskLine4 = vcgeq_u32(maskLine4, uOne);
                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);
                    // Only masked-in lanes may update the running extrema.
                    minmask = vandq_u32(minmask, maskLine4);
                    maxmask = vandq_u32(maxmask, maskLine4);
                    n_min = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }
                // Horizontal reduction; smaller index wins on equal values.
                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];
                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update locations only when some lane beat the sentinel.
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder: skip masked-out pixels.
        for(; i < size.width; i++ )
        {
            if (!mask[i])
                continue;
            f32 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)maskBase;
    (void)maskStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// s32 variant of minMaxLoc: min/max values and (row, col) of their first
// occurrence in row-major scan order.  Same structure as the f32 version
// with integer compare/select intrinsics.
void minMaxLoc(const Size2D &size,
               const s32 * srcBase, ptrdiff_t srcStride,
               s32 &minVal, size_t &minCol, size_t &minRow,
               s32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const s32 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width >= 16)
        {
            // Per-lane column offsets of the current 4-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: chunk the row so 32-bit lane indices cannot wrap.
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFFC is a sentinel index: "no lane improved here".
                int32x4_t n_min = vdupq_n_s32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                int32x4_t n_max = vdupq_n_s32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
                for(; i < bound; i+=4 )
                {
                    internal::prefetch(src + i);
                    int32x4_t line = vld1q_s32(src + i);
                    // Strict compares keep the earliest occurrence per lane.
                    uint32x4_t minmask = vcltq_s32(line, n_min);
                    uint32x4_t maxmask = vcgtq_s32(line, n_max);
                    n_min = vbslq_s32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_s32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }
                // Horizontal reduction; smaller index wins on equal values.
                s32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];
                vst1q_s32(fmin, n_min);
                vst1q_s32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 4; ++j)
                {
                    s32 minval = fmin[j];
                    s32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update locations only when some lane beat the sentinel.
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder (and whole rows narrower than 16 elements).
        for(; i < size.width; ++i )
        {
            s32 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// s16 variant of minMaxLoc.  Processes 8 elements per NEON iteration; since
// lane indices are kept as 32-bit values, the 8-lane compare masks are split
// into low/high 4-lane halves with separate index vectors, and the high half
// is offset by +4 after the loop.
void minMaxLoc(const Size2D &size,
               const s16 * srcBase, ptrdiff_t srcStride,
               s16 &minVal, size_t &minCol, size_t &minRow,
               s16 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const s16 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width >= 32)
        {
            // Column offsets for one 4-lane half of the 8-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c8 = vdupq_n_u32(8);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: chunk the row so 32-bit lane indices cannot wrap.
            size_t boundAll = size.width - (8 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFF8 is a sentinel index: "no lane improved here".
                int16x8_t n_min = vdupq_n_s16(minVal);
                uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
                int16x8_t n_max = vdupq_n_s16(maxVal);
                uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);
                for(; i < bound; i+=8 )
                {
                    internal::prefetch(src + i);
                    int16x8_t line = vld1q_s16(src + i);
                    // Strict compares keep the earliest occurrence per lane.
                    uint16x8_t minmask = vcltq_s16(line, n_min);
                    uint16x8_t maxmask = vcgtq_s16(line, n_max);
                    n_min = vbslq_s16(minmask, line, n_min);
                    // Widen each 16-bit mask half to 32 bits; the saturating
                    // shift by 31 turns 0x0000ffff into an all-ones select
                    // mask as required by vbsl.
                    uint16x4_t minml = vget_low_u16(minmask);
                    uint16x4_t minmh = vget_high_u16(minmask);
                    uint32x4_t minml2 = vmovl_u16(minml);
                    uint32x4_t minmh2 = vmovl_u16(minmh);
                    minml2 = vqshlq_n_u32(minml2, 31);
                    minmh2 = vqshlq_n_u32(minmh2, 31);
                    n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
                    n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);
                    n_max = vbslq_s16(maxmask, line, n_max);
                    uint16x4_t maxml = vget_low_u16(maxmask);
                    uint16x4_t maxmh = vget_high_u16(maxmask);
                    uint32x4_t maxml2 = vmovl_u16(maxml);
                    uint32x4_t maxmh2 = vmovl_u16(maxmh);
                    maxml2 = vqshlq_n_u32(maxml2, 31);
                    maxmh2 = vqshlq_n_u32(maxmh2, 31);
                    n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
                    n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);
                    // idx[] +=8
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
                }
                // fix high part of indexes
                // (both halves shared lineIdxOffset, so lanes 4..7 are +4).
                uint32x4_t c4 = vdupq_n_u32((int32_t) 4);
                n_minIdxh = vaddq_u32(n_minIdxh, c4);
                n_maxIdxh = vaddq_u32(n_maxIdxh, c4);
                // Horizontal reduction; smaller index wins on equal values.
                s16 fmin[8], fmax[8];
                u32 fminIdx[8], fmaxIdx[8];
                vst1q_s16(fmin, n_min);
                vst1q_s16(fmax, n_max);
                vst1q_u32(fminIdx+0, n_minIdxl);
                vst1q_u32(fmaxIdx+0, n_maxIdxl);
                vst1q_u32(fminIdx+4, n_minIdxh);
                vst1q_u32(fmaxIdx+4, n_maxIdxh);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 8; ++j)
                {
                    s16 minval = fmin[j];
                    s16 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update locations only when some lane beat the sentinel.
                if(minIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder (and whole rows narrower than 32 elements).
        for(; i < size.width; ++i )
        {
            short val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// u16 variant of minMaxLoc.  Same split-halves index scheme as the s16
// version: 8 elements per iteration, 32-bit indices held in low/high 4-lane
// vectors, high half fixed up by +4 after the loop.
void minMaxLoc(const Size2D &size,
               const u16 * srcBase, ptrdiff_t srcStride,
               u16 &minVal, size_t &minCol, size_t &minRow,
               u16 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const u16 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width >= 32)
        {
            // Column offsets for one 4-lane half of the 8-element batch.
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c8 = vdupq_n_u32(8);
#if SIZE_MAX > UINT32_MAX
            // 64-bit builds: chunk the row so 32-bit lane indices cannot wrap.
            size_t boundAll = size.width - (8 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // 0xffffFFF8 is a sentinel index: "no lane improved here".
                uint16x8_t n_min = vdupq_n_u16(minVal);
                uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
                uint16x8_t n_max = vdupq_n_u16(maxVal);
                uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);
                for(; i < bound; i+=8 )
                {
                    internal::prefetch(src + i);
                    uint16x8_t line = vld1q_u16(src + i);
                    // Strict compares keep the earliest occurrence per lane.
                    uint16x8_t minmask = vcltq_u16(line, n_min);
                    uint16x8_t maxmask = vcgtq_u16(line, n_max);
                    n_min = vbslq_u16(minmask, line, n_min);
                    // Widen mask halves to 32 bits; the saturating shift by
                    // 31 turns 0x0000ffff into an all-ones vbsl select mask.
                    uint16x4_t minml = vget_low_u16(minmask);
                    uint16x4_t minmh = vget_high_u16(minmask);
                    uint32x4_t minml2 = vmovl_u16(minml);
                    uint32x4_t minmh2 = vmovl_u16(minmh);
                    minml2 = vqshlq_n_u32(minml2, 31);
                    minmh2 = vqshlq_n_u32(minmh2, 31);
                    n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
                    n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);
                    n_max = vbslq_u16(maxmask, line, n_max);
                    uint16x4_t maxml = vget_low_u16(maxmask);
                    uint16x4_t maxmh = vget_high_u16(maxmask);
                    uint32x4_t maxml2 = vmovl_u16(maxml);
                    uint32x4_t maxmh2 = vmovl_u16(maxmh);
                    maxml2 = vqshlq_n_u32(maxml2, 31);
                    maxmh2 = vqshlq_n_u32(maxmh2, 31);
                    n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
                    n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);
                    // idx[] +=8
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
                }
                // fix high part of indexes
                // (both halves shared lineIdxOffset, so lanes 4..7 are +4).
                uint32x4_t c4 = vdupq_n_u32(4);
                n_minIdxh = vaddq_u32(n_minIdxh, c4);
                n_maxIdxh = vaddq_u32(n_maxIdxh, c4);
                // Horizontal reduction; smaller index wins on equal values.
                u16 fmin[8], fmax[8];
                u32 fminIdx[8], fmaxIdx[8];
                vst1q_u16(fmin, n_min);
                vst1q_u16(fmax, n_max);
                vst1q_u32(fminIdx+0, n_minIdxl);
                vst1q_u32(fmaxIdx+0, n_maxIdxl);
                vst1q_u32(fminIdx+4, n_minIdxh);
                vst1q_u32(fmaxIdx+4, n_maxIdxh);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 8; ++j)
                {
                    u16 minval = fmin[j];
                    u16 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update locations only when some lane beat the sentinel.
                if(minIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar remainder (and whole rows narrower than 32 elements).
        for(; i < size.width; ++i )
        {
            u16 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Min/max with first-occurrence indices over one contiguous block of u8
// elements.  Callers pass len <= USHORT_BLOCK_MAX_SIZE so the indices fit in
// u16.  Processes 16 bytes per iteration with split low/high u16 index
// vectors (high half fixed up by +8 after the loop), then a scalar tail.
void minMaxLocBlock(const u8 * src, u32 len,
                    u8 &minVal, u16 &minIdx,
                    u8 &maxVal, u16 &maxIdx)
{
    // Index offsets for one 8-lane half of the 16-byte batch.
    u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    // Seed with element 0 (index vectors start at 0 accordingly).
    uint8x16_t n_min = vdupq_n_u8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    uint8x16_t n_max = vdupq_n_u8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16 = vdupq_n_u16(16);
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);
    s32 i = 0;
    // NOTE(review): len - 15 wraps for len < 15; the s32 result is negative
    // on the supported targets so the vector loop is skipped — confirm.
    s32 bound = len - (16 - 1);
    for(; i < bound; i+=16 )
    {
        internal::prefetch(src + i);
        uint8x16_t line = vld1q_u8(src + i);
        // Strict compares keep the earliest occurrence per lane.
        uint8x16_t minmask = vcltq_u8(line, n_min);
        uint8x16_t maxmask = vcgtq_u8(line, n_max);
        n_min = vbslq_u8(minmask, line, n_min);
        // Widen mask halves to 16 bits; the saturating shift by 15 turns
        // 0x00ff into an all-ones vbsl select mask.
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);
        n_max = vbslq_u8(maxmask, line, n_max);
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);
        // idx[] +=16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }
    // fix high part of indexes
    // (both halves shared lineIdxOffset, so lanes 8..15 are +8).
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);
    u8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];
    // Alternative pairwise reduction kept disabled by the original authors.
    /*{
        uint8x8_t min_low = vget_low_u8(n_min);
        uint8x8_t min_high = vget_high_u8(n_min);
        uint8x8_t max_low = vget_low_u8(n_max);
        uint8x8_t max_high = vget_high_u8(n_max);
        uint8x8_t minmask = vclt_u8(min_low, min_high);
        uint8x8_t maxmask = vcgt_u8(max_low, max_high);
        uint8x8_t min2 = vbsl_u8(minmask, min_low, min_high);
        uint8x8_t max2 = vbsl_u8(maxmask, max_low, max_high);
        uint16x8_t minidxmask = vmovl_u8(minmask);
        uint16x8_t maxidxmask = vmovl_u8(maxmask);
        minidxmask = vqshlq_n_u16(minidxmask, 15);
        maxidxmask = vqshlq_n_u16(maxidxmask, 15);
        uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh);
        uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh);
        vst1_u8((uint8_t*)fmin, min2);
        vst1_u8((uint8_t*)fmax, max2);
        vst1q_u16((uint16_t*)(fminIdx), n_minIdx);
        vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx);
    }*/
    vst1q_u8(fmin, n_min);
    vst1q_u8(fmax, n_max);
    vst1q_u16(fminIdx+0, n_minIdxl);
    vst1q_u16(fmaxIdx+0, n_maxIdxl);
    vst1q_u16(fminIdx+8, n_minIdxh);
    vst1q_u16(fmaxIdx+8, n_maxIdxh);
    // Horizontal reduction; smaller index wins on equal values so the first
    // occurrence is reported.
    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];
    for (s32 j = 1; j < 16; ++j)
    {
        u8 minval = fmin[j];
        u8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }
    // Scalar tail for the last len % 16 elements.
    for(; i < (s32)len; ++i )
    {
        u8 val = src[i];
        if( val < minVal )
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if( val > maxVal )
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}
// s8 counterpart of minMaxLocBlock: min/max with first-occurrence u16 indices
// over one contiguous block of at most USHORT_BLOCK_MAX_SIZE elements.
void minMaxLocBlock(const s8 * src, u32 len,
                    s8 &minVal, u16 &minIdx,
                    s8 &maxVal, u16 &maxIdx)
{
    // Only the first 8 entries are loaded below; the array is oversized.
    u16 tmp0123[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    // Seed with element 0 (index vectors start at 0 accordingly).
    int8x16_t n_min = vdupq_n_s8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    int8x16_t n_max = vdupq_n_s8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16 = vdupq_n_u16(16);
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);
    s32 i = 0;
    // NOTE(review): len - 15 wraps for len < 15; the s32 result is negative
    // on the supported targets so the vector loop is skipped — confirm.
    s32 bound = len - (16 - 1);
    for(; i < bound; i+=16 )
    {
        internal::prefetch(src + i);
        int8x16_t line = vld1q_s8(src + i);
        // Strict compares keep the earliest occurrence per lane.
        uint8x16_t minmask = vcltq_s8(line, n_min);
        uint8x16_t maxmask = vcgtq_s8(line, n_max);
        n_min = vbslq_s8(minmask, line, n_min);
        // Widen mask halves to 16 bits; the saturating shift by 15 turns
        // 0x00ff into an all-ones vbsl select mask.
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);
        n_max = vbslq_s8(maxmask, line, n_max);
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);
        // idx[] +=16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }
    // fix high part of indexes
    // (both halves shared lineIdxOffset, so lanes 8..15 are +8).
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);
    s8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];
    vst1q_s8(fmin, n_min);
    vst1q_s8(fmax, n_max);
    vst1q_u16(fminIdx+0, n_minIdxl);
    vst1q_u16(fmaxIdx+0, n_maxIdxl);
    vst1q_u16(fminIdx+8, n_minIdxh);
    vst1q_u16(fmaxIdx+8, n_maxIdxh);
    // Horizontal reduction; smaller index wins on equal values so the first
    // occurrence is reported.
    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];
    for (s32 j = 1; j < 16; ++j)
    {
        s8 minval = fmin[j];
        s8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }
    // Scalar tail for the last len % 16 elements.
    for(; i < (s32)len; ++i )
    {
        s8 val = src[i];
        if( val < minVal )
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if( val > maxVal )
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}
} // namespace
#endif // CAROTENE_NEON
#define USHORT_BLOCK_MAX_SIZE (1 << 16)
// u8 minMaxLoc: min/max values and their locations.  Wide rows (> 128) are
// split into blocks of USHORT_BLOCK_MAX_SIZE elements so minMaxLocBlock can
// track indices in u16; narrow rows use a plain scalar scan.
void minMaxLoc(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 &minVal, size_t &minCol, size_t &minRow,
               u8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0; l < size.height; ++l)
    {
        const u8 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width > 128)
        {
            for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                u8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);
                // Both u8 extremes reached: no later element can improve, so
                // stop scanning.
                // NOTE(review): this unconditionally takes this block's
                // locations even if an earlier row/block already held the
                // same extreme value — later-than-first occurrence may be
                // reported in that case; confirm callers accept this.
                if (locMinVal == 0 && locMaxVal == 255)
                {
                    minCol = blockStart + locMinIdx;
                    maxCol = blockStart + locMaxIdx;
                    minRow = l;
                    maxRow = l;
                    minVal = 0;
                    maxVal = 255;
                    return;
                }
                else
                {
                    // Strict compares keep the first occurrence across blocks.
                    if (locMinVal < minVal)
                    {
                        minCol = blockStart + locMinIdx;
                        minRow = l;
                        minVal = locMinVal;
                    }
                    if (locMaxVal > maxVal)
                    {
                        maxCol = blockStart + locMaxIdx;
                        maxRow = l;
                        maxVal = locMaxVal;
                    }
                }
            }
        }
        else
        {
            // Narrow rows: scalar scan, first occurrence kept by strict
            // comparisons.
            for(size_t i = 0; i < size.width; ++i )
            {
                u8 val = src[i];
                if( val < minVal )
                {
                    minVal = val;
                    minCol = i;
                    minRow = l;
                }
                else if( val > maxVal )
                {
                    maxVal = val;
                    maxCol = i;
                    maxRow = l;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// s8 minMaxLoc: same block strategy as the u8 version, with the signed
// saturation bounds -128/127 used for the early exit.
void minMaxLoc(const Size2D &size,
               const s8 * srcBase, ptrdiff_t srcStride,
               s8 &minVal, size_t &minCol, size_t &minRow,
               s8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the result with element (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0; l < size.height; ++l)
    {
        const s8 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width > 128)
        {
            for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                s8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);
                // Both s8 extremes reached: nothing later can improve.
                // NOTE(review): as in the u8 variant this may overwrite an
                // earlier first occurrence of the same extreme value.
                if (locMinVal == -128 && locMaxVal == 127)
                {
                    minCol = blockStart + locMinIdx;
                    maxCol = blockStart + locMaxIdx;
                    minRow = l;
                    maxRow = l;
                    minVal = -128;
                    maxVal = 127;
                    return;
                }
                else
                {
                    // Strict compares keep the first occurrence across blocks.
                    if (locMinVal < minVal)
                    {
                        minCol = blockStart + locMinIdx;
                        minRow = l;
                        minVal = locMinVal;
                    }
                    if (locMaxVal > maxVal)
                    {
                        maxCol = blockStart + locMaxIdx;
                        maxRow = l;
                        maxVal = locMaxVal;
                    }
                }
            }
        }
        else
        {
            // Narrow rows: scalar scan.
            for(size_t i = 0; i < size.width; ++i )
            {
                s8 val = src[i];
                if( val < minVal )
                {
                    minVal = val;
                    minRow = l;
                    minCol = i;
                }
                else if( val > maxVal )
                {
                    maxVal = val;
                    maxRow = l;
                    maxCol = i;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
} // namespace CAROTENE_NS
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <algorithm>
#include <limits>
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
// The vectorized 3x3 morphology kernel requires rows of at least 16 pixels
// and supports only constant and replicate border handling.
bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 16)
        return false;
    return border == BORDER_MODE_CONSTANT || border == BORDER_MODE_REPLICATE;
}
#ifdef CAROTENE_NEON
namespace {
// Pixel-wise minimum operator plugged into morph3x3 to implement erosion.
// Under a replicate border the neutral element for min (u8 maximum) is used
// so off-image samples can never win the comparison.
struct ErodeVecOp
{
    ErodeVecOp() : borderValue(0) {}
    ErodeVecOp(BORDER_MODE border, u8 borderValue_)
        : borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::max()
                                                      : borderValue_)
    {
    }

    // 16-lane, 8-lane and scalar flavors of the same min operation.
    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vminq_u8(a, b);
    }
    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmin_u8(a, b);
    }
    inline u8 operator()(u8 a, u8 b) const
    {
        return std::min(a, b);
    }

    u8 borderValue;
};
// Pixel-wise maximum operator plugged into morph3x3 to implement dilation.
// Under a replicate border the neutral element for max (u8 minimum) is used
// so off-image samples can never win the comparison.
struct DilateVecOp
{
    DilateVecOp() : borderValue(0) {}
    DilateVecOp(BORDER_MODE border, u8 borderValue_)
        : borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::min()
                                                      : borderValue_)
    {
    }

    // 16-lane, 8-lane and scalar flavors of the same max operation.
    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vmaxq_u8(a, b);
    }
    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmax_u8(a, b);
    }
    inline u8 operator()(u8 a, u8 b) const
    {
        return std::max(a, b);
    }

    u8 borderValue;
};
// Generic separable 3x3 morphology kernel: VecOp supplies the associative
// combine (min for erode, max for dilate) in 16-lane, 8-lane and scalar
// flavors plus the effective border value.  Each row first combines the
// three source rows vertically into 16-byte vectors, then combines each
// vector with its one-pixel-shifted neighbors horizontally; the first/last
// columns and the row tail are extrapolated per the border mode.
template <typename VecOp>
void morph3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, const VecOp & vop)
{
    u8 borderValue = vop.borderValue;
    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
    const uint8x16_t v_zero = vdupq_n_u8(0);
    const uint8x16_t v_border = vdupq_n_u8(borderValue);
    // Sliding window of vertically-combined vectors (prev/curr/next batch).
    uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // NULL row pointer marks an off-image row under a constant border;
        // replicate borders clamp to the first/last row instead.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
        u8 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // For the last two rows stop the vector loop 16 pixels early so the
        // full 16-byte loads cannot run past the image.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);
        // perform vertical convolution
        for ( ; x <= bwidth; x += 16)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);
            // Off-image rows contribute the border value.
            uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
            uint8x16_t x1 = vld1q_u8(srow1 + x);
            uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);
            // calculate values for plain CPU part below if needed
            if (x + 16 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = vop(srow1[x4],
                                vop(srow2 ? srow2[x4] : borderValue,
                                    srow0 ? srow0[x4] : borderValue));
                currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
            }
            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }
            // and calculate next value
            tnext = vop(vop(x0, x1), x2);
            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));
                continue;
            }
            // combine 3 "shifted" vectors
            // (left/right neighbors obtained by re-aligning adjacent batches;
            // the store lags one iteration behind the loads).
            t0 = vextq_u8(tprev, tcurr, 15);
            t1 = tcurr;
            t2 = vextq_u8(tcurr, tnext, 1);
            // and add them
            t0 = vop(t0, vop(t1, t2));
            vst1q_u8(drow + x - 16, t0);
        }
        // Scalar epilogue: redo from the last fully stored column.
        x -= 16;
        if (x == width)
            --x;
        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
            }
            else
                nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
                                srow0 ? srow0[x + 1] : borderValue),
                            srow1[x + 1]);
            drow[x] = vop(prevx, vop(currx, nextx));
            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
}
} // namespace
#endif
// 3x3 erosion: sliding 3x3 minimum, delegated to the generic morph3x3
// kernel with a min operator.  Asserts that the configuration is supported
// (see isMorph3x3Supported).
void erode3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
    morph3x3(size, srcBase, srcStride,
             dstBase, dstStride,
             border, ErodeVecOp(border, borderValue));
#else
    (void)size; (void)srcBase; (void)srcStride;
    (void)dstBase; (void)dstStride;
    (void)border; (void)borderValue;
#endif
}
/**
 * Public entry point for 3x3 dilation of a single-channel u8 image.
 *
 * Validates that the (size, border) combination is handled by the
 * specialized 3x3 kernel, then dispatches to the shared morph3x3
 * engine with a DilateVecOp (maximum-selecting operator).
 *
 * @param size        image dimensions
 * @param srcBase     source image base pointer
 * @param srcStride   source row stride in bytes
 * @param dstBase     destination image base pointer
 * @param dstStride   destination row stride in bytes
 * @param border      extrapolation mode for out-of-image pixels
 * @param borderValue pixel value used when border == BORDER_MODE_CONSTANT
 */
void dilate3x3(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride,
               BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
    morph3x3(size,
             srcBase, srcStride,
             dstBase, dstStride,
             border, DilateVecOp(border, borderValue));
#else
    // Non-NEON build: the configuration assert above already rejects the
    // call; silence unused-parameter warnings.
    (void)size;     (void)srcBase;  (void)srcStride;
    (void)dstBase;  (void)dstStride;
    (void)border;   (void)borderValue;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Horizontal (row) pass of the separable morphology filter.
//
// For every output element dst[i] it reduces the ksize horizontally
// adjacent same-channel elements src[i], src[i+cn], ..., src[i+(ksize-1)*cn]
// with VecUpdate (min for erode, max for dilate). The caller supplies a
// source row that is already padded on both sides, so this routine may read
// up to (ksize-1)*cn elements past `width` pixels without bounds checks.
//
// src   : padded interleaved source row
// dst   : output row, width*cn elements
// width : row width in pixels (multiplied by cn below)
// cn    : number of interleaved channels
// ksize : horizontal kernel size in pixels
template<class VecUpdate>
void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
{
    size_t i, j, k;
    // Vector-loop bounds: width rounded down to 16 / 8 pixels, expressed in
    // interleaved u8 elements (computed before width is scaled below).
    size_t width16 = (width & -16) * cn;
    size_t width8 = (width & -8) * cn;
    width *= cn; // from here on, width counts u8 elements, not pixels
    if (ksize == 1)
    {
        // 1x1 kernel degenerates to a copy.
        for (i = 0; i < width; i++)
            dst[i] = src[i];
        return;
    }
    ksize = ksize*cn; // kernel extent in elements; inner loops step by cn
    VecUpdate updateOp;
    switch(cn)
    {
    case 1:
        // Single channel: neighbors are 1 element apart, so k steps by 1.
        for (i = 0; i < width16; i += 16)
        {
            const u8* sptr = src + i;
            uint8x16_t s = vld1q_u8(sptr);
            internal::prefetch(sptr);
            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1q_u8(sptr + k));
            vst1q_u8(dst + i, s);
        }
        // 8-lane cleanup for the remaining 8..15 pixels.
        for (; i < width8; i += 8)
        {
            const u8* sptr = src + i;
            uint8x8_t s = vld1_u8(sptr);
            internal::prefetch(sptr);
            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1_u8(sptr + k));
            vst1_u8(dst + i, s);
        }
        break;
    default:
        // Multi-channel: same-channel neighbors are cn elements apart.
        for (i = 0; i < width16; i += 16)
        {
            uint8x16_t s = vld1q_u8(src + i);
            internal::prefetch(src + i);
            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1q_u8(src + i + k));
            vst1q_u8(dst + i, s);
        }
        for (; i < width8; i += 8)
        {
            uint8x8_t s = vld1_u8(src + i);
            internal::prefetch(src + i);
            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1_u8(src + i + k));
            vst1_u8(dst + i, s);
        }
        break;
    }
    // Scalar tail starting at the first element the vector loops did not
    // produce; processed independently per channel.
    ptrdiff_t i0 = i;
    for( k = 0; k < (size_t)cn; k++, src++, dst++ )
    {
        // Two pixels per iteration: their windows [0..ksize-cn] and
        // [cn..ksize] share the interior reduction m over s[cn..ksize-cn].
        for( i = i0; i <= width - cn*2; i += cn*2 )
        {
            const u8* s = src + i;
            u8 m = s[cn];
            for( j = cn*2; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = updateOp(m, s[0]);
            // After the loop j == ksize, so s[j] is the last element of the
            // second pixel's window.
            dst[i+cn] = updateOp(m, s[j]);
        }
        // Last (possibly single) pixel: plain reduction of its full window.
        for( ; i < width; i += cn )
        {
            const u8* s = src + i;
            u8 m = s[0];
            for( j = cn; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = m;
        }
    }
}
// Vertical (column) pass of the separable morphology filter.
//
// src points to an array of row pointers (rows already processed by
// MorphRow); for each output row it reduces ksize consecutive source rows
// element-wise with VecUpdate. Output rows are produced two at a time where
// possible: rows dy and dy+1 share the reduction of the ksize-1 interior
// rows src[1..ksize-1], and differ only by src[0] vs src[ksize].
//
// src     : array of at least count + ksize - 1 row pointers
// dst     : destination base, advanced by dststep per output row
// dststep : destination row stride in bytes
// count   : number of output rows to produce
// width   : row length in u8 elements
// ksize   : vertical kernel size
template<class VecUpdate>
void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
{
    size_t i, k;
    size_t width32 = width & -32; // vector loop handles 32 elements per step
    VecUpdate updateOp;
    uint8x16_t x0,x1,s0,s1;
    if (ksize == 3)
    {
        // Fully unrolled 3-row kernel (the common 3x3 case).
        for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
        {
            for (i = 0; i < width32; i += 32)
            {
                // Shared middle rows: s = op(src[1], src[2]).
                const u8* sptr = src[1] + i;
                s0 = vld1q_u8(sptr);
                s1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                sptr = src[2] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);
                // First output row combines the shared result with src[0].
                sptr = src[0] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst+i, updateOp(s0, x0));
                vst1q_u8(dst+i+16, updateOp(s1, x1));
                // Second output row combines the shared result with src[3].
                sptr = src[3] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
            }
            // Scalar tail; after the inner loop k == ksize, so src[k] is the
            // extra row belonging to the second output row.
            for(; i < width; i++ )
            {
                u8 s = src[1][i];
                for( k = 2; k < ksize; k++ )
                    s = updateOp(s, src[k][i]);
                dst[i] = updateOp(s, src[0][i]);
                dst[i+dststep] = updateOp(s, src[k][i]);
            }
        }
    }
    else if (ksize > 1)
        // General kernel height, same two-rows-at-a-time scheme.
        for (; count > 1; count -= 2, dst += dststep*2, src += 2)
        {
            for (i = 0; i < width32; i += 32)
            {
                // Reduce the shared interior rows src[1..ksize-1].
                const u8* sptr = src[1] + i;
                s0 = vld1q_u8(sptr);
                s1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                for (k = 2; k < ksize; k++)
                {
                    sptr = src[k] + i;
                    x0 = vld1q_u8(sptr);
                    x1 = vld1q_u8(sptr + 16);
                    internal::prefetch(sptr);
                    s0 = updateOp(s0, x0);
                    s1 = updateOp(s1, x1);
                }
                // Row dy: shared result + src[0].
                sptr = src[0] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst+i, updateOp(s0, x0));
                vst1q_u8(dst+i+16, updateOp(s1, x1));
                // Row dy+1: shared result + src[ksize] (k == ksize here).
                sptr = src[k] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
            }
            for(; i < width; i++ )
            {
                u8 s = src[1][i];
                for( k = 2; k < ksize; k++ )
                    s = updateOp(s, src[k][i]);
                dst[i] = updateOp(s, src[0][i]);
                dst[i+dststep] = updateOp(s, src[k][i]);
            }
        }
    // Remaining single row (odd count), and the whole job when ksize == 1.
    for (; count > 0; count--, dst += dststep, src++)
    {
        for (i = 0; i < width32; i += 32)
        {
            const u8* sptr = src[0] + i;
            s0 = vld1q_u8(sptr);
            s1 = vld1q_u8(sptr + 16);
            internal::prefetch(sptr);
            for (k = 1; k < ksize; k++)
            {
                sptr = src[k] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);
            }
            vst1q_u8(dst + i, s0);
            vst1q_u8(dst + i + 16, s1);
        }
        for(; i < width; i++ )
        {
            u8 s = src[0][i];
            for( k = 1; k < ksize; k++ )
                s = updateOp(s, src[k][i]);
            dst[i] = s;
        }
    }
}
// Separable morphology engine for arbitrary kernel sizes and anchors.
//
// Runs MorphRow<Op> over each needed source row into a ring buffer of
// pre-filtered rows, then MorphColumn<Op> over windows of those rows to
// produce the output. Horizontal border extrapolation is resolved while
// copying each source row into `srcRow`; vertical extrapolation is resolved
// by internal::borderInterpolate when gathering row pointers.
//
// ssize            : processed ROI size
// cn               : channel count
// ksize            : kernel size; anchorX/anchorY locate the anchor in it
// rowBorderType /
// columnBorderType : horizontal / vertical extrapolation modes
// borderValues     : per-channel constant border (BORDER_MODE_CONSTANT only)
// borderMargin     : real image pixels available around the ROI, used before
//                    falling back to extrapolation
template <class Op>
inline void morphology(const Size2D &ssize, u32 cn,
                       const u8 * srcBase, ptrdiff_t srcStride,
                       u8 * dstBase, ptrdiff_t dstStride,
                       const Size2D &ksize,
                       size_t anchorX, size_t anchorY,
                       BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
                       const u8 * borderValues, Margin borderMargin)
{
    //Temporary buffers common for all iterations

    // One source row including left/right padding for the row filter.
    std::vector<u8> _srcRow(cn*(ssize.width + ksize.width - 1));
    u8* srcRow = &_srcRow[0];
    // Ring-buffer depth: enough rows for the column filter window plus slack.
    size_t bufRows = std::max<size_t>(ksize.height + 3, std::max<size_t>(anchorY, ksize.height-anchorY-1)*2+1);
    std::vector<u8*> _rows(bufRows);
    u8** rows = &_rows[0];
    // adjust swidthcn so that the used part of buffers stays compact in memory
    ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size)
    std::vector<u8> _ringBuf(swidthcn*bufRows+16);
    u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);
    // Index table mapping extrapolated left/right pixels to real positions.
    size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
    std::vector<ptrdiff_t> _borderTab(borderLength);
    ptrdiff_t * borderTab = &_borderTab[0];
    std::vector<u8> _constBorderValue;
    std::vector<u8> _constBorderRow;
    u8 * constBorderValue = NULL;
    u8 * constBorderRow = NULL;
    if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT )
    {
        // Replicate the per-channel border values across borderLength elements.
        _constBorderValue.resize(borderLength);
        constBorderValue = &_constBorderValue[0];
        size_t i;
        for(i = 0; i < cn; i++)
            constBorderValue[i] = borderValues[i];
        for(; i < borderLength; i++)
            constBorderValue[i] = constBorderValue[i-cn];
        if( columnBorderType == BORDER_MODE_CONSTANT )
        {
            // Pre-filter one all-border row; it stands in for any source row
            // that lies entirely outside the image vertically.
            _constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16));
            constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
            size_t N = (ssize.width + ksize.width - 1)*cn;
            for( i = 0; i < N; i += borderLength )
            {
                size_t n = std::min( borderLength, N - i );
                for(size_t j = 0; j < n; j++)
                    srcRow[i+j] = constBorderValue[j];
            }
            MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
        }
    }
    // Full image extent including the margin of real pixels around the ROI.
    Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
                     ssize.height + borderMargin.top + borderMargin.bottom);
    // dx1/dx2: how many left/right pixels must be synthesized after the
    // available margin has been used up.
    ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
    ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);
    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_MODE_CONSTANT )
        {
            // Constant mode: padding bytes never change, fill them once.
            memcpy( srcRow, &constBorderValue[0], dx1*cn );
            memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn );
        }
        else
        {
            // Other modes: build per-element source offsets via
            // borderInterpolate, shifted so they index the adjusted src.
            ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;
            ptrdiff_t wholeWidth = wholeSize.width;
            ptrdiff_t i, j;
            for( i = 0; i < dx1; i++ )
            {
                ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn;
                for( j = 0; j < (ptrdiff_t)cn; j++ )
                    borderTab[i*cn + j] = p0 + j;
            }
            for( i = 0; i < dx2; i++ )
            {
                ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn;
                for( j = 0; j < (ptrdiff_t)cn; j++ )
                    borderTab[(i + dx1)*cn + j] = p0 + j;
            }
        }
    }
    // Vertical range of source rows (in whole-image coordinates) that must
    // pass through the row filter.
    ptrdiff_t startY, startY0, endY, rowCount;
    startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
    endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);
    const u8* src = srcBase + (startY - borderMargin.top)*srcStride;
    u8* dst = dstBase;
    ptrdiff_t width = ssize.width, kwidth = ksize.width;
    ptrdiff_t kheight = ksize.height, ay = anchorY;
    ptrdiff_t width1 = ssize.width + kwidth - 1;
    ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
    bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
    ptrdiff_t dy = 0, i = 0;
    // Shift src left so the margin pixels usable on the left are included.
    src -= xofs1*cn;
    ptrdiff_t count = endY - startY;
    rowCount = 0;
    // Main loop: alternately (1) row-filter a batch of source rows into the
    // ring buffer, (2) column-filter every output row whose full vertical
    // window is now buffered.
    for(;; dst += dstStride*i, dy += i)
    {
        // Heuristic batch size for this iteration, clamped to what is left.
        // NOTE(review): first pass fills the buffer up to the anchor; later
        // passes refill bufRows - kheight + 1 rows — confirm against upstream.
        ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcStride )
        {
            // Ring-buffer slot for this source row.
            ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
            u8* brow = ringBuf + bi*swidthcn;
            if( (size_t)(++rowCount) > bufRows )
            {
                // Buffer full: the oldest buffered row is overwritten.
                --rowCount;
                ++startY;
            }
            // Copy the real pixels, then synthesize the padding.
            memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn );
            if( makeBorder )
            {
                for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ )
                    srcRow[i] = src[borderTab[i]];
                for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ )
                    srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]];
            }
            MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
        }
        // Gather row pointers for as many consecutive output rows as the
        // buffered data allows.
        ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
                                                         wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                rows[i] = constBorderRow;
            else
            {
                if( srcY >= startY + rowCount )
                    break; // row not filtered yet; wait for the next batch
                ptrdiff_t bi = (srcY - startY0) % bufRows;
                rows[i] = ringBuf + bi*swidthcn;
            }
        }
        if( i < kheight )
            break; // not even one complete vertical window remains: done
        i -= kheight - 1; // number of complete output rows available
        MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height);
    }
}
} // namespace
#endif // CAROTENE_NEON
/**
 * Public entry point for erosion with an arbitrary rectangular kernel.
 *
 * Checks the configuration (non-empty ROI, anchor inside the kernel) and
 * forwards to the generic separable morphology engine instantiated with
 * ErodeVecOp (minimum-selecting operator).
 *
 * @param ssize            processed ROI size
 * @param cn               channel count
 * @param srcBase/srcStride source image base pointer and row stride
 * @param dstBase/dstStride destination image base pointer and row stride
 * @param ksize            kernel size
 * @param anchorX/anchorY  kernel anchor position
 * @param rowBorderType    horizontal border extrapolation mode
 * @param columnBorderType vertical border extrapolation mode
 * @param borderValues     per-channel constant border values
 * @param borderMargin     real pixels available around the ROI
 */
void erode(const Size2D &ssize, u32 cn,
           const u8 * srcBase, ptrdiff_t srcStride,
           u8 * dstBase, ptrdiff_t dstStride,
           const Size2D &ksize,
           size_t anchorX, size_t anchorY,
           BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
           const u8 * borderValues, Margin borderMargin)
{
    bool configOk = ssize.width > 0 && ssize.height > 0 &&
                    anchorX < ksize.width && anchorY < ksize.height;
    internal::assertSupportedConfiguration(configOk);
#ifdef CAROTENE_NEON
    morphology<ErodeVecOp>(ssize, cn,
                           srcBase, srcStride,
                           dstBase, dstStride,
                           ksize, anchorX, anchorY,
                           rowBorderType, columnBorderType,
                           borderValues, borderMargin);
#else
    // Non-NEON build: nothing to do beyond the assert; silence warnings.
    (void)cn;
    (void)srcBase;  (void)srcStride;
    (void)dstBase;  (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
/**
 * Public entry point for dilation with an arbitrary rectangular kernel.
 *
 * Checks the configuration (non-empty ROI, anchor inside the kernel) and
 * forwards to the generic separable morphology engine instantiated with
 * DilateVecOp (maximum-selecting operator).
 *
 * @param ssize            processed ROI size
 * @param cn               channel count
 * @param srcBase/srcStride source image base pointer and row stride
 * @param dstBase/dstStride destination image base pointer and row stride
 * @param ksize            kernel size
 * @param anchorX/anchorY  kernel anchor position
 * @param rowBorderType    horizontal border extrapolation mode
 * @param columnBorderType vertical border extrapolation mode
 * @param borderValues     per-channel constant border values
 * @param borderMargin     real pixels available around the ROI
 */
void dilate(const Size2D &ssize, u32 cn,
            const u8 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            const Size2D &ksize,
            size_t anchorX, size_t anchorY,
            BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
            const u8 * borderValues, Margin borderMargin)
{
    bool configOk = ssize.width > 0 && ssize.height > 0 &&
                    anchorX < ksize.width && anchorY < ksize.height;
    internal::assertSupportedConfiguration(configOk);
#ifdef CAROTENE_NEON
    morphology<DilateVecOp>(ssize, cn,
                            srcBase, srcStride,
                            dstBase, dstStride,
                            ksize, anchorX, anchorY,
                            rowBorderType, columnBorderType,
                            borderValues, borderMargin);
#else
    // Non-NEON build: nothing to do beyond the assert; silence warnings.
    (void)cn;
    (void)srcBase;  (void)srcStride;
    (void)dstBase;  (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
} // namespace CAROTENE_NS
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment