OpenDAS / OpenCV-python · Commits

Commit fbd3199c, authored Apr 18, 2025 by fengzch-das

Initial commit
Changes (103 files). Showing 20 changed files with 10390 additions and 0 deletions (+10390 / -0).
opencv/3rdparty/carotene/src/convert_scale.cpp    +2498 / -0
opencv/3rdparty/carotene/src/convolution.cpp      +340 / -0
opencv/3rdparty/carotene/src/count_nonzero.cpp    +430 / -0
opencv/3rdparty/carotene/src/div.cpp              +694 / -0
opencv/3rdparty/carotene/src/dot_product.cpp      +260 / -0
opencv/3rdparty/carotene/src/dummy.cpp            +2 / -0
opencv/3rdparty/carotene/src/fast.cpp             +428 / -0
opencv/3rdparty/carotene/src/fill_minmaxloc.cpp   +442 / -0
opencv/3rdparty/carotene/src/flip.cpp             +222 / -0
opencv/3rdparty/carotene/src/gaussian_blur.cpp    +1059 / -0
opencv/3rdparty/carotene/src/in_range.cpp         +195 / -0
opencv/3rdparty/carotene/src/integral.cpp         +238 / -0
opencv/3rdparty/carotene/src/intrinsics.hpp       +112 / -0
opencv/3rdparty/carotene/src/laplacian.cpp        +713 / -0
opencv/3rdparty/carotene/src/magnitude.cpp        +160 / -0
opencv/3rdparty/carotene/src/meanstddev.cpp       +163 / -0
opencv/3rdparty/carotene/src/median_filter.cpp    +227 / -0
opencv/3rdparty/carotene/src/min_max.cpp          +139 / -0
opencv/3rdparty/carotene/src/minmaxloc.cpp        +1340 / -0
opencv/3rdparty/carotene/src/morph.cpp            +728 / -0
opencv/3rdparty/carotene/src/convert_scale.cpp (new file, 0 → 100644) @ fbd3199c
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
void convertScale(const Size2D &_size, \
const T1 * srcBase, ptrdiff_t srcStride, \
T2 * dstBase, ptrdiff_t dstStride, \
f64 alpha, f64 beta) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (srcStride == dstStride && \
srcStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
const ptrdiff_t sstep = srcStride / sizeof(T1); \
const ptrdiff_t dstep = dstStride / sizeof(T2); \
const size_t w = size.width & ~(SIMD_SIZE-1); \
if (size.width >= SIMD_SIZE) \
{ \
const T1* _src = srcBase; \
T2* _dst = dstBase; \
CVTINIT \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
CVTROW \
} \
if(w < size.width) \
{ \
const T1* _src = srcBase; \
T2* _dst = dstBase; \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
for(size_t i = w; i < size.width; i++ ) \
_dst[i] = internal::saturate_cast<T2>(_src[i]*alpha + beta); \
} \
}
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \
void convertScale(const Size2D &_size, \
const T1 * srcBase, ptrdiff_t srcStride, \
T1 * dstBase, ptrdiff_t dstStride, \
f64 alpha, f64 beta) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (srcStride == dstStride && \
srcStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
const ptrdiff_t sstep = srcStride / sizeof(T1); \
const ptrdiff_t dstep = dstStride / sizeof(T1); \
const size_t w = size.width & ~(SIMD_SIZE-1); \
if (size.width >= SIMD_SIZE) \
{ \
const T1* _src = srcBase; \
T1* _dst = dstBase; \
CVTSINIT \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
CVTSROW \
} \
if(w < size.width) \
{ \
const T1* _src = srcBase; \
T1* _dst = dstBase; \
for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
for(size_t i = w; i < size.width; i++ ) \
_dst[i] = internal::saturate_cast<T1>(_src[i]*alpha + beta); \
} \
}
#else
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
void convertScale(const Size2D &, \
const T1 *, ptrdiff_t, \
T2 *, ptrdiff_t, \
f64, f64) \
{ \
internal::assertSupportedConfiguration(); \
}
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \
void convertScale(const Size2D &, \
const T1 *, ptrdiff_t, \
T1 *, ptrdiff_t, \
f64, f64) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
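/*
 * For orientation: every CVTS_FUNC / CVTS_FUNC1 expansion below computes, per
 * element, dst[i] = saturate_cast<T2>(src[i] * alpha + beta), exactly as the
 * scalar tail loop inside the macros does. A minimal scalar sketch of the same
 * semantics (illustrative only; the name convertScaleRef is hypothetical and
 * not part of this file):
 *
 *     template <typename T1, typename T2>
 *     void convertScaleRef(size_t n, const T1 *src, T2 *dst, f64 alpha, f64 beta)
 *     {
 *         for (size_t i = 0; i < n; ++i)
 *             dst[i] = internal::saturate_cast<T2>(src[i] * alpha + beta);
 *     }
 *
 * The SIMD bodies below only accelerate the first w = width & ~(SIMD_SIZE-1)
 * elements of each row; the remainder always goes through the scalar loop.
 */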
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC1(u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.u8 q3, d4              \n\t"
            "vmovl.u8 q4, d5              \n\t"
            "vmovl.u16 q5, d6             \n\t"
            "vmovl.u16 q6, d7             \n\t"
            "vmovl.u16 q7, d8             \n\t"
            "vmovl.u16 q8, d9             \n\t"
            "vcvt.f32.u32 q9, q5          \n\t"
            "vcvt.f32.u32 q10, q6         \n\t"
            "vcvt.f32.u32 q11, q7         \n\t"
            "vcvt.f32.u32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vqmovun.s32 d22, q7          \n\t"
            "vqmovun.s32 d23, q8          \n\t"
            "vqmovun.s32 d24, q9          \n\t"
            "vqmovun.s32 d25, q10         \n\t"
            "vqmovn.u16 d26, q11          \n\t"
            "vqmovn.u16 d27, q12          \n\t"
            "vst1.8 {d26-d27}, [%[dst1]]  \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC1(u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
    }
})
#endif
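/*
 * Both variants above implement the same pipeline: widen u8 -> u16 -> u32,
 * convert to f32, multiply by vscale, add vshift, then convert back to s32
 * and saturate-narrow to u8. The inline-asm variant exists for older GCC on
 * 32-bit ARM, where vscale/vshift are pinned to q0/q1; the intrinsics variant
 * lets the compiler allocate registers. Folding +0.5f into vshift makes the
 * truncating vcvt round non-negative results to nearest instead of toward
 * zero, e.g. with alpha = 1.25, beta = 0 and src = 3: 3 * 1.25f = 3.75f, and
 * 3.75f + 0.5f = 4.25f truncates to the correctly rounded 4 rather than 3.
 */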
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(u8, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.u8 q3, d4              \n\t"
            "vmovl.u8 q4, d5              \n\t"
            "vmovl.u16 q5, d6             \n\t"
            "vmovl.u16 q6, d7             \n\t"
            "vmovl.u16 q7, d8             \n\t"
            "vmovl.u16 q8, d9             \n\t"
            "vcvt.f32.u32 q9, q5          \n\t"
            "vcvt.f32.u32 q10, q6         \n\t"
            "vcvt.f32.u32 q11, q7         \n\t"
            "vcvt.f32.u32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vqmovn.s32 d22, q7           \n\t"
            "vqmovn.s32 d23, q8           \n\t"
            "vqmovn.s32 d24, q9           \n\t"
            "vqmovn.s32 d25, q10          \n\t"
            "vqmovn.s16 d26, q11          \n\t"
            "vqmovn.s16 d27, q12          \n\t"
            "vst1.8 {d26-d27}, [%[dst1]]  \n\t"
            : //no output
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(u8, u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.u8 q3, d4              \n\t"
            "vmovl.u8 q4, d5              \n\t"
            "vmovl.u16 q5, d6             \n\t"
            "vmovl.u16 q6, d7             \n\t"
            "vmovl.u16 q7, d8             \n\t"
            "vmovl.u16 q8, d9             \n\t"
            "vcvt.f32.u32 q9, q5          \n\t"
            "vcvt.f32.u32 q10, q6         \n\t"
            "vcvt.f32.u32 q11, q7         \n\t"
            "vcvt.f32.u32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vqmovun.s32 d22, q7          \n\t"
            "vqmovun.s32 d23, q8          \n\t"
            "vqmovun.s32 d24, q9          \n\t"
            "vqmovun.s32 d25, q10         \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
        vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(u8, s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.u8 q3, d4              \n\t"
            "vmovl.u8 q4, d5              \n\t"
            "vmovl.u16 q5, d6             \n\t"
            "vmovl.u16 q6, d7             \n\t"
            "vmovl.u16 q7, d8             \n\t"
            "vmovl.u16 q8, d9             \n\t"
            "vcvt.f32.u32 q9, q5          \n\t"
            "vcvt.f32.u32 q10, q6         \n\t"
            "vcvt.f32.u32 q11, q7         \n\t"
            "vcvt.f32.u32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vqmovn.s32 d22, q7           \n\t"
            "vqmovn.s32 d23, q8           \n\t"
            "vqmovn.s32 d24, q9           \n\t"
            "vqmovn.s32 d25, q10          \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : //no output
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
        vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u8, s32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.u8 q3, d4              \n\t"
            "vmovl.u8 q4, d5              \n\t"
            "vmovl.u16 q5, d6             \n\t"
            "vmovl.u16 q6, d7             \n\t"
            "vmovl.u16 q7, d8             \n\t"
            "vmovl.u16 q8, d9             \n\t"
            "vcvt.f32.u32 q9, q5          \n\t"
            "vcvt.f32.u32 q10, q6         \n\t"
            "vcvt.f32.u32 q11, q7         \n\t"
            "vcvt.f32.u32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vst1.32 {d14-d15}, [%[dst1]] \n\t"
            "vst1.32 {d16-d17}, [%[dst2]] \n\t"
            "vst1.32 {d18-d19}, [%[dst3]] \n\t"
            "vst1.32 {d20-d21}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s32(_dst + i + 0,  vline1_s32);
        vst1q_s32(_dst + i + 4,  vline2_s32);
        vst1q_s32(_dst + i + 8,  vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u8, f32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.u8 q3, d4              \n\t"
            "vmovl.u8 q4, d5              \n\t"
            "vmovl.u16 q5, d6             \n\t"
            "vmovl.u16 q6, d7             \n\t"
            "vmovl.u16 q7, d8             \n\t"
            "vmovl.u16 q8, d9             \n\t"
            "vcvt.f32.u32 q9, q5          \n\t"
            "vcvt.f32.u32 q10, q6         \n\t"
            "vcvt.f32.u32 q11, q7         \n\t"
            "vcvt.f32.u32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vst1.32 {d6-d7}, [%[dst1]]   \n\t"
            "vst1.32 {d8-d9}, [%[dst2]]   \n\t"
            "vst1.32 {d10-d11}, [%[dst3]] \n\t"
            "vst1.32 {d12-d13}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vst1q_f32(_dst + i + 0,  vline1_f32);
        vst1q_f32(_dst + i + 4,  vline2_f32);
        vst1q_f32(_dst + i + 8,  vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif
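/*
 * Note that the f32-destination specializations (above and below) initialize
 * vshift without the +0.5f bias: the result stays in floating point, so no
 * truncating float-to-int conversion happens and no rounding correction is
 * needed.
 */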
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.s8 q3, d4              \n\t"
            "vmovl.s8 q4, d5              \n\t"
            "vmovl.s16 q5, d6             \n\t"
            "vmovl.s16 q6, d7             \n\t"
            "vmovl.s16 q7, d8             \n\t"
            "vmovl.s16 q8, d9             \n\t"
            "vcvt.f32.s32 q9, q5          \n\t"
            "vcvt.f32.s32 q10, q6         \n\t"
            "vcvt.f32.s32 q11, q7         \n\t"
            "vcvt.f32.s32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vqmovun.s32 d22, q7          \n\t"
            "vqmovun.s32 d23, q8          \n\t"
            "vqmovun.s32 d24, q9          \n\t"
            "vqmovun.s32 d25, q10         \n\t"
            "vqmovn.u16 d26, q11          \n\t"
            "vqmovn.u16 d27, q12          \n\t"
            "vst1.8 {d26-d27}, [%[dst1]]  \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC1(s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.s8 q3, d4              \n\t"
            "vmovl.s8 q4, d5              \n\t"
            "vmovl.s16 q5, d6             \n\t"
            "vmovl.s16 q6, d7             \n\t"
            "vmovl.s16 q7, d8             \n\t"
            "vmovl.s16 q8, d9             \n\t"
            "vcvt.f32.s32 q9, q5          \n\t"
            "vcvt.f32.s32 q10, q6         \n\t"
            "vcvt.f32.s32 q11, q7         \n\t"
            "vcvt.f32.s32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vqmovn.s32 d22, q7           \n\t"
            "vqmovn.s32 d23, q8           \n\t"
            "vqmovn.s32 d24, q9           \n\t"
            "vqmovn.s32 d25, q10          \n\t"
            "vqmovn.s16 d26, q11          \n\t"
            "vqmovn.s16 d27, q12          \n\t"
            "vst1.8 {d26-d27}, [%[dst1]]  \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC1(s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.s8 q3, d4              \n\t"
            "vmovl.s8 q4, d5              \n\t"
            "vmovl.s16 q5, d6             \n\t"
            "vmovl.s16 q6, d7             \n\t"
            "vmovl.s16 q7, d8             \n\t"
            "vmovl.s16 q8, d9             \n\t"
            "vcvt.f32.s32 q9, q5          \n\t"
            "vcvt.f32.s32 q10, q6         \n\t"
            "vcvt.f32.s32 q11, q7         \n\t"
            "vcvt.f32.s32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vqmovun.s32 d22, q7          \n\t"
            "vqmovun.s32 d23, q8          \n\t"
            "vqmovun.s32 d24, q9          \n\t"
            "vqmovun.s32 d25, q10         \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u16(_dst + i + 0, vRes1_u16);
        vst1q_u16(_dst + i + 8, vRes2_u16);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.s8 q3, d4              \n\t"
            "vmovl.s8 q4, d5              \n\t"
            "vmovl.s16 q5, d6             \n\t"
            "vmovl.s16 q6, d7             \n\t"
            "vmovl.s16 q7, d8             \n\t"
            "vmovl.s16 q8, d9             \n\t"
            "vcvt.f32.s32 q9, q5          \n\t"
            "vcvt.f32.s32 q10, q6         \n\t"
            "vcvt.f32.s32 q11, q7         \n\t"
            "vcvt.f32.s32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vqmovn.s32 d22, q7           \n\t"
            "vqmovn.s32 d23, q8           \n\t"
            "vqmovn.s32 d24, q9           \n\t"
            "vqmovn.s32 d25, q10          \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s16(_dst + i + 0, vRes1_s16);
        vst1q_s16(_dst + i + 8, vRes2_s16);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s8, s32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.s8 q3, d4              \n\t"
            "vmovl.s8 q4, d5              \n\t"
            "vmovl.s16 q5, d6             \n\t"
            "vmovl.s16 q6, d7             \n\t"
            "vmovl.s16 q7, d8             \n\t"
            "vmovl.s16 q8, d9             \n\t"
            "vcvt.f32.s32 q9, q5          \n\t"
            "vcvt.f32.s32 q10, q6         \n\t"
            "vcvt.f32.s32 q11, q7         \n\t"
            "vcvt.f32.s32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vcvt.s32.f32 q7, q3          \n\t"
            "vcvt.s32.f32 q8, q4          \n\t"
            "vcvt.s32.f32 q9, q5          \n\t"
            "vcvt.s32.f32 q10, q6         \n\t"
            "vst1.32 {d14-d15}, [%[dst1]] \n\t"
            "vst1.32 {d16-d17}, [%[dst2]] \n\t"
            "vst1.32 {d18-d19}, [%[dst3]] \n\t"
            "vst1.32 {d20-d21}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s32(_dst + i + 0,  vline1_s32);
        vst1q_s32(_dst + i + 4,  vline2_s32);
        vst1q_s32(_dst + i + 8,  vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s8, f32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]     \n\t"
            "vmovl.s8 q3, d4              \n\t"
            "vmovl.s8 q4, d5              \n\t"
            "vmovl.s16 q5, d6             \n\t"
            "vmovl.s16 q6, d7             \n\t"
            "vmovl.s16 q7, d8             \n\t"
            "vmovl.s16 q8, d9             \n\t"
            "vcvt.f32.s32 q9, q5          \n\t"
            "vcvt.f32.s32 q10, q6         \n\t"
            "vcvt.f32.s32 q11, q7         \n\t"
            "vcvt.f32.s32 q12, q8         \n\t"
            "vmul.f32 q13, q9, q0         \n\t"
            "vmul.f32 q14, q10, q0        \n\t"
            "vmul.f32 q15, q11, q0        \n\t"
            "vmul.f32 q2, q12, q0         \n\t"
            "vadd.f32 q3, q13, q1         \n\t"
            "vadd.f32 q4, q14, q1         \n\t"
            "vadd.f32 q5, q15, q1         \n\t"
            "vadd.f32 q6, q2, q1          \n\t"
            "vst1.32 {d6-d7}, [%[dst1]]   \n\t"
            "vst1.32 {d8-d9}, [%[dst2]]   \n\t"
            "vst1.32 {d10-d11}, [%[dst3]] \n\t"
            "vst1.32 {d12-d13}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vst1q_f32(_dst + i + 0,  vline1_f32);
        vst1q_f32(_dst + i + 4,  vline2_f32);
        vst1q_f32(_dst + i + 8,  vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]]    \n\t"
            "vmovl.u16 q3, d4             \n\t"
            "vmovl.u16 q4, d5             \n\t"
            "vcvt.f32.u32 q5, q3          \n\t"
            "vcvt.f32.u32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vcvt.s32.f32 q12, q10        \n\t"
            "vqmovn.s32 d26, q11          \n\t"
            "vqmovn.s32 d27, q12          \n\t"
            "vqmovun.s16 d28, q13         \n\t"
            "vst1.8 {d28}, [%[dst]]       \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(u16, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
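/*
 * From here on the source elements are 16-bit or wider, so each iteration
 * handles 8 lanes (one 128-bit vector) instead of 16. Several of these
 * expansions (u16 -> u8/s8, the u16 in-place variant, and s16 -> u8/s8)
 * still pass SIMD_SIZE = 16, which only rounds the vectorized width w down
 * to a multiple of 16; stepping by 8 inside the loop remains correct, it
 * merely sends up to 8 extra elements to the scalar tail.
 */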
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]]    \n\t"
            "vmovl.u16 q3, d4             \n\t"
            "vmovl.u16 q4, d5             \n\t"
            "vcvt.f32.u32 q5, q3          \n\t"
            "vcvt.f32.u32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vcvt.s32.f32 q12, q10        \n\t"
            "vqmovn.s32 d26, q11          \n\t"
            "vqmovn.s32 d27, q12          \n\t"
            "vqmovn.s16 d28, q13          \n\t"
            "vst1.8 {d28}, [%[dst]]       \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(u16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC1(u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]    \n\t"
            "vmovl.u16 q3, d4             \n\t"
            "vmovl.u16 q4, d5             \n\t"
            "vcvt.f32.u32 q5, q3          \n\t"
            "vcvt.f32.u32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vcvt.s32.f32 q12, q10        \n\t"
            "vqmovun.s32 d26, q11         \n\t"
            "vqmovun.s32 d27, q12         \n\t"
            "vst1.16 {d26-d27}, [%[dst]]  \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25",
              "d26","d27"
        );
    }
})
#else
CVTS_FUNC1(u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]    \n\t"
            "vmovl.u16 q3, d4             \n\t"
            "vmovl.u16 q4, d5             \n\t"
            "vcvt.f32.u32 q5, q3          \n\t"
            "vcvt.f32.u32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vcvt.s32.f32 q12, q10        \n\t"
            "vqmovn.s32 d26, q11          \n\t"
            "vqmovn.s32 d27, q12          \n\t"
            "vst1.16 {d26-d27}, [%[dst]]  \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25",
              "d26","d27"
        );
    }
})
#else
CVTS_FUNC(u16, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]    \n\t"
            "vmovl.u16 q3, d4             \n\t"
            "vmovl.u16 q4, d5             \n\t"
            "vcvt.f32.u32 q5, q3          \n\t"
            "vcvt.f32.u32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vcvt.s32.f32 q12, q10        \n\t"
            "vst1.32 {d22-d23}, [%[dst1]] \n\t"
            "vst1.32 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vshift), "w" (vscale)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21","d22","d23",
              "d24","d25"
        );
    }
})
#else
CVTS_FUNC(u16, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]    \n\t"
            "vmovl.u16 q3, d4             \n\t"
            "vmovl.u16 q4, d5             \n\t"
            "vcvt.f32.u32 q5, q3          \n\t"
            "vcvt.f32.u32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vst1.32 {d18-d19}, [%[dst1]] \n\t"
            "vst1.32 {d20-d21}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13",
              "d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
#else
CVTS_FUNC(u16, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]] \n\t"
            "vmovl.s16 q3, d4          \n\t"
            "vmovl.s16 q4, d5          \n\t"
            "vcvt.f32.s32 q5, q3       \n\t"
            "vcvt.f32.s32 q6, q4       \n\t"
            "vmul.f32 q7, q5, q0       \n\t"
            "vmul.f32 q8, q6, q0       \n\t"
            "vadd.f32 q9, q7, q1       \n\t"
            "vadd.f32 q10, q8, q1      \n\t"
            "vcvt.s32.f32 q11, q9      \n\t"
            "vcvt.s32.f32 q12, q10     \n\t"
            "vqmovn.s32 d26, q11       \n\t"
            "vqmovn.s32 d27, q12       \n\t"
            "vqmovun.s16 d28, q13      \n\t"
            "vst1.8 {d28}, [%[dst]]    \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(s16, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]] \n\t"
            "vmovl.s16 q3, d4          \n\t"
            "vmovl.s16 q4, d5          \n\t"
            "vcvt.f32.s32 q5, q3       \n\t"
            "vcvt.f32.s32 q6, q4       \n\t"
            "vmul.f32 q7, q5, q0       \n\t"
            "vmul.f32 q8, q6, q0       \n\t"
            "vadd.f32 q9, q7, q1       \n\t"
            "vadd.f32 q10, q8, q1      \n\t"
            "vcvt.s32.f32 q11, q9      \n\t"
            "vcvt.s32.f32 q12, q10     \n\t"
            "vqmovn.s32 d26, q11       \n\t"
            "vqmovn.s32 d27, q12       \n\t"
            "vqmovn.s16 d28, q13       \n\t"
            "vst1.8 {d28}, [%[dst]]    \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(s16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]    \n\t"
            "vmovl.s16 q3, d4             \n\t"
            "vmovl.s16 q4, d5             \n\t"
            "vcvt.f32.s32 q5, q3          \n\t"
            "vcvt.f32.s32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vcvt.s32.f32 q12, q10        \n\t"
            "vqmovun.s32 d26, q11         \n\t"
            "vqmovun.s32 d27, q12         \n\t"
            "vst1.16 {d26-d27}, [%[dst]]  \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC(s16, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC1(s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]    \n\t"
            "vmovl.s16 q3, d4             \n\t"
            "vmovl.s16 q4, d5             \n\t"
            "vcvt.f32.s32 q5, q3          \n\t"
            "vcvt.f32.s32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vcvt.s32.f32 q12, q10        \n\t"
            "vqmovn.s32 d26, q11          \n\t"
            "vqmovn.s32 d27, q12          \n\t"
            "vst1.16 {d26-d27}, [%[dst]]  \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC1(s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]    \n\t"
            "vmovl.s16 q3, d4             \n\t"
            "vmovl.s16 q4, d5             \n\t"
            "vcvt.f32.s32 q5, q3          \n\t"
            "vcvt.f32.s32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vcvt.s32.f32 q12, q10        \n\t"
            "vst1.32 {d22-d23}, [%[dst1]] \n\t"
            "vst1.32 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(s16, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]    \n\t"
            "vmovl.s16 q3, d4             \n\t"
            "vmovl.s16 q4, d5             \n\t"
            "vcvt.f32.s32 q5, q3          \n\t"
            "vcvt.f32.s32 q6, q4          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vmul.f32 q8, q6, q0          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vadd.f32 q10, q8, q1         \n\t"
            "vst1.32 {d18-d19}, [%[dst1]] \n\t"
            "vst1.32 {d20-d21}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
#else
CVTS_FUNC(s16, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2        \n\t"
            "vcvt.f32.s32 q5, q3        \n\t"
            "vmul.f32 q6, q4, q0        \n\t"
            "vmul.f32 q7, q5, q0        \n\t"
            "vadd.f32 q8, q6, q1        \n\t"
            "vadd.f32 q9, q7, q1        \n\t"
            "vcvt.s32.f32 q10, q8       \n\t"
            "vcvt.s32.f32 q11, q9       \n\t"
            "vqmovun.s32 d24, q10       \n\t"
            "vqmovun.s32 d25, q11       \n\t"
            "vqmovn.u16 d26, q12        \n\t"
            "vst1.8 {d26}, [%[dst]]     \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
#else
CVTS_FUNC(s32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2        \n\t"
            "vcvt.f32.s32 q5, q3        \n\t"
            "vmul.f32 q6, q4, q0        \n\t"
            "vmul.f32 q7, q5, q0        \n\t"
            "vadd.f32 q8, q6, q1        \n\t"
            "vadd.f32 q9, q7, q1        \n\t"
            "vcvt.s32.f32 q10, q8       \n\t"
            "vcvt.s32.f32 q11, q9       \n\t"
            "vqmovn.s32 d24, q10        \n\t"
            "vqmovn.s32 d25, q11        \n\t"
            "vqmovn.s16 d26, q12        \n\t"
            "vst1.8 {d26}, [%[dst]]     \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
#else
CVTS_FUNC(s32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]  \n\t"
            "vld1.32 {d6-d7}, [%[src2]]  \n\t"
            "vcvt.f32.s32 q4, q2         \n\t"
            "vcvt.f32.s32 q5, q3         \n\t"
            "vmul.f32 q6, q4, q0         \n\t"
            "vmul.f32 q7, q5, q0         \n\t"
            "vadd.f32 q8, q6, q1         \n\t"
            "vadd.f32 q9, q7, q1         \n\t"
            "vcvt.s32.f32 q10, q8        \n\t"
            "vcvt.s32.f32 q11, q9        \n\t"
            "vqmovun.s32 d24, q10        \n\t"
            "vqmovun.s32 d25, q11        \n\t"
            "vst1.16 {d24-d25}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(s32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2        \n\t"
            "vcvt.f32.s32 q5, q3        \n\t"
            "vmul.f32 q6, q4, q0        \n\t"
            "vmul.f32 q7, q5, q0        \n\t"
            "vadd.f32 q8, q6, q1        \n\t"
            "vadd.f32 q9, q7, q1        \n\t"
            "vcvt.s32.f32 q10, q8       \n\t"
            "vcvt.s32.f32 q11, q9       \n\t"
            "vqmovn.s32 d24, q10        \n\t"
            "vqmovn.s32 d25, q11        \n\t"
            "vst1.8 {d24-d25}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(s32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC1(s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]   \n\t"
            "vld1.32 {d6-d7}, [%[src2]]   \n\t"
            "vcvt.f32.s32 q4, q2          \n\t"
            "vcvt.f32.s32 q5, q3          \n\t"
            "vmul.f32 q6, q4, q0          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vadd.f32 q8, q6, q1          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vcvt.s32.f32 q10, q8         \n\t"
            "vcvt.s32.f32 q11, q9         \n\t"
            "vst1.32 {d20-d21}, [%[dst1]] \n\t"
            "vst1.32 {d22-d23}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
        );
    }
})
#else
CVTS_FUNC1(s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif
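// CVTS_FUNC1 (used for s16 above and for s32/f32 here) appears to be the
// same-type variant of CVTS_FUNC: source and destination element types
// match, so only the scale-and-shift is applied.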
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]   \n\t"
            "vld1.32 {d6-d7}, [%[src2]]   \n\t"
            "vcvt.f32.s32 q4, q2          \n\t"
            "vcvt.f32.s32 q5, q3          \n\t"
            "vmul.f32 q6, q4, q0          \n\t"
            "vmul.f32 q7, q5, q0          \n\t"
            "vadd.f32 q8, q6, q1          \n\t"
            "vadd.f32 q9, q7, q1          \n\t"
            "vst1.32 {d16-d17}, [%[dst1]] \n\t"
            "vst1.32 {d18-d19}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
CVTS_FUNC(s32, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha));
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta));
    register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1 << 16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d6-d7}, [%[src1]] \n\t"
            "vld1.32 {d8-d9}, [%[src2]] \n\t"
            "vmul.f32 q5, q3, q0        \n\t"
            "vmul.f32 q6, q4, q0        \n\t"
            "vadd.f32 q7, q5, q1        \n\t"
            "vadd.f32 q8, q6, q1        \n\t"
            "vcvt.u32.f32 q9, q7        \n\t"
            "vcvt.u32.f32 q10, q8       \n\t"
            "vbic q11, q2, q6           \n\t"
            "vbic q12, q2, q7           \n\t"
            "vshr.u32 q13, q11, #16     \n\t"
            "vshr.u32 q14, q12, #16     \n\t"
            "vqsub.u32 q7, q9, q13      \n\t"
            "vqsub.u32 q8, q10, q14     \n\t"
            "vqrshrn.u32 d22, q7, #16   \n\t"
            "vqrshrn.u32 d23, q8, #16   \n\t"
            "vqmovn.u16 d30, q11        \n\t"
            "vst1.8 {d30}, [%[dst]]     \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift), "w" (vmask)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
        );
    }
})
#else
CVTS_FUNC(f32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha));
    float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta));
    uint32x4_t vmask = vdupq_n_u32(1 << 16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift);
        float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift);
        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32);
        uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32));
        uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32));
        vline1Mask = vshrq_n_u32(vline1Mask, 16);
        vline2Mask = vshrq_n_u32(vline2Mask, 16);
        vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask);
        vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask);
        uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16);
        uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
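// The f32 -> u8 path differs from the others: alpha and beta are
// pre-multiplied by 2^16 so intermediate values live in 16.16 fixed point,
// and vqrshrn_n_u32(..., 16) performs a rounding, saturating shift back
// down to u16 before the final narrow to u8. The vbic/vshr/vqsub sequence
// appears to compensate the truncate-toward-zero behaviour of the
// float -> u32 conversion before that rounding narrow (a reading of the
// code, not a documented contract).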
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0        \n\t"
            "vmul.f32 q5, q3, q0        \n\t"
            "vadd.f32 q6, q4, q1        \n\t"
            "vadd.f32 q7, q5, q1        \n\t"
            "vcvt.s32.f32 q8, q6        \n\t"
            "vcvt.s32.f32 q9, q7        \n\t"
            "vqmovn.s32 d14, q8         \n\t"
            "vqmovn.s32 d15, q9         \n\t"
            "vqmovn.s16 d16, q7         \n\t"
            "vst1.8 {d16}, [%[dst]]     \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
CVTS_FUNC(f32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0        \n\t"
            "vmul.f32 q5, q3, q0        \n\t"
            "vadd.f32 q6, q4, q1        \n\t"
            "vadd.f32 q7, q5, q1        \n\t"
            "vcvt.u32.f32 q8, q6        \n\t"
            "vcvt.u32.f32 q9, q7        \n\t"
            "vqmovn.u32 d8, q8          \n\t"
            "vqmovn.u32 d9, q9          \n\t"
            "vst1.16 {d8-d9}, [%[dst]]  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
CVTS_FUNC(f32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
        uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0        \n\t"
            "vmul.f32 q5, q3, q0        \n\t"
            "vadd.f32 q6, q4, q1        \n\t"
            "vadd.f32 q7, q5, q1        \n\t"
            "vcvt.s32.f32 q8, q6        \n\t"
            "vcvt.s32.f32 q9, q7        \n\t"
            "vqmovn.s32 d8, q8          \n\t"
            "vqmovn.s32 d9, q9          \n\t"
            "vst1.16 {d8-d9}, [%[dst]]  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
CVTS_FUNC(f32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]   \n\t"
            "vld1.32 {d6-d7}, [%[src2]]   \n\t"
            "vmul.f32 q4, q2, q0          \n\t"
            "vmul.f32 q5, q3, q0          \n\t"
            "vadd.f32 q6, q4, q1          \n\t"
            "vadd.f32 q7, q5, q1          \n\t"
            "vcvt.s32.f32 q4, q6          \n\t"
            "vcvt.s32.f32 q5, q7          \n\t"
            "vst1.32 {d8-d9}, [%[dst1]]   \n\t"
            "vst1.32 {d10-d11}, [%[dst2]] \n\t"
            : //no output
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
        );
    }
})
#else
CVTS_FUNC(f32, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC1(f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]   \n\t"
            "vld1.32 {d6-d7}, [%[src2]]   \n\t"
            "vmul.f32 q4, q2, q0          \n\t"
            "vmul.f32 q5, q3, q0          \n\t"
            "vadd.f32 q6, q4, q1          \n\t"
            "vadd.f32 q7, q5, q1          \n\t"
            "vst1.32 {d12-d13}, [%[dst1]] \n\t"
            "vst1.32 {d14-d15}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
CVTS_FUNC1(f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif
} // namespace CAROTENE_NS
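// Usage sketch for the kernels in this file (assumes the convertScale()
// declarations from carotene's public headers; the exact signature and
// stride units are assumptions, not taken from this commit):
//
//     using namespace CAROTENE_NS;
//     // dst[i] = saturate_cast<s32>(src[i] * alpha + beta), row by row
//     convertScale(Size2D(width, height),
//                  srcU16, srcStride,
//                  dstS32, dstStride,
//                  /*alpha*/ 2.0, /*beta*/ 10.0);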
opencv/3rdparty/carotene/src/convolution.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
namespace CAROTENE_NS
{

bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
                            BORDER_MODE border)
{
    return isSupportedConfiguration() && size.width >= 8 &&
        (border == BORDER_MODE_CONSTANT || border == BORDER_MODE_REPLICATE) &&
        (ksize.width == 3) && (ksize.height == 3);
}
#ifdef CAROTENE_NEON
namespace
{

template <int shift>
int32x4_t vshrq_s32(int32x4_t value)
{
    return vshrq_n_s32(value, shift);
}

template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
    return value;
}

} // namespace

typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
#endif
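// Note: vshrq_n_s32() requires a compile-time immediate shift, so the code
// below instantiates vshrq_s32<0..32> and selects one at runtime through a
// function-pointer table indexed by the user-supplied `scale`.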
void convolution(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE border, u8 borderValue,
                 const Size2D & ksize, s16 * kernelBase, u32 scale)
{
    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
#ifdef CAROTENE_NEON
    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);
    const int32x4_t v_zero_s32 = vdupq_n_s32(0);

    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    static const vshrq_s32_func vshrq_s32_a[33] =
    {
        vshrq_s32<0>,  vshrq_s32<1>,  vshrq_s32<2>,  vshrq_s32<3>,
        vshrq_s32<4>,  vshrq_s32<5>,  vshrq_s32<6>,  vshrq_s32<7>,
        vshrq_s32<8>,  vshrq_s32<9>,  vshrq_s32<10>, vshrq_s32<11>,
        vshrq_s32<12>, vshrq_s32<13>, vshrq_s32<14>, vshrq_s32<15>,
        vshrq_s32<16>, vshrq_s32<17>, vshrq_s32<18>, vshrq_s32<19>,
        vshrq_s32<20>, vshrq_s32<21>, vshrq_s32<22>, vshrq_s32<23>,
        vshrq_s32<24>, vshrq_s32<25>, vshrq_s32<26>, vshrq_s32<27>,
        vshrq_s32<28>, vshrq_s32<29>, vshrq_s32<30>, vshrq_s32<31>,
        vshrq_s32<32>
    };
    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];
    for (ptrdiff_t y = 0; y < height; ++y)
    {
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL :
                           internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL :
                           internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        u8 prevx[3] = { 0, 0, 0 }, currx[3] = { 0, 0, 0 }, nextx[3] = { 0, 0, 0 };
        ptrdiff_t x = 0;
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx[0] = prevx[1] = prevx[2] = borderValue;
                else
                {
                    prevx[0] = srow0 ? srow0[x4] : borderValue;
                    prevx[1] =         srow1[x4];
                    prevx[2] = srow2 ? srow2[x4] : borderValue;
                }

                currx[0] = srow0 ? srow0[x3] : borderValue;
                currx[1] =         srow1[x3];
                currx[2] = srow2 ? srow2[x3] : borderValue;
            }

            // make shift
            if (x)
            {
                tprev[0] = tcurr[0];
                tcurr[0] = tnext[0];

                tprev[1] = tcurr[1];
                tcurr[1] = tnext[1];

                tprev[2] = tcurr[2];
                tcurr[2] = tnext[2];
            }

            tnext[0] = x0;
            tnext[1] = x1;
            tnext[2] = x2;

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                {
                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
                }

                continue;
            }

            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;

            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[0], tcurr[0], 7);
                t1 = tcurr[0];
                t2 = vext_u8(tcurr[0], tnext[0], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
            }

            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[1], tcurr[1], 7);
                t1 = tcurr[1];
                t2 = vext_u8(tcurr[1], tnext[1], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
            }

            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[2], tcurr[2], 7);
                t1 = tcurr[2];
                t2 = vext_u8(tcurr[2], tnext[2], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
            }

            // make scale
            v_dst0 = vshrq_s32_p(v_dst0);
            v_dst1 = vshrq_s32_p(v_dst1);

            // and add them
            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
                                                          vqmovun_s32(v_dst1))));
        }

        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                {
                    nextx[0] = borderValue;
                    nextx[1] = borderValue;
                    nextx[2] = borderValue;
                }
                else if (border == BORDER_MODE_REPLICATE)
                {
                    nextx[0] = srow0[x];
                    nextx[1] = srow1[x];
                    nextx[2] = srow2[x];
                }
            }
            else
            {
                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
                nextx[1] =         srow1[x + 1];
                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
            }

            s32 val = 0;
            for (s32 _y = 0; _y < 3; ++_y)
                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];

            drow[x] = internal::saturate_cast<u8>(val >> scale);

            // make shift
            prevx[0] = currx[0];
            currx[0] = nextx[0];

            prevx[1] = currx[1];
            currx[1] = nextx[1];

            prevx[2] = currx[2];
            currx[2] = nextx[2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
    (void)ksize;
    (void)kernelBase;
    (void)scale;
#endif
}

} // namespace CAROTENE_NS
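// Usage sketch (assumptions: strides are byte strides, the 3x3 kernel is
// row-major, and `scale` is a right shift applied to each accumulated sum;
// the indexing above shows the kernel is traversed in reverse order, i.e.
// true convolution rather than correlation):
//
//     s16 kernel[9] = { 1, 2, 1,
//                       2, 4, 2,
//                       1, 2, 1 };  // sum = 16
//     if (isConvolutionSupported(size, Size2D(3, 3), BORDER_MODE_REPLICATE))
//         convolution(size, src, srcStride, dst, dstStride,
//                     BORDER_MODE_REPLICATE, 0, Size2D(3, 3), kernel,
//                     /*scale*/ 4);  // >> 4 == divide by 16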
opencv/3rdparty/carotene/src/count_nonzero.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <limits>
namespace CAROTENE_NS
{

s32 countNonZero(const Size2D &_size,
                 const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw16 = size.width & ~15u;

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

#define COUNTNONZERO8U_BLOCK_SIZE (16*255)
        uint8x16_t vc1 = vmovq_n_u8(1);
        for (; i < roiw16;)
        {
            size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
            uint8x16_t vs = vmovq_n_u8(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src + i);
                uint8x16_t vln = vld1q_u8(src + i);
                uint8x16_t vnz = vminq_u8(vln, vc1);
                vs = vaddq_u8(vs, vnz);
            }

            uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
            s32 s[2];
            vst1_u32((u32*)s, vs2);

            if (s[0] < 0 || s[1] < 0) //saturate in case of overflow ~ 2GB of non-zeros...
            {
                return 0x7fFFffFF;
            }
            result += (s[0] += s[1]);
            if (s[0] < 0 || result < 0)
            {
                return 0x7fFFffFF;
            }
        }
        for (; i < size.width; i++)
            result += (src[i] != 0) ? 1 : 0;
        if (result < 0) //saturate in case of overflow ~ 2GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
                 const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

#define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
        uint16x8_t vc1 = vmovq_n_u16(1);
        for (; i < roiw8;)
        {
            size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
            uint16x8_t vs = vmovq_n_u16(0);

            for (; i <= lim; i += 8)
            {
                internal::prefetch(src + i);
                uint16x8_t vln = vld1q_u16(src + i);
                uint16x8_t vnz = vminq_u16(vln, vc1);
                vs = vaddq_u16(vs, vnz);
            }

            uint32x4_t vs4 = vpaddlq_u16(vs);
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
            s32 s[2];
            vst1_u32((u32*)s, vs2);

            if (s[0] < 0 || s[1] < 0) //saturate in case of overflow ~ 4GB of non-zeros...
            {
                return 0x7fFFffFF;
            }
            result += (s[0] += s[1]);
            if (s[0] < 0 || result < 0)
            {
                return 0x7fFFffFF;
            }
        }
        for (; i < size.width; i++)
            result += (src[i] != 0) ? 1 : 0;
        if (result < 0) //saturate in case of overflow ~ 4GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const u32 * src = (const u32 *)internal::getRowPtr(srcBase, srcStride, k);
        u32 i = 0;

        uint32x4_t vc1 = vmovq_n_u32(1);
        uint32x4_t vs = vmovq_n_u32(0);
        for (; i < roiw4; i += 4)
        {
            internal::prefetch(src + i);
            uint32x4_t vln = vld1q_u32(src + i);
            uint32x4_t vnz = vminq_u32(vln, vc1);
            vs = vqaddq_u32(vs, vnz);
        }

        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
        s32 s[2];
        vst1_u32((u32*)s, vs2);

        if (s[0] < 0 || s[1] < 0) //saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)
        {
            return 0x7fFFffFF;
        }

        for (; i < size.width; i++)
            result += (src[i] != 0) ? 1 : 0;
        if (result < 0) //saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
                 const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const f32 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        float32x4_t vc0 = vmovq_n_f32(0);
        int32x4_t vs = vmovq_n_s32(0);
        for (; i < roiw4; i += 4)
        {
            internal::prefetch(src + i);
            float32x4_t vln = vld1q_f32(src + i);
            int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
            vs = vqaddq_s32(vs, vnz);
        }

        int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
        int s[2];
        vst1_s32(s, vs2);

        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0) //case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }

        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min()) ? 0 : 1;
        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
                 const f64 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;
    size_t roiw4 = size.width & ~3u;
    size_t roiw2 = size.width & ~1u;
    uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
    uint32x4_t vc0 = vmovq_n_u32(0);

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const f64 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        int32x2_t vs1 = vmov_n_s32(0);
        int32x2_t vs2 = vmov_n_s32(0);
        int32x2_t vs3 = vmov_n_s32(0);
        int32x2_t vs4 = vmov_n_s32(0);

        for (; i < roiw8; i += 8)
        {
            internal::prefetch(src + i + 6);
            uint64x2_t vln1 = vld1q_u64((const u64 *)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64 *)(src + i + 2));
            uint64x2_t vln3 = vld1q_u64((const u64 *)(src + i + 4));
            uint64x2_t vln4 = vld1q_u64((const u64 *)(src + i + 6));

            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint64x2_t vm3 = vandq_u64(vln3, vmask1);
            uint64x2_t vm4 = vandq_u64(vln4, vmask1);

            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
            uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);

            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            uint32x4_t vlx3 = vmvnq_u32(vequ3);
            uint32x4_t vlx4 = vmvnq_u32(vequ4);

            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
            int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));

            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            vs3 = vqadd_s32(vs3, vnz3);
            vs4 = vqadd_s32(vs4, vnz4);
        }

        if (i < roiw4)
        {
            internal::prefetch(src + i + 2);
            uint64x2_t vln1 = vld1q_u64((const u64 *)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64 *)(src + i + 2));

            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);

            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);

            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);

            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));

            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            i += 4;
        }

        if (i < roiw2)
        {
            internal::prefetch(src + i);
            uint64x2_t vln1 = vld1q_u64((const u64 *)(src + i));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            vs1 = vqadd_s32(vs1, vnz1);
            i += 2;
        }

        vs1 = vqadd_s32(vs1, vs2);
        vs3 = vqadd_s32(vs3, vs4);
        vs1 = vqadd_s32(vs1, vs3);
        int32x2_t vsneg = vqneg_s32(vs1);
        s32 s[2];
        vst1_s32(s, vsneg);

        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0) //case of overflow ~ 16GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min()) ? 0 : 1;
        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

} // namespace CAROTENE_NS
opencv/3rdparty/carotene/src/div.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>
namespace CAROTENE_NS {

namespace {

#ifdef CAROTENE_NEON

inline float32x4_t vroundq(const float32x4_t & v)
{
    const int32x4_t signMask = vdupq_n_s32(1 << 31),
                    half     = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
    return vaddq_f32(v, v_addition);
}

template <typename T>
inline T divSaturateQ(const T & v1, const T & v2, const float scale)
{
    return internal::vcombine(
        internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                      internal::vmovl(internal::vget_low(v2)), scale)),
        internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                      internal::vmovl(internal::vget_high(v2)), scale)));
}

template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t & v1, const int32x4_t & v2, const float scale)
{
    return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale),
                                           internal::vrecpq_f32(vcvtq_f32_s32(v2)))));
}

template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t & v1, const uint32x4_t & v2, const float scale)
{
    return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale),
                                           internal::vrecpq_f32(vcvtq_f32_u32(v2)))));
}

inline float32x2_t vround(const float32x2_t & v)
{
    const int32x2_t signMask = vdup_n_s32(1 << 31),
                    half     = vreinterpret_s32_f32(vdup_n_f32(0.5f));
    float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
    return vadd_f32(v, v_addition);
}

template <typename T>
inline T divSaturate(const T & v1, const T & v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}

template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t & v1, const int32x2_t & v2, const float scale)
{
    return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale),
                                        internal::vrecp_f32(vcvt_f32_s32(v2)))));
}

template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t & v1, const uint32x2_t & v2, const float scale)
{
    return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale),
                                        internal::vrecp_f32(vcvt_f32_u32(v2)))));
}

template <typename T>
inline T divWrapQ(const T & v1, const T & v2, const float scale)
{
    return internal::vcombine(
        internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                 internal::vmovl(internal::vget_low(v2)), scale)),
        internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                 internal::vmovl(internal::vget_high(v2)), scale)));
}

template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t & v1, const int32x4_t & v2, const float scale)
{
    return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale),
                                   internal::vrecpq_f32(vcvtq_f32_s32(v2))));
}

template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t & v1, const uint32x4_t & v2, const float scale)
{
    return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale),
                                   internal::vrecpq_f32(vcvtq_f32_u32(v2))));
}

template <typename T>
inline T divWrap(const T & v1, const T & v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}

template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t & v1, const int32x2_t & v2, const float scale)
{
    return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale),
                                 internal::vrecp_f32(vcvt_f32_s32(v2))));
}

template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t & v1, const uint32x2_t & v2, const float scale)
{
    return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale),
                                 internal::vrecp_f32(vcvt_f32_u32(v2))));
}

inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1)
{
    return vtstq_u8(v0, v1);
}

inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1)
{
    return vtstq_u16(v0, v1);
}

inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1)
{
    return vtstq_u32(v0, v1);
}

inline int8x16_t vtstq(const int8x16_t & v0, const int8x16_t & v1)
{
    return vreinterpretq_s8_u8(vtstq_s8(v0, v1));
}

inline int16x8_t vtstq(const int16x8_t & v0, const int16x8_t & v1)
{
    return vreinterpretq_s16_u16(vtstq_s16(v0, v1));
}

inline int32x4_t vtstq(const int32x4_t & v0, const int32x4_t & v1)
{
    return vreinterpretq_s32_u32(vtstq_s32(v0, v1));
}

inline uint8x8_t vtst(const uint8x8_t & v0, const uint8x8_t & v1)
{
    return vtst_u8(v0, v1);
}

inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1)
{
    return vtst_u16(v0, v1);
}

inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1)
{
    return vtst_u32(v0, v1);
}

inline int8x8_t vtst(const int8x8_t & v0, const int8x8_t & v1)
{
    return vreinterpret_s8_u8(vtst_s8(v0, v1));
}

inline int16x4_t vtst(const int16x4_t & v0, const int16x4_t & v1)
{
    return vreinterpret_s16_u16(vtst_s16(v0, v1));
}

inline int32x2_t vtst(const int32x2_t & v0, const int32x2_t & v1)
{
    return vreinterpret_s32_u32(vtst_s32(v0, v1));
}
#endif

template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
    static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif

    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         (scale * std::numeric_limits<T>::max()) <  1.0f &&
         (scale * std::numeric_limits<T>::max()) > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1, v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1, v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, divSaturate(v_src0, v_src1, scale)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1, v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1, v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, divWrap(v_src0, v_src1, scale)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
#ifdef CAROTENE_NEON

template <typename T>
inline T recipSaturateQ(const T & v2, const float scale)
{
    return internal::vcombine(
        internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
        internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale)));
}

template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t & v2, const float scale)
{
    return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale));
}

template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t & v2, const float scale)
{
    return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale));
}

template <typename T>
inline T recipSaturate(const T & v2, const float scale)
{
    return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}

template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t & v2, const float scale)
{
    return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale));
}

template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t & v2, const float scale)
{
    return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale));
}

template <typename T>
inline T recipWrapQ(const T & v2, const float scale)
{
    return internal::vcombine(
        internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
        internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale)));
}

template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t & v2, const float scale)
{
    return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale));
}

template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t & v2, const float scale)
{
    return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale));
}

template <typename T>
inline T recipWrap(const T & v2, const float scale)
{
    return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}

template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t & v2, const float scale)
{
    return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale));
}

template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t & v2, const float scale)
{
    return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale));
}
#endif

template <typename T>
void recip(const Size2D &size,
           const T * src1Base, ptrdiff_t src1Stride,
           T * dstBase, ptrdiff_t dstStride,
           f32 scale,
           CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
    static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif

    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         scale <  1.0f &&
         scale > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1, v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1, v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1, v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1, v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}

} // namespace

void div(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
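
// Usage sketch (illustrative, not part of the upstream file): element-wise
// division of two u8 rows with the saturating policy. The buffer names and
// sizes here are hypothetical.
//
//   const u8 num[4] = { 10, 100, 200, 7 };
//   const u8 den[4] = {  2,   0,   3, 7 };
//   u8 quot[4];
//   div(Size2D(4, 1), num, 4, den, 4, quot, 4,
//       /* scale */ 1.0f, CONVERT_POLICY_SATURATE);
//   // quot == { 5, 0, 67, 1 }: zero divisors yield 0 (the vtst mask above),
//   // other results are rounded and saturated to the u8 range.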

void div(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride,
         f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                vst1q_f32(dst + j, vmulq_f32(v_src0, internal::vrecpq_f32(v_src1)));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                vst1_f32(dst + j, vmul_f32(v_src0, internal::vrecp_f32(v_src1)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src0[j] / src1[j];
            }
        }
    }
    else
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                vst1q_f32(dst + j, vmulq_f32(vmulq_n_f32(v_src0, scale), internal::vrecpq_f32(v_src1)));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                vst1_f32(dst + j, vmul_f32(vmul_n_f32(v_src0, scale), internal::vrecp_f32(v_src1)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src0[j] * scale / src1[j];
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
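
// Note (assumption, for illustration only): internal::vrecpq_f32 /
// internal::vrecp_f32 are not defined in this file; presumably (in
// common.hpp) they wrap the standard NEON reciprocal estimate plus
// Newton-Raphson refinement, along the lines of:
//
//   inline float32x4_t vrecpq_f32(const float32x4_t & v)
//   {
//       float32x4_t r = vrecpeq_f32(v);       // ~8-bit estimate of 1/v
//       r = vmulq_f32(vrecpsq_f32(v, r), r);  // refinement step 1
//       r = vmulq_f32(vrecpsq_f32(v, r), r);  // refinement step 2
//       return r;
//   }
//
// Each vrecpsq_f32 step computes 2 - v*r, roughly doubling the number of
// correct bits, so two steps bring the estimate close to full f32 precision.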

void reciprocal(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s8 * srcBase, ptrdiff_t srcStride,
                s8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
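
// Usage sketch (illustrative, not part of the upstream file): scaled
// reciprocal of an s16 row with the wrapping policy. Hypothetical buffers.
//
//   const s16 in[3] = { 2, -4, 0 };
//   s16 out[3];
//   reciprocal(Size2D(3, 1), in, 3 * sizeof(s16), out, 3 * sizeof(s16),
//              /* scale */ 100.0f, CONVERT_POLICY_WRAP);
//   // out == { 50, -25, 0 }: each element becomes trunc(scale / in[j]),
//   // and zero inputs map to 0 instead of faulting.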

void reciprocal(const Size2D &size,
                const f32 * srcBase, ptrdiff_t srcStride,
                f32 * dstBase, ptrdiff_t dstStride,
                f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                vst1q_f32(dst + j, internal::vrecpq_f32(v_src1));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, internal::vrecp_f32(v_src1));
            }
            for (; j < size.width; j++)
            {
                dst[j] = 1.0f / src1[j];
            }
        }
    }
    else
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);
                vst1q_f32(dst + j, vmulq_n_f32(internal::vrecpq_f32(v_src1), scale));
            }
            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);
                vst1_f32(dst + j, vmul_n_f32(internal::vrecp_f32(v_src1), scale));
            }
            for (; j < size.width; j++)
            {
                dst[j] = scale / src1[j];
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}

} // namespace CAROTENE_NS
opencv/3rdparty/carotene/src/dot_product.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {

f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        uint64x2_t ws = vmovq_n_u64(0);

        while (i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);

                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));

                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }

            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }

        if (i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);

            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }

        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);

        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
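
// Note on DOT_UINT_BLOCKSIZE: the largest u8 product is 255 * 255 = 65025 and
// floor((2^32 - 1) / 65025) = 66051, so a u32 lane can absorb 66051 products
// before it may overflow; draining into the u64 accumulator ws at block
// boundaries keeps the vpadalq_u16 sums safely below that.
//
// Usage sketch (illustrative, not part of the upstream file):
//
//   const u8 a[5] = { 1, 2, 3, 4, 5 };
//   const u8 b[5] = { 5, 4, 3, 2, 1 };
//   f64 dp = dotProduct(Size2D(5, 1), a, 5, b, 5);
//   // dp == 5 + 8 + 9 + 8 + 5 == 35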

f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        int64x2_t ws = vmovq_n_s64(0);

        while (i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        if (i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}

f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        while (i + 4 <= size.width)
        {
            size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);

            for ( ; i <= lim; i += 4)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
            }

            float32x2_t vres = vpadd_f32(vget_low_f32(v_sum), vget_high_f32(v_sum));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
        }

        if (i + 2 <= size.width)
        {
            float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
            i += 2;
        }

        for (; i < size.width; ++i)
            result += src0[i] * src1[i];
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}

} // namespace CAROTENE_NS
opencv/3rdparty/carotene/src/dummy.cpp
0 → 100644
// This file is needed for compilation on some platforms e.g. with XCode generator
// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457
opencv/3rdparty/carotene/src/fast.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
Below is the original copyright and the references */
/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
*Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
*Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
*Neither the name of the University of Cambridge nor the names of
its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
The references are:
* Machine learning for high-speed corner detection,
E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace {

void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
{
    pixel[0]  =  0 + row_stride *  3;
    pixel[1]  =  1 + row_stride *  3;
    pixel[2]  =  2 + row_stride *  2;
    pixel[3]  =  3 + row_stride *  1;
    pixel[4]  =  3 + row_stride *  0;
    pixel[5]  =  3 + row_stride * -1;
    pixel[6]  =  2 + row_stride * -2;
    pixel[7]  =  1 + row_stride * -3;
    pixel[8]  =  0 + row_stride * -3;
    pixel[9]  = -1 + row_stride * -3;
    pixel[10] = -2 + row_stride * -2;
    pixel[11] = -3 + row_stride * -1;
    pixel[12] = -3 + row_stride *  0;
    pixel[13] = -3 + row_stride *  1;
    pixel[14] = -2 + row_stride *  2;
    pixel[15] = -1 + row_stride *  3;
}

u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
{
    const s32 K = 8, N = 16 + K + 1;
    s32 k, v = ptr[0];
    s16 d[(N + 7) & ~7];
    for (k = 0; k < N; k++)
        d[k] = (s16)(v - ptr[pixel[k]]);

    int16x8_t q0 = vdupq_n_s16((s16)(-1000));
    int16x8_t q1 = vdupq_n_s16((s16)(1000));

    int16x8_t d0_7   = vld1q_s16(d + 0);
    int16x8_t d8_15  = vld1q_s16(d + 8);
    int16x8_t d16_23 = vld1q_s16(d + 16);
    int16x8_t d24    = vld1q_s16(d + 24);

    //k == 0
    int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
    int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
    int16x8_t ak0 = vminq_s16(v0k0, v1k0);
    int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);

    v0k0 = vextq_s16(d0_7, d8_15, 3);
    ak0 = vminq_s16(ak0, v0k0);
    bk0 = vmaxq_s16(bk0, v0k0);

    v1k0 = vextq_s16(d0_7, d8_15, 4);
    ak0 = vminq_s16(ak0, v1k0);
    bk0 = vmaxq_s16(bk0, v1k0);

    v0k0 = vextq_s16(d0_7, d8_15, 5);
    ak0 = vminq_s16(ak0, v0k0);
    bk0 = vmaxq_s16(bk0, v0k0);

    v1k0 = vextq_s16(d0_7, d8_15, 6);
    ak0 = vminq_s16(ak0, v1k0);
    bk0 = vmaxq_s16(bk0, v1k0);

    v0k0 = vextq_s16(d0_7, d8_15, 7);
    ak0 = vminq_s16(ak0, v0k0);
    bk0 = vmaxq_s16(bk0, v0k0);

    ak0 = vminq_s16(ak0, d8_15);
    bk0 = vmaxq_s16(bk0, d8_15);

    q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
    q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));

    v1k0 = vextq_s16(d8_15, d16_23, 1);
    q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
    q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));

    //k == 8
    int16x8_t v0k8 = v1k0;
    int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
    int16x8_t ak8 = vminq_s16(v0k8, v1k8);
    int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);

    v0k8 = vextq_s16(d8_15, d16_23, 3);
    ak8 = vminq_s16(ak8, v0k8);
    bk8 = vmaxq_s16(bk8, v0k8);

    v1k8 = vextq_s16(d8_15, d16_23, 4);
    ak8 = vminq_s16(ak8, v1k8);
    bk8 = vmaxq_s16(bk8, v1k8);

    v0k8 = vextq_s16(d8_15, d16_23, 5);
    ak8 = vminq_s16(ak8, v0k8);
    bk8 = vmaxq_s16(bk8, v0k8);

    v1k8 = vextq_s16(d8_15, d16_23, 6);
    ak8 = vminq_s16(ak8, v1k8);
    bk8 = vmaxq_s16(bk8, v1k8);

    v0k8 = vextq_s16(d8_15, d16_23, 7);
    ak8 = vminq_s16(ak8, v0k8);
    bk8 = vmaxq_s16(bk8, v0k8);

    ak8 = vminq_s16(ak8, d16_23);
    bk8 = vmaxq_s16(bk8, d16_23);

    q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
    q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));

    v1k8 = vextq_s16(d16_23, d24, 1);
    q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
    q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));

    //fin
    int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
    int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
    int32x4_t q2w = vmovl_s16(q2);
    int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
    int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));

    return (u8)(vget_lane_s32(q8, 0) - 1);
}

} //namespace

#endif

void FAST(const Size2D &size,
          u8 *srcBase, ptrdiff_t srcStride,
          KeypointStore *keypoints,
          u8 threshold, bool nonmax_suppression)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    //keypoints.clear();

    const s32 K = 8, N = 16 + K + 1;
    ptrdiff_t i, j, k, pixel[N];
    makeOffsets(pixel, srcStride);
    for (k = 16; k < N; k++)
        pixel[k] = pixel[k - 16];

    uint8x16_t delta = vdupq_n_u8(128);
    uint8x16_t t = vdupq_n_u8(threshold);
    uint8x16_t K16 = vdupq_n_u8((u8)K);

    u8 threshold_tab[512];
    for (i = -255; i <= 255; i++)
        threshold_tab[i + 255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);

    std::vector<u8> _buf((size.width + 16) * 3 * (sizeof(ptrdiff_t) + sizeof(u8)) + 128);
    u8 * buf[3];
    buf[0] = &_buf[0];
    buf[1] = buf[0] + size.width;
    buf[2] = buf[1] + size.width;
    ptrdiff_t * cpbuf[3];
    cpbuf[0] = (ptrdiff_t *)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
    cpbuf[1] = cpbuf[0] + size.width + 1;
    cpbuf[2] = cpbuf[1] + size.width + 1;
    memset(buf[0], 0, size.width * 3);

    for (i = 3; i < (ptrdiff_t)size.height - 2; i++)
    {
        const u8 * ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
        u8 * curr = buf[(i - 3) % 3];
        ptrdiff_t * cornerpos = cpbuf[(i - 3) % 3];
        memset(curr, 0, size.width);
        ptrdiff_t ncorners = 0;

        if (i < (ptrdiff_t)size.height - 3)
        {
            j = 3;
            for (; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
            {
                internal::prefetch(ptr);
                internal::prefetch(ptr + pixel[0]);
                internal::prefetch(ptr + pixel[2]);

                uint8x16_t v0 = vld1q_u8(ptr);
                int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
                int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));

                int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
                int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
                int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
                int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));

                uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
                uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
                m0 = vorrq_u8(m0, m1);

                u64 mask[2];
                vst1q_u64(mask, vreinterpretq_u64_u8(m0));

                if (mask[0] == 0)
                {
                    if (mask[1] != 0)
                    {
                        j -= 8;
                        ptr -= 8;
                    }
                    continue;
                }

                uint8x16_t c0 = vmovq_n_u8(0);
                uint8x16_t c1 = vmovq_n_u8(0);
                uint8x16_t max0 = vmovq_n_u8(0);
                uint8x16_t max1 = vmovq_n_u8(0);
                for (k = 0; k < N; k++)
                {
                    int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
                    m0 = vcgtq_s8(x, v2);
                    m1 = vcgtq_s8(v1, x);

                    c0 = vandq_u8(vsubq_u8(c0, m0), m0);
                    c1 = vandq_u8(vsubq_u8(c1, m1), m1);

                    max0 = vmaxq_u8(max0, c0);
                    max1 = vmaxq_u8(max1, c1);
                }

                max0 = vmaxq_u8(max0, max1);
                u8 m[16];
                vst1q_u8(m, vcgtq_u8(max0, K16));

                for (k = 0; k < 16; ++k)
                    if (m[k])
                    {
                        cornerpos[ncorners++] = j + k;
                        if (nonmax_suppression)
                            curr[j + k] = cornerScore(ptr + k, pixel);
                    }
            }

            for ( ; j < (s32)size.width - 3; j++, ptr++)
            {
                s32 v = ptr[0];
                const u8 * tab = &threshold_tab[0] - v + 255;
                s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];

                if (d == 0)
                    continue;

                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];

                if (d == 0)
                    continue;

                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];

                if (d & 1)
                {
                    s32 vt = v - threshold, count = 0;

                    for (k = 0; k < N; k++)
                    {
                        s32 x = ptr[pixel[k]];
                        if (x < vt)
                        {
                            if (++count > K)
                            {
                                cornerpos[ncorners++] = j;
                                if (nonmax_suppression)
                                    curr[j] = cornerScore(ptr, pixel);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }

                if (d & 2)
                {
                    s32 vt = v + threshold, count = 0;

                    for (k = 0; k < N; k++)
                    {
                        s32 x = ptr[pixel[k]];
                        if (x > vt)
                        {
                            if (++count > K)
                            {
                                cornerpos[ncorners++] = j;
                                if (nonmax_suppression)
                                    curr[j] = cornerScore(ptr, pixel);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }
            }
        }

        cornerpos[-1] = ncorners;

        if (i == 3)
            continue;

        const u8 * prev = buf[(i - 4 + 3) % 3];
        const u8 * pprev = buf[(i - 5 + 3) % 3];
        cornerpos = cpbuf[(i - 4 + 3) % 3];
        ncorners = cornerpos[-1];

        for (k = 0; k < ncorners; k++)
        {
            j = cornerpos[k];
            s32 score = prev[j];
            if (!nonmax_suppression ||
                (score > prev[j + 1] && score > prev[j - 1] &&
                 score > pprev[j - 1] && score > pprev[j] && score > pprev[j + 1] &&
                 score > curr[j - 1] && score > curr[j] && score > curr[j + 1]))
            {
                keypoints->push((f32)j, (f32)(i - 1), 7.f, -1, (f32)score);
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)keypoints;
    (void)threshold;
    (void)nonmax_suppression;
#endif
}
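
// Usage sketch (illustrative, not part of the upstream file): detecting FAST
// corners on an 8-bit grayscale image. KeypointStore is the abstract sink
// from the carotene headers; MyStore is a hypothetical implementation that
// collects the pushed (x, y, size, octave, score) tuples.
//
//   // MyStore store;
//   // FAST(Size2D(width, height), image, stride, &store,
//   //      /* threshold */ 20, /* nonmax_suppression */ true);
//
// With nonmax_suppression enabled, a corner is reported only if its
// cornerScore() beats all eight neighbours, which is why rows are emitted
// with one row of latency (the (i - 1) in keypoints->push above).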

} // namespace CAROTENE_NS
opencv/3rdparty/carotene/src/fill_minmaxloc.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace {

template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
             T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
             T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    for (size_t j = j0; j < j1; ++j)
    {
        T val = src[j];

        if (val == maxVal)
        {
            if (maxLocCount < maxLocCapacity)
            {
                maxLocPtr[maxLocCount] = j;
                maxLocPtr[maxLocCount + 1] = i;
            }
            maxLocCount += 2;
        }

        if (val == minVal)
        {
            if (minLocCount < minLocCapacity)
            {
                minLocPtr[minLocCount] = j;
                minLocPtr[minLocCount + 1] = i;
            }
            minLocCount += 2;
        }
    }
}

} // namespace

#endif

void fillMinMaxLocs(const Size2D & size,
                    const u8 * srcBase, ptrdiff_t srcStride,
                    u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    uint8x16_t v_maxval16 = vdupq_n_u8(maxVal),
               v_minval16 = vdupq_n_u8(minVal);
    uint8x8_t v_maxval8 = vdup_n_u8(maxVal),
              v_minval8 = vdup_n_u8(minVal);
    u64 mask[2] = { 0ul };

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0;

        for ( ; j < roiw16; j += 16)
        {
            internal::prefetch(src + j);
            uint8x16_t v_src = vld1q_u8(src + j);

            uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
            uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
            uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);

            vst1q_u8((u8 *)&mask[0], v_mask);

            if (mask[0])
                process(src, j, j + 8, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            if (mask[1])
                process(src, j + 8, j + 16, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        for ( ; j < roiw8; j += 8)
        {
            uint8x8_t v_src = vld1_u8(src + j);

            uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
            uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
            uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);

            vst1_u8((u8 *)&mask[0], v_mask);

            if (mask[0])
                process(src, j, j + 8, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        process(src, j, size.width, i,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
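
// Usage sketch (illustrative, not part of the upstream file): collecting the
// coordinates of every pixel equal to a known minimum/maximum. Capacities are
// given in locations; each match stores an (x, y) pair, hence the internal
// <<= 1 / >>= 1 bookkeeping around the pair-sized counters.
//
//   const u8 row[4] = { 9, 1, 9, 5 };
//   size_t minLocs[2 * 4], maxLocs[2 * 4];
//   s32 nMin = 0, nMax = 0;
//   fillMinMaxLocs(Size2D(4, 1), row, 4,
//                  (u8)1, minLocs, nMin, 4,
//                  (u8)9, maxLocs, nMax, 4);
//   // nMin == 1, minLocs == { 1, 0 };
//   // nMax == 2, maxLocs == { 0, 0, 2, 0 }  (x then y per match)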

void fillMinMaxLocs(const Size2D & size,
                    const u16 * srcBase, ptrdiff_t srcStride,
                    u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
               v_minval8 = vdupq_n_u16(minVal);
    u64 mask[2] = { 0ul };

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0;

        for ( ; j < roiw16; j += 16)
        {
            internal::prefetch(src + j);
            uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

            uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
            uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));

            vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));

            if (mask[0])
                process(src, j, j + 8, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            if (mask[1])
                process(src, j + 8, j + 16, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        for ( ; j < roiw8; j += 8)
        {
            internal::prefetch(src + j);
            uint16x8_t v_src = vld1q_u16(src + j);

            uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
            uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
            uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);

            vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));

            if (mask[0])
                process(src, j, j + 8, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        process(src, j, size.width, i,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}

void fillMinMaxLocs(const Size2D & size,
                    const s16 * srcBase, ptrdiff_t srcStride,
                    s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
              v_minval8 = vdupq_n_s16(minVal);
    u64 mask[2] = { 0ul };

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t i = 0; i < size.height; ++i)
    {
        const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0;

        for ( ; j < roiw16; j += 16)
        {
            internal::prefetch(src + j);
            int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);

            uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
            uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));

            vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));

            if (mask[0])
                process(src, j, j + 8, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            if (mask[1])
                process(src, j + 8, j + 16, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        for ( ; j < roiw8; j += 8)
        {
            internal::prefetch(src + j);
            int16x8_t v_src = vld1q_s16(src + j);

            uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
            uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
            uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);

            vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));

            if (mask[0])
                process(src, j, j + 8, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        process(src, j, size.width, i,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}

void fillMinMaxLocs(const Size2D & size,
                    const s32 * srcBase, ptrdiff_t srcStride,
                    s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    int32x4_t v_maxval4 = vdupq_n_s32(maxVal),
              v_minval4 = vdupq_n_s32(minVal);
    u64 mask = 0ul;

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t i = 0; i < size.height; ++i)
    {
        const s32 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0;

        for ( ; j < roiw8; j += 8)
        {
            internal::prefetch(src + j);
            int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4);

            uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4));
            uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4));

            vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));

            if (mask)
                process(src, j, j + 8, i,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
        }

        process(src, j, size.width, i,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
void
fillMinMaxLocs
(
const
Size2D
&
size
,
const
u32
*
srcBase
,
ptrdiff_t
srcStride
,
u32
minVal
,
size_t
*
minLocPtr
,
s32
&
minLocCount
,
s32
minLocCapacity
,
u32
maxVal
,
size_t
*
maxLocPtr
,
s32
&
maxLocCount
,
s32
maxLocCapacity
)
{
internal
::
assertSupportedConfiguration
();
#ifdef CAROTENE_NEON
size_t
roiw8
=
size
.
width
>=
7
?
size
.
width
-
7
:
0
;
uint32x4_t
v_maxval4
=
vdupq_n_u32
(
maxVal
),
v_minval4
=
vdupq_n_u32
(
minVal
);
u64
mask
=
0ul
;
minLocCapacity
<<=
1
;
maxLocCapacity
<<=
1
;
for
(
size_t
i
=
0
;
i
<
size
.
height
;
++
i
)
{
const
u32
*
src
=
internal
::
getRowPtr
(
srcBase
,
srcStride
,
i
);
size_t
j
=
0
;
for
(
;
j
<
roiw8
;
j
+=
8
)
{
internal
::
prefetch
(
src
+
j
);
uint32x4_t
v_src0
=
vld1q_u32
(
src
+
j
),
v_src1
=
vld1q_u32
(
src
+
j
+
4
);
uint32x4_t
v_mask0
=
vorrq_u32
(
vceqq_u32
(
v_src0
,
v_maxval4
),
vceqq_u32
(
v_src0
,
v_minval4
));
uint32x4_t
v_mask1
=
vorrq_u32
(
vceqq_u32
(
v_src1
,
v_maxval4
),
vceqq_u32
(
v_src1
,
v_minval4
));
vst1_u8
((
u8
*
)
&
mask
,
vmovn_u16
(
vcombine_u16
(
vmovn_u32
(
v_mask0
),
vmovn_u32
(
v_mask1
))));
if
(
mask
)
process
(
src
,
j
,
j
+
8
,
i
,
minVal
,
minLocPtr
,
minLocCount
,
minLocCapacity
,
maxVal
,
maxLocPtr
,
maxLocCount
,
maxLocCapacity
);
}
process
(
src
,
j
,
size
.
width
,
i
,
minVal
,
minLocPtr
,
minLocCount
,
minLocCapacity
,
maxVal
,
maxLocPtr
,
maxLocCount
,
maxLocCapacity
);
}
minLocCount
>>=
1
;
maxLocCount
>>=
1
;
#else
(
void
)
size
;
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
minVal
;
(
void
)
minLocPtr
;
(
void
)
minLocCount
;
(
void
)
minLocCapacity
;
(
void
)
maxVal
;
(
void
)
maxLocPtr
;
(
void
)
maxLocCount
;
(
void
)
maxLocCapacity
;
#endif
}
}
// namespace CAROTENE_NS
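A minimal calling sketch for the fillMinMaxLocs overloads above, not part of the commit. The header name and the Size2D constructor are assumptions about carotene's public interface, and the internal doubling of capacity and count suggests each recorded location occupies two size_t slots (column, row); treat that layout as an assumption. The extrema values are expected to be known beforehand (e.g. from a prior min/max scan).

#include <vector>
#include "carotene/functions.hpp" // assumed public header name

void fillMinMaxLocsExample()
{
    using namespace CAROTENE_NS;
    Size2D size(320, 240);                        // assumed (width, height) ctor
    std::vector<s32> img(size.width * size.height, 0);
    img[0] = -5; img[1] = 7;                      // known extrema for this sketch
    const s32 capacity = 16;                      // max locations to record per extremum
    // (col, row) pairs per location -- an assumption, see lead-in
    std::vector<size_t> minLocs(2 * capacity), maxLocs(2 * capacity);
    s32 minCount = 0, maxCount = 0;
    fillMinMaxLocs(size, img.data(), (ptrdiff_t)(size.width * sizeof(s32)),
                   -5, minLocs.data(), minCount, capacity,
                    7, maxLocs.data(), maxCount, capacity);
    // minCount / maxCount now hold the number of locations actually recorded
}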
opencv/3rdparty/carotene/src/flip.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace
CAROTENE_NS
{
bool
isFlipSupported
(
FLIP_MODE
flipMode
,
u32
elemSize
)
{
bool
supportedElemSize
=
(
elemSize
==
1
)
||
(
elemSize
==
2
)
||
(
elemSize
==
3
)
||
(
elemSize
==
4
);
return
isSupportedConfiguration
()
&&
((
supportedElemSize
&&
((
flipMode
==
FLIP_BOTH_MODE
)
||
(
flipMode
==
FLIP_HORIZONTAL_MODE
)))
||
(
flipMode
==
FLIP_VERTICAL_MODE
));
}
#ifdef CAROTENE_NEON
namespace
{
template
<
typename
T
>
void
flip
(
const
Size2D
&
size
,
const
void
*
srcBase
,
ptrdiff_t
srcStride
,
void
*
dstBase
,
ptrdiff_t
dstStride
,
FLIP_MODE
flipMode
)
{
using
namespace
internal
;
typedef
typename
VecTraits
<
T
>::
vec128
vec128
;
typedef
typename
VecTraits
<
T
>::
vec64
vec64
;
u32
step_base
=
16
/
sizeof
(
T
),
step_tail
=
8
/
sizeof
(
T
);
size_t
roiw_base
=
size
.
width
>=
(
step_base
-
1
)
?
size
.
width
-
step_base
+
1
:
0
;
size_t
roiw_tail
=
size
.
width
>=
(
step_tail
-
1
)
?
size
.
width
-
step_tail
+
1
:
0
;
for
(
size_t
i
=
0
;
i
<
size
.
height
;
++
i
)
{
const
T
*
src
=
getRowPtr
((
const
T
*
)
srcBase
,
srcStride
,
i
);
T
*
dst
=
getRowPtr
((
T
*
)
dstBase
,
dstStride
,
(
flipMode
&
FLIP_VERTICAL_MODE
)
!=
0
?
size
.
height
-
i
-
1
:
i
);
size_t
js
=
0
,
jd
=
size
.
width
;
for
(;
js
<
roiw_base
;
js
+=
step_base
,
jd
-=
step_base
)
{
prefetch
(
src
+
js
);
vec128
v_src
=
vld1q
(
src
+
js
);
vec128
v_dst
=
vrev64q
(
v_src
);
v_dst
=
vcombine
(
vget_high
(
v_dst
),
vget_low
(
v_dst
));
vst1q
(
dst
+
jd
-
step_base
,
v_dst
);
}
for
(;
js
<
roiw_tail
;
js
+=
step_tail
,
jd
-=
step_tail
)
{
vec64
v_src
=
vld1
(
src
+
js
);
vst1
(
dst
+
jd
-
step_tail
,
vrev64
(
v_src
));
}
for
(
--
jd
;
js
<
size
.
width
;
++
js
,
--
jd
)
dst
[
jd
]
=
src
[
js
];
}
}
template
<
typename
T
>
void
flip3
(
const
Size2D
&
size
,
const
void
*
srcBase
,
ptrdiff_t
srcStride
,
void
*
dstBase
,
ptrdiff_t
dstStride
,
FLIP_MODE
flipMode
)
{
using
namespace
internal
;
#ifndef __ANDROID__
typedef
typename
VecTraits
<
T
,
3
>::
vec128
vec128
;
#endif
typedef
typename
VecTraits
<
T
,
3
>::
vec64
vec64
;
#ifndef __ANDROID__
u32
step_base
=
16
/
sizeof
(
T
),
step_base3
=
step_base
*
3
;
size_t
roiw_base
=
size
.
width
>=
(
step_base
-
1
)
?
size
.
width
-
step_base
+
1
:
0
;
#endif
u32
step_tail
=
8
/
sizeof
(
T
),
step_tail3
=
step_tail
*
3
;
size_t
roiw_tail
=
size
.
width
>=
(
step_tail
-
1
)
?
size
.
width
-
step_tail
+
1
:
0
;
for
(
size_t
i
=
0
;
i
<
size
.
height
;
++
i
)
{
const
T
*
src
=
getRowPtr
((
const
T
*
)
srcBase
,
srcStride
,
i
);
T
*
dst
=
getRowPtr
((
T
*
)
dstBase
,
dstStride
,
(
flipMode
&
FLIP_VERTICAL_MODE
)
!=
0
?
size
.
height
-
i
-
1
:
i
);
size_t
j
=
0
,
js
=
0
,
jd
=
size
.
width
*
3
;
#ifndef __ANDROID__
for
(;
j
<
roiw_base
;
j
+=
step_base
,
js
+=
step_base3
,
jd
-=
step_base3
)
{
prefetch
(
src
+
js
);
vec128
v_src
=
vld3q
(
src
+
js
),
v_dst
;
v_src
.
val
[
0
]
=
vrev64q
(
v_src
.
val
[
0
]);
v_src
.
val
[
1
]
=
vrev64q
(
v_src
.
val
[
1
]);
v_src
.
val
[
2
]
=
vrev64q
(
v_src
.
val
[
2
]);
v_dst
.
val
[
0
]
=
vcombine
(
vget_high
(
v_src
.
val
[
0
]),
vget_low
(
v_src
.
val
[
0
]));
v_dst
.
val
[
1
]
=
vcombine
(
vget_high
(
v_src
.
val
[
1
]),
vget_low
(
v_src
.
val
[
1
]));
v_dst
.
val
[
2
]
=
vcombine
(
vget_high
(
v_src
.
val
[
2
]),
vget_low
(
v_src
.
val
[
2
]));
vst3q
(
dst
+
jd
-
step_base3
,
v_dst
);
}
#endif // __ANDROID__
for
(;
j
<
roiw_tail
;
j
+=
step_tail
,
js
+=
step_tail3
,
jd
-=
step_tail3
)
{
vec64
v_src
=
vld3
(
src
+
js
),
v_dst
;
v_dst
.
val
[
0
]
=
vrev64
(
v_src
.
val
[
0
]);
v_dst
.
val
[
1
]
=
vrev64
(
v_src
.
val
[
1
]);
v_dst
.
val
[
2
]
=
vrev64
(
v_src
.
val
[
2
]);
vst3
(
dst
+
jd
-
step_tail3
,
v_dst
);
}
for
(
jd
-=
3
;
j
<
size
.
width
;
++
j
,
js
+=
3
,
jd
-=
3
)
{
dst
[
jd
]
=
src
[
js
];
dst
[
jd
+
1
]
=
src
[
js
+
1
];
dst
[
jd
+
2
]
=
src
[
js
+
2
];
}
}
}
typedef
void
(
*
flipFunc
)(
const
Size2D
&
size
,
const
void
*
srcBase
,
ptrdiff_t
srcStride
,
void
*
dstBase
,
ptrdiff_t
dstStride
,
FLIP_MODE
flipMode
);
}
// namespace
#endif
void
flip
(
const
Size2D
&
size
,
const
u8
*
srcBase
,
ptrdiff_t
srcStride
,
u8
*
dstBase
,
ptrdiff_t
dstStride
,
FLIP_MODE
flipMode
,
u32
elemSize
)
{
internal
::
assertSupportedConfiguration
(
isFlipSupported
(
flipMode
,
elemSize
));
#ifdef CAROTENE_NEON
if
(
flipMode
==
FLIP_VERTICAL_MODE
)
{
for
(
size_t
y
=
0
;
y
<
size
.
height
;
++
y
)
{
const
u8
*
src_row
=
internal
::
getRowPtr
(
srcBase
,
srcStride
,
y
);
u8
*
dst_row
=
internal
::
getRowPtr
(
dstBase
,
dstStride
,
size
.
height
-
y
-
1
);
std
::
memcpy
(
dst_row
,
src_row
,
elemSize
*
size
.
width
);
}
return
;
}
flipFunc
func
=
NULL
;
if
(
elemSize
==
(
u32
)
sizeof
(
u8
))
func
=
&
flip
<
u8
>
;
if
(
elemSize
==
(
u32
)
sizeof
(
u16
))
func
=
&
flip
<
u16
>
;
if
(
elemSize
==
(
u32
)
sizeof
(
u32
))
func
=
&
flip
<
u32
>
;
if
(
elemSize
==
(
u32
)
sizeof
(
u8
)
*
3
)
func
=
&
flip3
<
u8
>
;
if
(
func
==
NULL
)
return
;
func
(
size
,
srcBase
,
srcStride
,
dstBase
,
dstStride
,
flipMode
);
#else
(
void
)
size
;
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
dstBase
;
(
void
)
dstStride
;
(
void
)
flipMode
;
(
void
)
elemSize
;
#endif
}
}
// namespace CAROTENE_NS
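A hedged usage sketch for the flip entry point above, not part of the commit. The header name and the Size2D constructor are assumptions about carotene's public interface; strides are in bytes per the getRowPtr arithmetic in the file.

#include <vector>
#include "carotene/functions.hpp" // assumed public header name

void flipExample()
{
    using namespace CAROTENE_NS;
    Size2D size(640, 480);                        // assumed (width, height) ctor
    std::vector<u8> src(size.width * size.height), dst(src.size());
    // single-channel u8 image, packed rows, stride in bytes
    flip(size, src.data(), (ptrdiff_t)size.width,
         dst.data(), (ptrdiff_t)size.width,
         FLIP_HORIZONTAL_MODE, 1u);
    // FLIP_VERTICAL_MODE takes the memcpy-per-row fast path shown above
}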
opencv/3rdparty/carotene/src/gaussian_blur.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include "separable_filter.hpp"
namespace
CAROTENE_NS
{
bool
isGaussianBlur3x3Supported
(
const
Size2D
&
size
,
BORDER_MODE
border
)
{
return
isSupportedConfiguration
()
&&
size
.
width
>=
8
&&
(
border
==
BORDER_MODE_CONSTANT
||
border
==
BORDER_MODE_REPLICATE
);
}
void
gaussianBlur3x3
(
const
Size2D
&
size
,
const
u8
*
srcBase
,
ptrdiff_t
srcStride
,
u8
*
dstBase
,
ptrdiff_t
dstStride
,
BORDER_MODE
border
,
u8
borderValue
)
{
internal
::
assertSupportedConfiguration
(
isGaussianBlur3x3Supported
(
size
,
border
));
#ifdef CAROTENE_NEON
const
uint16x8_t
v_border_x4
=
vdupq_n_u16
(
borderValue
<<
2
);
const
uint16x8_t
v_zero
=
vdupq_n_u16
(
0
);
const
uint8x8_t
v_border
=
vdup_n_u8
(
borderValue
);
uint16x8_t
tprev
=
v_zero
,
tcurr
=
v_zero
,
tnext
=
v_zero
;
uint16x8_t
t0
=
v_zero
,
t1
=
v_zero
,
t2
=
v_zero
;
ptrdiff_t
width
=
(
ptrdiff_t
)
size
.
width
,
height
=
(
ptrdiff_t
)
size
.
height
;
for
(
ptrdiff_t
y
=
0
;
y
<
height
;
++
y
)
{
const
u8
*
srow0
=
y
==
0
&&
border
==
BORDER_MODE_CONSTANT
?
NULL
:
internal
::
getRowPtr
(
srcBase
,
srcStride
,
std
::
max
<
ptrdiff_t
>
(
y
-
1
,
0
));
const
u8
*
srow1
=
internal
::
getRowPtr
(
srcBase
,
srcStride
,
y
);
const
u8
*
srow2
=
y
+
1
==
height
&&
border
==
BORDER_MODE_CONSTANT
?
NULL
:
internal
::
getRowPtr
(
srcBase
,
srcStride
,
std
::
min
(
y
+
1
,
height
-
1
));
u8
*
drow
=
internal
::
getRowPtr
(
dstBase
,
dstStride
,
y
);
s16
prevx
=
0
,
currx
=
0
,
nextx
=
0
;
ptrdiff_t
x
=
0
;
const
ptrdiff_t
bwidth
=
y
+
2
<
height
?
width
:
(
width
-
8
);
// perform vertical convolution
for
(
;
x
<=
bwidth
;
x
+=
8
)
{
internal
::
prefetch
(
srow0
+
x
);
internal
::
prefetch
(
srow1
+
x
);
internal
::
prefetch
(
srow2
+
x
);
uint8x8_t
x0
=
!
srow0
?
v_border
:
vld1_u8
(
srow0
+
x
);
uint8x8_t
x1
=
vld1_u8
(
srow1
+
x
);
uint8x8_t
x2
=
!
srow2
?
v_border
:
vld1_u8
(
srow2
+
x
);
// calculate values for plain CPU part below if needed
if
(
x
+
8
>=
bwidth
)
{
ptrdiff_t
x3
=
x
==
width
?
width
-
1
:
x
;
ptrdiff_t
x4
=
border
==
BORDER_MODE_CONSTANT
?
x3
-
1
:
std
::
max
<
ptrdiff_t
>
(
x3
-
1
,
0
);
if
(
border
==
BORDER_MODE_CONSTANT
&&
x4
<
0
)
prevx
=
borderValue
;
else
prevx
=
(
srow2
?
srow2
[
x4
]
:
borderValue
)
+
(
srow1
[
x4
]
<<
1
)
+
(
srow0
?
srow0
[
x4
]
:
borderValue
);
currx
=
(
srow2
?
srow2
[
x3
]
:
borderValue
)
+
(
srow1
[
x3
]
<<
1
)
+
(
srow0
?
srow0
[
x3
]
:
borderValue
);
}
// make shift
if
(
x
)
{
tprev
=
tcurr
;
tcurr
=
tnext
;
}
// and calculate next value
tnext
=
vaddq_u16
(
vaddl_u8
(
x0
,
x2
),
vshll_n_u8
(
x1
,
1
));
// make extrapolation for the first elements
if
(
!
x
)
{
// make border
if
(
border
==
BORDER_MODE_CONSTANT
)
tcurr
=
v_border_x4
;
else
if
(
border
==
BORDER_MODE_REPLICATE
)
tcurr
=
vdupq_n_u16
(
vgetq_lane_u16
(
tnext
,
0
));
continue
;
}
// combine 3 "shifted" vectors
t0
=
vextq_u16
(
tprev
,
tcurr
,
7
);
t1
=
tcurr
;
t2
=
vextq_u16
(
tcurr
,
tnext
,
1
);
// and add them
t0
=
vqaddq_u16
(
vshlq_n_u16
(
t1
,
1
),
vqaddq_u16
(
t0
,
t2
));
vst1_u8
(
drow
+
x
-
8
,
vshrn_n_u16
(
t0
,
4
));
}
x
-=
8
;
if
(
x
==
width
)
--
x
;
for
(
;
x
<
width
;
++
x
)
{
// make extrapolation for the last elements
if
(
x
+
1
>=
width
)
{
if
(
border
==
BORDER_MODE_CONSTANT
)
nextx
=
borderValue
<<
2
;
else
if
(
border
==
BORDER_MODE_REPLICATE
)
nextx
=
srow2
[
x
]
+
(
srow1
[
x
]
<<
1
)
+
srow0
[
x
];
}
else
nextx
=
(
srow2
?
srow2
[
x
+
1
]
:
borderValue
)
+
(
srow1
[
x
+
1
]
<<
1
)
+
(
srow0
?
srow0
[
x
+
1
]
:
borderValue
);
f32
val
=
(
prevx
+
(
currx
<<
1
)
+
nextx
)
>>
4
;
drow
[
x
]
=
internal
::
saturate_cast
<
u8
>
((
s32
)
val
);
// make shift
prevx
=
currx
;
currx
=
nextx
;
}
}
#else
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
dstBase
;
(
void
)
dstStride
;
(
void
)
borderValue
;
#endif
}
bool
isGaussianBlur3x3MarginSupported
(
const
Size2D
&
size
,
BORDER_MODE
border
,
Margin
borderMargin
)
{
return
isSeparableFilter3x3Supported
(
size
,
border
,
0
,
0
,
borderMargin
);
}
void
gaussianBlur3x3Margin
(
const
Size2D
&
size
,
const
u8
*
srcBase
,
ptrdiff_t
srcStride
,
u8
*
dstBase
,
ptrdiff_t
dstStride
,
BORDER_MODE
border
,
u8
borderValue
,
Margin
borderMargin
)
{
internal
::
assertSupportedConfiguration
(
isGaussianBlur3x3MarginSupported
(
size
,
border
,
borderMargin
));
#ifdef CAROTENE_NEON
internal
::
sepFilter3x3
<
internal
::
RowFilter3x3S16_121
,
internal
::
ColFilter3x3U8_121
>::
process
(
size
,
srcBase
,
srcStride
,
dstBase
,
dstStride
,
0
,
0
,
border
,
borderValue
,
borderMargin
);
#else
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
dstBase
;
(
void
)
dstStride
;
(
void
)
borderValue
;
#endif
}
bool
isGaussianBlur5x5Supported
(
const
Size2D
&
size
,
s32
cn
,
BORDER_MODE
border
)
{
return
isSupportedConfiguration
()
&&
cn
>
0
&&
cn
<=
4
&&
size
.
width
>=
8
&&
size
.
height
>=
2
&&
(
border
==
BORDER_MODE_CONSTANT
||
border
==
BORDER_MODE_REFLECT101
||
border
==
BORDER_MODE_REFLECT
||
border
==
BORDER_MODE_REPLICATE
||
border
==
BORDER_MODE_WRAP
);
}
void
gaussianBlur5x5
(
const
Size2D
&
size
,
s32
cn
,
const
u8
*
srcBase
,
ptrdiff_t
srcStride
,
u8
*
dstBase
,
ptrdiff_t
dstStride
,
BORDER_MODE
borderType
,
u8
borderValue
,
Margin
borderMargin
)
{
internal
::
assertSupportedConfiguration
(
isGaussianBlur5x5Supported
(
size
,
cn
,
borderType
));
#ifdef CAROTENE_NEON
size_t
colsn
=
size
.
width
*
cn
;
std
::
vector
<
u8
>
_tmp
;
u8
*
tmp
=
0
;
if
(
borderType
==
BORDER_MODE_CONSTANT
)
{
_tmp
.
assign
(
colsn
+
4
*
cn
,
borderValue
);
tmp
=
&
_tmp
[
cn
<<
1
];
}
ptrdiff_t
idx_l1
=
internal
::
borderInterpolate
(
-
1
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_l2
=
internal
::
borderInterpolate
(
-
2
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_r1
=
internal
::
borderInterpolate
(
size
.
width
+
0
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_r2
=
internal
::
borderInterpolate
(
size
.
width
+
1
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
//1-line buffer
std
::
vector
<
u16
>
_buf
(
cn
*
(
size
.
width
+
4
)
+
32
/
sizeof
(
u16
));
u16
*
lane
=
internal
::
alignPtr
(
&
_buf
[
cn
<<
1
],
32
);
if
(
borderType
==
BORDER_MODE_CONSTANT
)
for
(
s32
k
=
0
;
k
<
cn
;
++
k
)
{
lane
[
-
cn
+
k
]
=
borderValue
;
lane
[
-
cn
-
cn
+
k
]
=
borderValue
;
lane
[
colsn
+
k
]
=
borderValue
;
lane
[
colsn
+
cn
+
k
]
=
borderValue
;
}
uint8x8_t
vc6u8
=
vmov_n_u8
(
6
);
uint16x8_t
vc6u16
=
vmovq_n_u16
(
6
);
uint16x8_t
vc4u16
=
vmovq_n_u16
(
4
);
for
(
size_t
i
=
0
;
i
<
size
.
height
;
++
i
)
{
u8
*
dst
=
internal
::
getRowPtr
(
dstBase
,
dstStride
,
i
);
//vertical convolution
ptrdiff_t
idx_rm2
=
internal
::
borderInterpolate
(
i
-
2
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rm1
=
internal
::
borderInterpolate
(
i
-
1
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rp1
=
internal
::
borderInterpolate
(
i
+
1
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rp2
=
internal
::
borderInterpolate
(
i
+
2
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
const
u8
*
ln0
=
idx_rm2
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rm2
)
:
tmp
;
const
u8
*
ln1
=
idx_rm1
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rm1
)
:
tmp
;
const
u8
*
ln2
=
internal
::
getRowPtr
(
srcBase
,
srcStride
,
i
);
const
u8
*
ln3
=
idx_rp1
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rp1
)
:
tmp
;
const
u8
*
ln4
=
idx_rp2
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rp2
)
:
tmp
;
size_t
x
=
0
;
for
(;
x
<=
colsn
-
8
;
x
+=
8
)
{
internal
::
prefetch
(
internal
::
getRowPtr
(
ln2
+
x
,
srcStride
,
x
%
5
-
2
));
uint8x8_t
v0
=
vld1_u8
(
ln0
+
x
);
uint8x8_t
v1
=
vld1_u8
(
ln1
+
x
);
uint8x8_t
v2
=
vld1_u8
(
ln2
+
x
);
uint8x8_t
v3
=
vld1_u8
(
ln3
+
x
);
uint8x8_t
v4
=
vld1_u8
(
ln4
+
x
);
uint16x8_t
v
=
vaddl_u8
(
v0
,
v4
);
uint16x8_t
v13
=
vaddl_u8
(
v1
,
v3
);
v
=
vmlal_u8
(
v
,
v2
,
vc6u8
);
v
=
vmlaq_u16
(
v
,
v13
,
vc4u16
);
vst1q_u16
(
lane
+
x
,
v
);
}
for
(;
x
<
colsn
;
++
x
)
lane
[
x
]
=
ln0
[
x
]
+
ln4
[
x
]
+
u16
(
4
)
*
(
ln1
[
x
]
+
ln3
[
x
])
+
u16
(
6
)
*
ln2
[
x
];
//left&right borders
if
(
borderType
!=
BORDER_MODE_CONSTANT
)
for
(
s32
k
=
0
;
k
<
cn
;
++
k
)
{
lane
[
-
cn
+
k
]
=
lane
[
idx_l1
+
k
];
lane
[
-
cn
-
cn
+
k
]
=
lane
[
idx_l2
+
k
];
lane
[
colsn
+
k
]
=
lane
[
idx_r1
+
k
];
lane
[
colsn
+
cn
+
k
]
=
lane
[
idx_r2
+
k
];
}
//horizontal convolution
x
=
0
;
switch
(
cn
)
{
case
1
:
for
(;
x
<=
colsn
-
8
;
x
+=
8
)
{
internal
::
prefetch
(
lane
+
x
);
uint16x8_t
lane0
=
vld1q_u16
(
lane
+
x
-
2
);
uint16x8_t
lane4
=
vld1q_u16
(
lane
+
x
+
2
);
uint16x8_t
lane1
=
vld1q_u16
(
lane
+
x
-
1
);
uint16x8_t
lane3
=
vld1q_u16
(
lane
+
x
+
1
);
uint16x8_t
lane2
=
vld1q_u16
(
lane
+
x
+
0
);
uint16x8_t
ln04
=
vaddq_u16
(
lane0
,
lane4
);
uint16x8_t
ln13
=
vaddq_u16
(
lane1
,
lane3
);
uint16x8_t
ln042
=
vmlaq_u16
(
ln04
,
lane2
,
vc6u16
);
uint16x8_t
lsw
=
vmlaq_u16
(
ln042
,
ln13
,
vc4u16
);
uint8x8_t
ls
=
vrshrn_n_u16
(
lsw
,
8
);
vst1_u8
(
dst
+
x
,
ls
);
}
break
;
case
2
:
for
(;
x
<=
colsn
-
8
*
2
;
x
+=
8
*
2
)
{
internal
::
prefetch
(
lane
+
x
);
u16
*
lidx0
=
lane
+
x
-
2
*
2
;
u16
*
lidx1
=
lane
+
x
-
1
*
2
;
u16
*
lidx3
=
lane
+
x
+
1
*
2
;
u16
*
lidx4
=
lane
+
x
+
2
*
2
;
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
__asm__
__volatile__
(
"vld2.16 {d0, d2}, [%[in0]]!
\n\t
"
"vld2.16 {d1, d3}, [%[in0]]
\n\t
"
"vld2.16 {d8, d10}, [%[in4]]!
\n\t
"
"vld2.16 {d9, d11}, [%[in4]]
\n\t
"
"vadd.i16 q0, q4
\n\t
"
"vadd.i16 q1, q5
\n\t
"
"vld2.16 {d16, d18}, [%[in1]]!
\n\t
"
"vld2.16 {d17, d19}, [%[in1]]
\n\t
"
"vld2.16 {d8, d10}, [%[in3]]!
\n\t
"
"vld2.16 {d9, d11}, [%[in3]]
\n\t
"
"vadd.i16 q4, q8
\n\t
"
"vadd.i16 q5, q9
\n\t
"
"vld2.16 {d16, d18}, [%[in2]]
\n\t
"
"vld2.16 {d17, d19}, [%[in22]]
\n\t
"
"vmla.i16 q0, q4, %q[c4]
\n\t
"
"vmla.i16 q1, q5, %q[c4]
\n\t
"
"vmla.i16 q0, q8, %q[c6]
\n\t
"
"vmla.i16 q1, q9, %q[c6]
\n\t
"
"vrshrn.u16 d8, q0, #8
\n\t
"
"vrshrn.u16 d9, q1, #8
\n\t
"
"vst2.8 {d8-d9}, [%[out]]
\n\t
"
:
[
in0
]
"=r"
(
lidx0
),
[
in1
]
"=r"
(
lidx1
),
[
in3
]
"=r"
(
lidx3
),
[
in4
]
"=r"
(
lidx4
)
:
[
out
]
"r"
(
dst
+
x
),
"0"
(
lidx0
),
"1"
(
lidx1
),
"2"
(
lidx3
),
"3"
(
lidx4
),
[
in2
]
"r"
(
lane
+
x
),
[
in22
]
"r"
(
lane
+
x
+
4
*
2
),
[
c4
]
"w"
(
vc4u16
),
[
c6
]
"w"
(
vc6u16
)
:
"d0"
,
"d1"
,
"d2"
,
"d3"
,
"d4"
,
"d5"
,
"d6"
,
"d7"
,
"d8"
,
"d9"
,
"d10"
,
"d11"
,
"d12"
,
"d13"
,
"d14"
,
"d15"
,
"d16"
,
"d17"
,
"d18"
,
"d19"
,
"d20"
,
"d21"
,
"d22"
,
"d23"
);
#else
uint16x8x2_t
vLane0
=
vld2q_u16
(
lidx0
);
uint16x8x2_t
vLane1
=
vld2q_u16
(
lidx1
);
uint16x8x2_t
vLane2
=
vld2q_u16
(
lane
+
x
);
uint16x8x2_t
vLane3
=
vld2q_u16
(
lidx3
);
uint16x8x2_t
vLane4
=
vld2q_u16
(
lidx4
);
uint16x8_t
vSum_0_4
=
vaddq_u16
(
vLane0
.
val
[
0
],
vLane4
.
val
[
0
]);
uint16x8_t
vSum_1_5
=
vaddq_u16
(
vLane0
.
val
[
1
],
vLane4
.
val
[
1
]);
uint16x8_t
vSum_4_8
=
vaddq_u16
(
vLane1
.
val
[
0
],
vLane3
.
val
[
0
]);
uint16x8_t
vSum_5_9
=
vaddq_u16
(
vLane1
.
val
[
1
],
vLane3
.
val
[
1
]);
vSum_0_4
=
vmlaq_u16
(
vSum_0_4
,
vSum_4_8
,
vc4u16
);
vSum_1_5
=
vmlaq_u16
(
vSum_1_5
,
vSum_5_9
,
vc4u16
);
vSum_0_4
=
vmlaq_u16
(
vSum_0_4
,
vLane2
.
val
[
0
],
vc6u16
);
vSum_1_5
=
vmlaq_u16
(
vSum_1_5
,
vLane2
.
val
[
1
],
vc6u16
);
uint8x8x2_t
vRes
;
vRes
.
val
[
0
]
=
vrshrn_n_u16
(
vSum_0_4
,
8
);
vRes
.
val
[
1
]
=
vrshrn_n_u16
(
vSum_1_5
,
8
);
vst2_u8
(
dst
+
x
,
vRes
);
#endif
}
break
;
case
3
:
for
(;
x
<=
colsn
-
8
*
3
;
x
+=
8
*
3
)
{
internal
::
prefetch
(
lane
+
x
);
u16
*
lidx0
=
lane
+
x
-
2
*
3
;
u16
*
lidx1
=
lane
+
x
-
1
*
3
;
u16
*
lidx3
=
lane
+
x
+
1
*
3
;
u16
*
lidx4
=
lane
+
x
+
2
*
3
;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
__asm__
__volatile__
(
"vld3.16 {d0, d2, d4}, [%[in0]]!
\n\t
"
"vld3.16 {d1, d3, d5}, [%[in0]]
\n\t
"
"vld3.16 {d8, d10, d12}, [%[in4]]!
\n\t
"
"vld3.16 {d9, d11, d13}, [%[in4]]
\n\t
"
"vadd.i16 q0, q4
\n\t
"
"vadd.i16 q1, q5
\n\t
"
"vadd.i16 q2, q6
\n\t
"
"vld3.16 {d16, d18, d20}, [%[in1]]!
\n\t
"
"vld3.16 {d17, d19, d21}, [%[in1]]
\n\t
"
"vld3.16 {d8, d10, d12}, [%[in3]]!
\n\t
"
"vld3.16 {d9, d11, d13}, [%[in3]]
\n\t
"
"vadd.i16 q4, q8
\n\t
"
"vadd.i16 q5, q9
\n\t
"
"vadd.i16 q6, q10
\n\t
"
"vld3.16 {d16, d18, d20}, [%[in2]]
\n\t
"
"vld3.16 {d17, d19, d21}, [%[in22]]
\n\t
"
"vmla.i16 q0, q4, %q[c4]
\n\t
"
"vmla.i16 q1, q5, %q[c4]
\n\t
"
"vmla.i16 q2, q6, %q[c4]
\n\t
"
"vmla.i16 q0, q8, %q[c6]
\n\t
"
"vmla.i16 q1, q9, %q[c6]
\n\t
"
"vmla.i16 q2, q10, %q[c6]
\n\t
"
"vrshrn.u16 d8, q0, #8
\n\t
"
"vrshrn.u16 d9, q1, #8
\n\t
"
"vrshrn.u16 d10, q2, #8
\n\t
"
"vst3.8 {d8-d10}, [%[out]]
\n\t
"
:
[
in0
]
"=r"
(
lidx0
),
[
in1
]
"=r"
(
lidx1
),
[
in3
]
"=r"
(
lidx3
),
[
in4
]
"=r"
(
lidx4
)
:
[
out
]
"r"
(
dst
+
x
),
"0"
(
lidx0
),
"1"
(
lidx1
),
"2"
(
lidx3
),
"3"
(
lidx4
),
[
in2
]
"r"
(
lane
+
x
),
[
in22
]
"r"
(
lane
+
x
+
4
*
3
),
[
c4
]
"w"
(
vc4u16
),
[
c6
]
"w"
(
vc6u16
)
:
"d0"
,
"d1"
,
"d2"
,
"d3"
,
"d4"
,
"d5"
,
"d6"
,
"d7"
,
"d8"
,
"d9"
,
"d10"
,
"d11"
,
"d12"
,
"d13"
,
"d14"
,
"d15"
,
"d16"
,
"d17"
,
"d18"
,
"d19"
,
"d20"
,
"d21"
,
"d22"
,
"d23"
);
#else
uint16x8x3_t
vLane0
=
vld3q_u16
(
lidx0
);
uint16x8x3_t
vLane1
=
vld3q_u16
(
lidx1
);
uint16x8x3_t
vLane2
=
vld3q_u16
(
lane
+
x
);
uint16x8x3_t
vLane3
=
vld3q_u16
(
lidx3
);
uint16x8x3_t
vLane4
=
vld3q_u16
(
lidx4
);
uint16x8_t
vSum_0_4
=
vaddq_u16
(
vLane0
.
val
[
0
],
vLane4
.
val
[
0
]);
uint16x8_t
vSum_1_5
=
vaddq_u16
(
vLane0
.
val
[
1
],
vLane4
.
val
[
1
]);
uint16x8_t
vSum_2_6
=
vaddq_u16
(
vLane0
.
val
[
2
],
vLane4
.
val
[
2
]);
uint16x8_t
vSum_3_1
=
vaddq_u16
(
vLane3
.
val
[
0
],
vLane1
.
val
[
0
]);
uint16x8_t
vSum_4_2
=
vaddq_u16
(
vLane3
.
val
[
1
],
vLane1
.
val
[
1
]);
uint16x8_t
vSum_5_6
=
vaddq_u16
(
vLane3
.
val
[
2
],
vLane1
.
val
[
2
]);
vSum_0_4
=
vmlaq_u16
(
vSum_0_4
,
vSum_3_1
,
vc4u16
);
vSum_1_5
=
vmlaq_u16
(
vSum_1_5
,
vSum_4_2
,
vc4u16
);
vSum_2_6
=
vmlaq_u16
(
vSum_2_6
,
vSum_5_6
,
vc4u16
);
vSum_0_4
=
vmlaq_u16
(
vSum_0_4
,
vLane2
.
val
[
0
],
vc6u16
);
vSum_1_5
=
vmlaq_u16
(
vSum_1_5
,
vLane2
.
val
[
1
],
vc6u16
);
vSum_2_6
=
vmlaq_u16
(
vSum_2_6
,
vLane2
.
val
[
2
],
vc6u16
);
uint8x8x3_t
vRes
;
vRes
.
val
[
0
]
=
vrshrn_n_u16
(
vSum_0_4
,
8
);
vRes
.
val
[
1
]
=
vrshrn_n_u16
(
vSum_1_5
,
8
);
vRes
.
val
[
2
]
=
vrshrn_n_u16
(
vSum_2_6
,
8
);
vst3_u8
(
dst
+
x
,
vRes
);
#endif
}
break
;
case
4
:
for
(;
x
<=
colsn
-
8
*
4
;
x
+=
8
*
4
)
{
internal
::
prefetch
(
lane
+
x
);
internal
::
prefetch
(
lane
+
x
+
16
);
u16
*
lidx0
=
lane
+
x
-
2
*
4
;
u16
*
lidx1
=
lane
+
x
-
1
*
4
;
u16
*
lidx3
=
lane
+
x
+
1
*
4
;
u16
*
lidx4
=
lane
+
x
+
2
*
4
;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
__asm__
__volatile__
(
"vld4.16 {d0, d2, d4, d6}, [%[in0]]!
\n\t
"
"vld4.16 {d1, d3, d5, d7}, [%[in0]]
\n\t
"
"vld4.16 {d8, d10, d12, d14}, [%[in4]]!
\n\t
"
"vld4.16 {d9, d11, d13, d15}, [%[in4]]
\n\t
"
"vadd.i16 q0, q4
\n\t
"
"vadd.i16 q1, q5
\n\t
"
"vadd.i16 q2, q6
\n\t
"
"vadd.i16 q3, q7
\n\t
"
"vld4.16 {d16, d18, d20, d22}, [%[in1]]!
\n\t
"
"vld4.16 {d17, d19, d21, d23}, [%[in1]]
\n\t
"
"vld4.16 {d8, d10, d12, d14}, [%[in3]]!
\n\t
"
"vld4.16 {d9, d11, d13, d15}, [%[in3]]
\n\t
"
"vadd.i16 q4, q8
\n\t
"
"vadd.i16 q5, q9
\n\t
"
"vadd.i16 q6, q10
\n\t
"
"vadd.i16 q7, q11
\n\t
"
"vld4.16 {d16, d18, d20, d22}, [%[in2],:256]
\n\t
"
"vld4.16 {d17, d19, d21, d23}, [%[in22],:256]
\n\t
"
"vmla.i16 q0, q4, %q[c4]
\n\t
"
"vmla.i16 q1, q5, %q[c4]
\n\t
"
"vmla.i16 q2, q6, %q[c4]
\n\t
"
"vmla.i16 q3, q7, %q[c4]
\n\t
"
"vmla.i16 q0, q8, %q[c6]
\n\t
"
"vmla.i16 q1, q9, %q[c6]
\n\t
"
"vmla.i16 q2, q10, %q[c6]
\n\t
"
"vmla.i16 q3, q11, %q[c6]
\n\t
"
"vrshrn.u16 d8, q0, #8
\n\t
"
"vrshrn.u16 d9, q1, #8
\n\t
"
"vrshrn.u16 d10, q2, #8
\n\t
"
"vrshrn.u16 d11, q3, #8
\n\t
"
"vst4.8 {d8-d11}, [%[out]]
\n\t
"
:
[
in0
]
"=r"
(
lidx0
),
[
in1
]
"=r"
(
lidx1
),
[
in3
]
"=r"
(
lidx3
),
[
in4
]
"=r"
(
lidx4
)
:
[
out
]
"r"
(
dst
+
x
),
"0"
(
lidx0
),
"1"
(
lidx1
),
"2"
(
lidx3
),
"3"
(
lidx4
),
[
in2
]
"r"
(
lane
+
x
),
[
in22
]
"r"
(
lane
+
x
+
4
*
4
),
[
c4
]
"w"
(
vc4u16
),
[
c6
]
"w"
(
vc6u16
)
:
"d0"
,
"d1"
,
"d2"
,
"d3"
,
"d4"
,
"d5"
,
"d6"
,
"d7"
,
"d8"
,
"d9"
,
"d10"
,
"d11"
,
"d12"
,
"d13"
,
"d14"
,
"d15"
,
"d16"
,
"d17"
,
"d18"
,
"d19"
,
"d20"
,
"d21"
,
"d22"
,
"d23"
);
#else
uint16x8x4_t
vLane0
=
vld4q_u16
(
lidx0
);
uint16x8x4_t
vLane2
=
vld4q_u16
(
lidx4
);
uint16x8x4_t
vLane4
=
vld4q_u16
(
lidx1
);
uint16x8x4_t
vLane6
=
vld4q_u16
(
lidx3
);
uint16x8x4_t
vLane8
=
vld4q_u16
(
lane
+
x
);
uint16x8_t
vSum_0_4
=
vaddq_u16
(
vLane0
.
val
[
0
],
vLane2
.
val
[
0
]);
uint16x8_t
vSum_1_5
=
vaddq_u16
(
vLane0
.
val
[
1
],
vLane2
.
val
[
1
]);
uint16x8_t
vSum_2_6
=
vaddq_u16
(
vLane0
.
val
[
2
],
vLane2
.
val
[
2
]);
uint16x8_t
vSum_3_7
=
vaddq_u16
(
vLane0
.
val
[
3
],
vLane2
.
val
[
3
]);
uint16x8_t
vSum_4_8
=
vaddq_u16
(
vLane4
.
val
[
0
],
vLane6
.
val
[
0
]);
uint16x8_t
vSum_5_9
=
vaddq_u16
(
vLane4
.
val
[
1
],
vLane6
.
val
[
1
]);
uint16x8_t
vSum_6_10
=
vaddq_u16
(
vLane4
.
val
[
2
],
vLane6
.
val
[
2
]);
uint16x8_t
vSum_7_11
=
vaddq_u16
(
vLane4
.
val
[
3
],
vLane6
.
val
[
3
]);
vSum_0_4
=
vmlaq_u16
(
vSum_0_4
,
vSum_4_8
,
vc4u16
);
vSum_1_5
=
vmlaq_u16
(
vSum_1_5
,
vSum_5_9
,
vc4u16
);
vSum_2_6
=
vmlaq_u16
(
vSum_2_6
,
vSum_6_10
,
vc4u16
);
vSum_3_7
=
vmlaq_u16
(
vSum_3_7
,
vSum_7_11
,
vc4u16
);
vSum_0_4
=
vmlaq_u16
(
vSum_0_4
,
vLane8
.
val
[
0
],
vc6u16
);
vSum_1_5
=
vmlaq_u16
(
vSum_1_5
,
vLane8
.
val
[
1
],
vc6u16
);
vSum_2_6
=
vmlaq_u16
(
vSum_2_6
,
vLane8
.
val
[
2
],
vc6u16
);
vSum_3_7
=
vmlaq_u16
(
vSum_3_7
,
vLane8
.
val
[
3
],
vc6u16
);
uint8x8x4_t
vRes
;
vRes
.
val
[
0
]
=
vrshrn_n_u16
(
vSum_0_4
,
8
);
vRes
.
val
[
1
]
=
vrshrn_n_u16
(
vSum_1_5
,
8
);
vRes
.
val
[
2
]
=
vrshrn_n_u16
(
vSum_2_6
,
8
);
vRes
.
val
[
3
]
=
vrshrn_n_u16
(
vSum_3_7
,
8
);
vst4_u8
(
dst
+
x
,
vRes
);
#endif
}
break
;
}
for
(
s32
h
=
0
;
h
<
cn
;
++
h
)
{
u16
*
ln
=
lane
+
h
;
u8
*
dt
=
dst
+
h
;
for
(
size_t
k
=
x
;
k
<
colsn
;
k
+=
cn
)
{
dt
[
k
]
=
(
u8
)((
ln
[
k
-
2
*
cn
]
+
ln
[
k
+
2
*
cn
]
+
u16
(
4
)
*
(
ln
[
k
-
cn
]
+
ln
[
k
+
cn
])
+
u16
(
6
)
*
ln
[
k
]
+
(
1
<<
7
))
>>
8
);
}
}
}
#else
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
dstBase
;
(
void
)
dstStride
;
(
void
)
borderValue
;
(
void
)
borderMargin
;
#endif
}
void
gaussianBlur5x5
(
const
Size2D
&
size
,
s32
cn
,
const
u16
*
srcBase
,
ptrdiff_t
srcStride
,
u16
*
dstBase
,
ptrdiff_t
dstStride
,
BORDER_MODE
borderType
,
u16
borderValue
,
Margin
borderMargin
)
{
internal
::
assertSupportedConfiguration
(
isGaussianBlur5x5Supported
(
size
,
cn
,
borderType
));
#ifdef CAROTENE_NEON
size_t
colsn
=
size
.
width
*
cn
;
std
::
vector
<
u16
>
_tmp
;
u16
*
tmp
=
0
;
if
(
borderType
==
BORDER_MODE_CONSTANT
)
{
_tmp
.
assign
(
colsn
+
4
*
cn
,
borderValue
);
tmp
=
&
_tmp
[
cn
<<
1
];
}
ptrdiff_t
idx_l1
=
internal
::
borderInterpolate
(
-
1
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_l2
=
internal
::
borderInterpolate
(
-
2
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_r1
=
internal
::
borderInterpolate
(
size
.
width
+
0
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_r2
=
internal
::
borderInterpolate
(
size
.
width
+
1
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
//1-line buffer
std
::
vector
<
u32
>
_buf
(
cn
*
(
size
.
width
+
4
)
+
32
/
sizeof
(
u32
));
u32
*
lane
=
internal
::
alignPtr
(
&
_buf
[
cn
<<
1
],
32
);
if
(
borderType
==
BORDER_MODE_CONSTANT
)
for
(
s32
k
=
0
;
k
<
cn
;
++
k
)
{
lane
[
-
cn
+
k
]
=
borderValue
;
lane
[
-
cn
-
cn
+
k
]
=
borderValue
;
lane
[
colsn
+
k
]
=
borderValue
;
lane
[
colsn
+
cn
+
k
]
=
borderValue
;
}
uint16x4_t
vc6u16
=
vmov_n_u16
(
6
);
uint32x4_t
vc6u32
=
vmovq_n_u32
(
6
);
uint32x4_t
vc4u32
=
vmovq_n_u32
(
4
);
for
(
size_t
i
=
0
;
i
<
size
.
height
;
++
i
)
{
u16
*
dst
=
internal
::
getRowPtr
(
dstBase
,
dstStride
,
i
);
//vertical convolution
ptrdiff_t
idx_rm2
=
internal
::
borderInterpolate
(
i
-
2
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rm1
=
internal
::
borderInterpolate
(
i
-
1
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rp1
=
internal
::
borderInterpolate
(
i
+
1
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rp2
=
internal
::
borderInterpolate
(
i
+
2
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
const
u16
*
ln0
=
idx_rm2
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rm2
)
:
tmp
;
const
u16
*
ln1
=
idx_rm1
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rm1
)
:
tmp
;
const
u16
*
ln2
=
internal
::
getRowPtr
(
srcBase
,
srcStride
,
i
);
const
u16
*
ln3
=
idx_rp1
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rp1
)
:
tmp
;
const
u16
*
ln4
=
idx_rp2
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rp2
)
:
tmp
;
size_t
x
=
0
;
for
(;
x
<=
colsn
-
4
;
x
+=
4
)
{
internal
::
prefetch
(
internal
::
getRowPtr
(
ln2
+
x
,
srcStride
,
x
%
5
-
2
));
uint16x4_t
v0
=
vld1_u16
(
ln0
+
x
);
uint16x4_t
v1
=
vld1_u16
(
ln1
+
x
);
uint16x4_t
v2
=
vld1_u16
(
ln2
+
x
);
uint16x4_t
v3
=
vld1_u16
(
ln3
+
x
);
uint16x4_t
v4
=
vld1_u16
(
ln4
+
x
);
uint32x4_t
v
=
vaddl_u16
(
v0
,
v4
);
uint32x4_t
v13
=
vaddl_u16
(
v1
,
v3
);
v
=
vmlal_u16
(
v
,
v2
,
vc6u16
);
v
=
vmlaq_u32
(
v
,
v13
,
vc4u32
);
vst1q_u32
(
lane
+
x
,
v
);
}
for
(;
x
<
colsn
;
++
x
)
lane
[
x
]
=
ln0
[
x
]
+
ln4
[
x
]
+
4
*
(
ln1
[
x
]
+
ln3
[
x
])
+
6
*
ln2
[
x
];
//left&right borders
if
(
borderType
!=
BORDER_MODE_CONSTANT
)
for
(
s32
k
=
0
;
k
<
cn
;
++
k
)
{
lane
[
-
cn
+
k
]
=
lane
[
idx_l1
+
k
];
lane
[
-
cn
-
cn
+
k
]
=
lane
[
idx_l2
+
k
];
lane
[
colsn
+
k
]
=
lane
[
idx_r1
+
k
];
lane
[
colsn
+
cn
+
k
]
=
lane
[
idx_r2
+
k
];
}
//horizontal convolution
x
=
0
;
for
(;
x
<=
colsn
-
4
;
x
+=
4
)
{
internal
::
prefetch
(
lane
+
x
);
uint32x4_t
lane0
=
vld1q_u32
(
lane
+
x
-
2
);
uint32x4_t
lane4
=
vld1q_u32
(
lane
+
x
+
2
);
uint32x4_t
lane1
=
vld1q_u32
(
lane
+
x
-
1
);
uint32x4_t
lane3
=
vld1q_u32
(
lane
+
x
+
1
);
uint32x4_t
lane2
=
vld1q_u32
(
lane
+
x
+
0
);
uint32x4_t
ln04
=
vaddq_u32
(
lane0
,
lane4
);
uint32x4_t
ln13
=
vaddq_u32
(
lane1
,
lane3
);
uint32x4_t
ln042
=
vmlaq_u32
(
ln04
,
lane2
,
vc6u32
);
uint32x4_t
lsw
=
vmlaq_u32
(
ln042
,
ln13
,
vc4u32
);
uint16x4_t
ls
=
vrshrn_n_u32
(
lsw
,
8
);
vst1_u16
(
dst
+
x
,
ls
);
}
for
(
s32
h
=
0
;
h
<
cn
;
++
h
)
{
u32
*
ln
=
lane
+
h
;
u16
*
dt
=
dst
+
h
;
for
(
size_t
k
=
x
;
k
<
colsn
;
k
+=
cn
)
{
dt
[
k
]
=
(
u16
)((
ln
[
k
-
2
*
cn
]
+
ln
[
k
+
2
*
cn
]
+
4
*
(
ln
[
k
-
cn
]
+
ln
[
k
+
cn
])
+
6
*
ln
[
k
]
+
(
1
<<
7
))
>>
8
);
}
}
}
#else
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
dstBase
;
(
void
)
dstStride
;
(
void
)
borderValue
;
(
void
)
borderMargin
;
#endif
}
void
gaussianBlur5x5
(
const
Size2D
&
size
,
s32
cn
,
const
s16
*
srcBase
,
ptrdiff_t
srcStride
,
s16
*
dstBase
,
ptrdiff_t
dstStride
,
BORDER_MODE
borderType
,
s16
borderValue
,
Margin
borderMargin
)
{
internal
::
assertSupportedConfiguration
(
isGaussianBlur5x5Supported
(
size
,
cn
,
borderType
));
#ifdef CAROTENE_NEON
size_t
colsn
=
size
.
width
*
cn
;
std
::
vector
<
s16
>
_tmp
;
s16
*
tmp
=
0
;
if
(
borderType
==
BORDER_MODE_CONSTANT
)
{
_tmp
.
assign
(
colsn
+
4
*
cn
,
borderValue
);
tmp
=
&
_tmp
[
cn
<<
1
];
}
ptrdiff_t
idx_l1
=
internal
::
borderInterpolate
(
-
1
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_l2
=
internal
::
borderInterpolate
(
-
2
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_r1
=
internal
::
borderInterpolate
(
size
.
width
+
0
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_r2
=
internal
::
borderInterpolate
(
size
.
width
+
1
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
//1-line buffer
std
::
vector
<
s32
>
_buf
(
cn
*
(
size
.
width
+
4
)
+
32
/
sizeof
(
s32
));
s32
*
lane
=
internal
::
alignPtr
(
&
_buf
[
cn
<<
1
],
32
);
if
(
borderType
==
BORDER_MODE_CONSTANT
)
for
(
s32
k
=
0
;
k
<
cn
;
++
k
)
{
lane
[
-
cn
+
k
]
=
borderValue
;
lane
[
-
cn
-
cn
+
k
]
=
borderValue
;
lane
[
colsn
+
k
]
=
borderValue
;
lane
[
colsn
+
cn
+
k
]
=
borderValue
;
}
int16x4_t
vc6s16
=
vmov_n_s16
(
6
);
int32x4_t
vc6s32
=
vmovq_n_s32
(
6
);
int32x4_t
vc4s32
=
vmovq_n_s32
(
4
);
for
(
size_t
i
=
0
;
i
<
size
.
height
;
++
i
)
{
s16
*
dst
=
internal
::
getRowPtr
(
dstBase
,
dstStride
,
i
);
//vertical convolution
ptrdiff_t
idx_rm2
=
internal
::
borderInterpolate
(
i
-
2
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rm1
=
internal
::
borderInterpolate
(
i
-
1
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rp1
=
internal
::
borderInterpolate
(
i
+
1
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rp2
=
internal
::
borderInterpolate
(
i
+
2
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
const
s16
*
ln0
=
idx_rm2
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rm2
)
:
tmp
;
const
s16
*
ln1
=
idx_rm1
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rm1
)
:
tmp
;
const
s16
*
ln2
=
internal
::
getRowPtr
(
srcBase
,
srcStride
,
i
);
const
s16
*
ln3
=
idx_rp1
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rp1
)
:
tmp
;
const
s16
*
ln4
=
idx_rp2
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rp2
)
:
tmp
;
size_t
x
=
0
;
for
(;
x
<=
colsn
-
4
;
x
+=
4
)
{
internal
::
prefetch
(
internal
::
getRowPtr
(
ln2
+
x
,
srcStride
,
x
%
5
-
2
));
int16x4_t
v0
=
vld1_s16
(
ln0
+
x
);
int16x4_t
v1
=
vld1_s16
(
ln1
+
x
);
int16x4_t
v2
=
vld1_s16
(
ln2
+
x
);
int16x4_t
v3
=
vld1_s16
(
ln3
+
x
);
int16x4_t
v4
=
vld1_s16
(
ln4
+
x
);
int32x4_t
v
=
vaddl_s16
(
v0
,
v4
);
int32x4_t
v13
=
vaddl_s16
(
v1
,
v3
);
v
=
vmlal_s16
(
v
,
v2
,
vc6s16
);
v
=
vmlaq_s32
(
v
,
v13
,
vc4s32
);
vst1q_s32
(
lane
+
x
,
v
);
}
for
(;
x
<
colsn
;
++
x
)
lane
[
x
]
=
ln0
[
x
]
+
ln4
[
x
]
+
4
*
(
ln1
[
x
]
+
ln3
[
x
])
+
6
*
ln2
[
x
];
//left&right borders
if
(
borderType
!=
BORDER_MODE_CONSTANT
)
for
(
s32
k
=
0
;
k
<
cn
;
++
k
)
{
lane
[
-
cn
+
k
]
=
lane
[
idx_l1
+
k
];
lane
[
-
cn
-
cn
+
k
]
=
lane
[
idx_l2
+
k
];
lane
[
colsn
+
k
]
=
lane
[
idx_r1
+
k
];
lane
[
colsn
+
cn
+
k
]
=
lane
[
idx_r2
+
k
];
}
//horizontal convolution
x
=
0
;
switch
(
cn
)
{
case
1
:
case
2
:
case
3
:
for
(;
x
<=
colsn
-
4
;
x
+=
4
)
{
internal
::
prefetch
(
lane
+
x
);
int32x4_t
lane0
=
vld1q_s32
(
lane
+
x
-
2
);
int32x4_t
lane4
=
vld1q_s32
(
lane
+
x
+
2
);
int32x4_t
lane1
=
vld1q_s32
(
lane
+
x
-
1
);
int32x4_t
lane3
=
vld1q_s32
(
lane
+
x
+
1
);
int32x4_t
lane2
=
vld1q_s32
(
lane
+
x
+
0
);
int32x4_t
ln04
=
vaddq_s32
(
lane0
,
lane4
);
int32x4_t
ln13
=
vaddq_s32
(
lane1
,
lane3
);
int32x4_t
ln042
=
vmlaq_s32
(
ln04
,
lane2
,
vc6s32
);
int32x4_t
lsw
=
vmlaq_s32
(
ln042
,
ln13
,
vc4s32
);
int16x4_t
ls
=
vrshrn_n_s32
(
lsw
,
8
);
vst1_s16
(
dst
+
x
,
ls
);
}
break
;
case
4
:
/* for (; x <= colsn - 4*4; x += 4*4)
{
internal::prefetch(lane + x);
internal::prefetch(lane + x + 16);
ptrdiff_t* lidx0 = lane + x - 2*4;
ptrdiff_t* lidx1 = lane + x - 1*4;
ptrdiff_t* lidx3 = lane + x + 1*4;
ptrdiff_t* lidx4 = lane + x + 2*4;
__asm__ __volatile__ (
"vld4.32 {d0, d2, d4, d6}, [%[in0]]! \n\t"
"vld4.32 {d1, d3, d5, d7}, [%[in0]] \n\t"
"vld4.32 {d8, d10, d12, d14}, [%[in4]]! \n\t"
"vld4.32 {d9, d11, d13, d15}, [%[in4]] \n\t"
"vadd.i32 q0, q4 \n\t"
"vadd.i32 q1, q5 \n\t"
"vadd.i32 q2, q6 \n\t"
"vadd.i32 q3, q7 \n\t"
"vld4.32 {d16, d18, d20, d22}, [%[in1]]! \n\t"
"vld4.32 {d17, d19, d21, d23}, [%[in1]] \n\t"
"vld4.32 {d8, d10, d12, d14}, [%[in3]]! \n\t"
"vld4.32 {d9, d11, d13, d15}, [%[in3]] \n\t"
"vadd.i32 q4, q8 \n\t"
"vadd.i32 q5, q9 \n\t"
"vadd.i32 q6, q10 \n\t"
"vadd.i32 q7, q11 \n\t"
"vld4.32 {d16, d18, d20, d22}, [%[in2],:256] \n\t"
"vld4.32 {d17, d19, d21, d23}, [%[in22],:256] \n\t"
"vmla.i32 q0, q4, %q[c4] \n\t"
"vmla.i32 q1, q5, %q[c4] \n\t"
"vmla.i32 q2, q6, %q[c4] \n\t"
"vmla.i32 q3, q7, %q[c4] \n\t"
"vmla.i32 q0, q8, %q[c6] \n\t"
"vmla.i32 q1, q9, %q[c6] \n\t"
"vmla.i32 q2, q10, %q[c6] \n\t"
"vmla.i32 q3, q11, %q[c6] \n\t"
"vrshrn.i32 d8, q0, #8 \n\t"
"vrshrn.i32 d9, q1, #8 \n\t"
"vrshrn.i32 d10, q2, #8 \n\t"
"vrshrn.i32 d11, q3, #8 \n\t"
"vst4.16 {d8-d11}, [%[out]] \n\t"
: [in0] "=r" (lidx0),
[in1] "=r" (lidx1),
[in3] "=r" (lidx3),
[in4] "=r" (lidx4)
: [out] "r" (dst + x),
"0" (lidx0),
"1" (lidx1),
"2" (lidx3),
"3" (lidx4),
[in2] "r" (lane + x),
[in22] "r" (lane + x + 4*2),
[c4] "w" (vc4s32), [c6] "w" (vc6s32)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
);
*/
for
(;
x
<=
colsn
-
4
;
x
+=
4
)
{
internal
::
prefetch
(
lane
+
x
);
int32x4_t
lane0
=
vld1q_s32
(
lane
+
x
-
2
);
int32x4_t
lane4
=
vld1q_s32
(
lane
+
x
+
2
);
int32x4_t
lane1
=
vld1q_s32
(
lane
+
x
-
1
);
int32x4_t
lane3
=
vld1q_s32
(
lane
+
x
+
1
);
int32x4_t
lane2
=
vld1q_s32
(
lane
+
x
+
0
);
int32x4_t
ln04
=
vaddq_s32
(
lane0
,
lane4
);
int32x4_t
ln13
=
vaddq_s32
(
lane1
,
lane3
);
int32x4_t
ln042
=
vmlaq_s32
(
ln04
,
lane2
,
vc6s32
);
int32x4_t
lsw
=
vmlaq_s32
(
ln042
,
ln13
,
vc4s32
);
int16x4_t
ls
=
vrshrn_n_s32
(
lsw
,
8
);
vst1_s16
(
dst
+
x
,
ls
);
}
break
;
}
for
(
s32
h
=
0
;
h
<
cn
;
++
h
)
{
s32
*
ln
=
lane
+
h
;
s16
*
dt
=
dst
+
h
;
for
(
size_t
k
=
x
;
k
<
colsn
;
k
+=
cn
)
{
dt
[
k
]
=
(
s16
)((
ln
[
k
-
2
*
cn
]
+
ln
[
k
+
2
*
cn
]
+
4
*
(
ln
[
k
-
cn
]
+
ln
[
k
+
cn
])
+
6
*
ln
[
k
]
+
(
1
<<
7
))
>>
8
);
}
}
}
#else
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
dstBase
;
(
void
)
dstStride
;
(
void
)
borderValue
;
(
void
)
borderMargin
;
#endif
}
void
gaussianBlur5x5
(
const
Size2D
&
size
,
s32
cn
,
const
s32
*
srcBase
,
ptrdiff_t
srcStride
,
s32
*
dstBase
,
ptrdiff_t
dstStride
,
BORDER_MODE
borderType
,
s32
borderValue
,
Margin
borderMargin
)
{
internal
::
assertSupportedConfiguration
(
isGaussianBlur5x5Supported
(
size
,
cn
,
borderType
));
#ifdef CAROTENE_NEON
size_t
colsn
=
size
.
width
*
cn
;
std
::
vector
<
s32
>
_tmp
;
s32
*
tmp
=
0
;
if
(
borderType
==
BORDER_MODE_CONSTANT
)
{
_tmp
.
assign
(
colsn
+
4
*
cn
,
borderValue
);
tmp
=
&
_tmp
[
cn
<<
1
];
}
ptrdiff_t
idx_l1
=
internal
::
borderInterpolate
(
-
1
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_l2
=
internal
::
borderInterpolate
(
-
2
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_r1
=
internal
::
borderInterpolate
(
size
.
width
+
0
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
ptrdiff_t
idx_r2
=
internal
::
borderInterpolate
(
size
.
width
+
1
,
size
.
width
,
borderType
,
borderMargin
.
left
,
borderMargin
.
right
)
*
cn
;
//1-line buffer
std
::
vector
<
s32
>
_buf
(
cn
*
(
size
.
width
+
4
)
+
32
/
sizeof
(
s32
));
s32
*
lane
=
internal
::
alignPtr
(
&
_buf
[
cn
<<
1
],
32
);
if
(
borderType
==
BORDER_MODE_CONSTANT
)
for
(
s32
k
=
0
;
k
<
cn
;
++
k
)
{
lane
[
-
cn
+
k
]
=
borderValue
;
lane
[
-
cn
-
cn
+
k
]
=
borderValue
;
lane
[
colsn
+
k
]
=
borderValue
;
lane
[
colsn
+
cn
+
k
]
=
borderValue
;
}
int32x4_t
vc6s32
=
vmovq_n_s32
(
6
);
int32x4_t
vc4s32
=
vmovq_n_s32
(
4
);
for
(
size_t
i
=
0
;
i
<
size
.
height
;
++
i
)
{
s32
*
dst
=
internal
::
getRowPtr
(
dstBase
,
dstStride
,
i
);
//vertical convolution
ptrdiff_t
idx_rm2
=
internal
::
borderInterpolate
(
i
-
2
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rm1
=
internal
::
borderInterpolate
(
i
-
1
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rp1
=
internal
::
borderInterpolate
(
i
+
1
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
ptrdiff_t
idx_rp2
=
internal
::
borderInterpolate
(
i
+
2
,
size
.
height
,
borderType
,
borderMargin
.
top
,
borderMargin
.
bottom
);
const
s32
*
ln0
=
idx_rm2
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rm2
)
:
tmp
;
const
s32
*
ln1
=
idx_rm1
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rm1
)
:
tmp
;
const
s32
*
ln2
=
internal
::
getRowPtr
(
srcBase
,
srcStride
,
i
);
const
s32
*
ln3
=
idx_rp1
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rp1
)
:
tmp
;
const
s32
*
ln4
=
idx_rp2
>=
-
(
ptrdiff_t
)
borderMargin
.
top
?
internal
::
getRowPtr
(
srcBase
,
srcStride
,
idx_rp2
)
:
tmp
;
size_t
x
=
0
;
for
(;
x
<=
colsn
-
4
;
x
+=
4
)
{
internal
::
prefetch
(
internal
::
getRowPtr
(
ln2
+
x
,
srcStride
,
x
%
5
-
2
));
int32x4_t
v0
=
vld1q_s32
(
ln0
+
x
);
int32x4_t
v1
=
vld1q_s32
(
ln1
+
x
);
int32x4_t
v2
=
vld1q_s32
(
ln2
+
x
);
int32x4_t
v3
=
vld1q_s32
(
ln3
+
x
);
int32x4_t
v4
=
vld1q_s32
(
ln4
+
x
);
int32x4_t
v
=
vaddq_s32
(
v0
,
v4
);
int32x4_t
v13
=
vaddq_s32
(
v1
,
v3
);
v
=
vmlaq_s32
(
v
,
v2
,
vc6s32
);
v
=
vmlaq_s32
(
v
,
v13
,
vc4s32
);
vst1q_s32
(
lane
+
x
,
v
);
}
for
(;
x
<
colsn
;
++
x
)
lane
[
x
]
=
ln0
[
x
]
+
ln4
[
x
]
+
4
*
(
ln1
[
x
]
+
ln3
[
x
])
+
6
*
ln2
[
x
];
//left&right borders
if
(
borderType
!=
BORDER_MODE_CONSTANT
)
for
(
s32
k
=
0
;
k
<
cn
;
++
k
)
{
lane
[
-
cn
+
k
]
=
lane
[
idx_l1
+
k
];
lane
[
-
cn
-
cn
+
k
]
=
lane
[
idx_l2
+
k
];
lane
[
colsn
+
k
]
=
lane
[
idx_r1
+
k
];
lane
[
colsn
+
cn
+
k
]
=
lane
[
idx_r2
+
k
];
}
//horizontal convolution
x
=
0
;
for
(;
x
<=
colsn
-
4
;
x
+=
4
)
{
internal
::
prefetch
(
lane
+
x
);
int32x4_t
lane0
=
vld1q_s32
(
lane
+
x
-
2
);
int32x4_t
lane4
=
vld1q_s32
(
lane
+
x
+
2
);
int32x4_t
lane1
=
vld1q_s32
(
lane
+
x
-
1
);
int32x4_t
lane3
=
vld1q_s32
(
lane
+
x
+
1
);
int32x4_t
lane2
=
vld1q_s32
(
lane
+
x
+
0
);
int32x4_t
ln04
=
vaddq_s32
(
lane0
,
lane4
);
int32x4_t
ln13
=
vaddq_s32
(
lane1
,
lane3
);
int32x4_t
ln042
=
vmlaq_s32
(
ln04
,
lane2
,
vc6s32
);
int32x4_t
lsw
=
vmlaq_s32
(
ln042
,
ln13
,
vc4s32
);
vst1q_s32
(
dst
+
x
,
lsw
);
}
for
(
s32
h
=
0
;
h
<
cn
;
++
h
)
{
s32
*
ln
=
lane
+
h
;
s32
*
dt
=
dst
+
h
;
for
(
size_t
k
=
x
;
k
<
colsn
;
k
+=
cn
)
{
dt
[
k
]
=
ln
[
k
-
2
*
cn
]
+
ln
[
k
+
2
*
cn
]
+
4
*
(
ln
[
k
-
cn
]
+
ln
[
k
+
cn
])
+
6
*
ln
[
k
];
}
}
}
#else
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
dstBase
;
(
void
)
dstStride
;
(
void
)
borderValue
;
(
void
)
borderMargin
;
#endif
}
}
// namespace CAROTENE_NS
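All of the 5x5 paths above implement the separable binomial kernel (1, 4, 6, 4, 1), applied first down the columns and then along the row buffer; its outer product sums to 16 * 16 = 256, which is why the integer paths fold in a rounding constant of 1 << 7 and shift right by 8 (the s32 overload deliberately keeps the unshifted weighted sum). The 3x3 path uses (1, 2, 1) both ways, total weight 16, hence the shift by 4. A hedged calling sketch follows, not part of the commit; the header name, the Size2D constructor, and Margin being default-constructible to an empty margin are assumptions.

#include <vector>
#include "carotene/functions.hpp" // assumed public header name

void blurExample()
{
    using namespace CAROTENE_NS;
    Size2D size(640, 480);                        // assumed (width, height) ctor
    std::vector<u8> src(size.width * size.height), dst(src.size());
    if (isGaussianBlur3x3Supported(size, BORDER_MODE_REPLICATE))
        gaussianBlur3x3(size, src.data(), (ptrdiff_t)size.width,
                        dst.data(), (ptrdiff_t)size.width,
                        BORDER_MODE_REPLICATE, 0);
    // 5x5, single channel, whole image (empty margin assumed)
    if (isGaussianBlur5x5Supported(size, 1, BORDER_MODE_REFLECT101))
        gaussianBlur5x5(size, 1, src.data(), (ptrdiff_t)size.width,
                        dst.data(), (ptrdiff_t)size.width,
                        BORDER_MODE_REFLECT101, 0, Margin());
}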
opencv/3rdparty/carotene/src/in_range.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace
CAROTENE_NS
{
#ifdef CAROTENE_NEON
namespace
{
inline
void
vnst
(
u8
*
dst
,
uint8x16_t
v1
,
uint8x16_t
v2
)
{
vst1q_u8
(
dst
,
v1
);
vst1q_u8
(
dst
+
16
,
v2
);
}
inline
void
vnst
(
u8
*
dst
,
uint16x8_t
v1
,
uint16x8_t
v2
)
{
vst1q_u8
(
dst
,
vcombine_u8
(
vmovn_u16
(
v1
),
vmovn_u16
(
v2
)));
}
inline
void
vnst
(
u8
*
dst
,
uint32x4_t
v1
,
uint32x4_t
v2
)
{
vst1_u8
(
dst
,
vmovn_u16
(
vcombine_u16
(
vmovn_u32
(
v1
),
vmovn_u32
(
v2
))));
}
template
<
typename
T
,
int
elsize
>
struct
vtail
{
static
inline
void
inRange
(
const
T
*
,
const
T
*
,
const
T
*
,
u8
*
,
size_t
&
,
size_t
)
{
//do nothing since there couldn't be enough data
}
};
template
<
typename
T
>
struct
vtail
<
T
,
2
>
{
static
inline
void
inRange
(
const
T
*
src
,
const
T
*
rng1
,
const
T
*
rng2
,
u8
*
dst
,
size_t
&
x
,
size_t
width
)
{
typedef
typename
internal
::
VecTraits
<
T
>::
vec128
vec128
;
typedef
typename
internal
::
VecTraits
<
T
>::
unsign
::
vec128
uvec128
;
//There no more than 15 elements in the tail, so we could handle 8 element vector only once
if
(
x
+
8
<
width
)
{
vec128
vs
=
internal
::
vld1q
(
src
+
x
);
vec128
vr1
=
internal
::
vld1q
(
rng1
+
x
);
vec128
vr2
=
internal
::
vld1q
(
rng2
+
x
);
uvec128
vd
=
internal
::
vandq
(
internal
::
vcgeq
(
vs
,
vr1
),
internal
::
vcgeq
(
vr2
,
vs
));
internal
::
vst1
(
dst
+
x
,
internal
::
vmovn
(
vd
));
x
+=
8
;
}
}
};
template
<
typename
T
>
struct
vtail
<
T
,
1
>
{
static
inline
void
inRange
(
const
T
*
src
,
const
T
*
rng1
,
const
T
*
rng2
,
u8
*
dst
,
size_t
&
x
,
size_t
width
)
{
typedef
typename
internal
::
VecTraits
<
T
>::
vec128
vec128
;
typedef
typename
internal
::
VecTraits
<
T
>::
unsign
::
vec128
uvec128
;
typedef
typename
internal
::
VecTraits
<
T
>::
vec64
vec64
;
typedef
typename
internal
::
VecTraits
<
T
>::
unsign
::
vec64
uvec64
;
//There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
if
(
x
+
16
<
width
)
{
vec128
vs
=
internal
::
vld1q
(
src
+
x
);
vec128
vr1
=
internal
::
vld1q
(
rng1
+
x
);
vec128
vr2
=
internal
::
vld1q
(
rng2
+
x
);
uvec128
vd
=
internal
::
vandq
(
internal
::
vcgeq
(
vs
,
vr1
),
internal
::
vcgeq
(
vr2
,
vs
));
internal
::
vst1q
(
dst
+
x
,
vd
);
x
+=
16
;
}
if
(
x
+
8
<
width
)
{
vec64
vs
=
internal
::
vld1
(
src
+
x
);
vec64
vr1
=
internal
::
vld1
(
rng1
+
x
);
vec64
vr2
=
internal
::
vld1
(
rng2
+
x
);
uvec64
vd
=
internal
::
vand
(
internal
::
vcge
(
vs
,
vr1
),
internal
::
vcge
(
vr2
,
vs
));
internal
::
vst1
(
dst
+
x
,
vd
);
x
+=
8
;
}
}
};
template
<
typename
T
>
inline
void
inRangeCheck
(
const
Size2D
&
_size
,
const
T
*
srcBase
,
ptrdiff_t
srcStride
,
const
T
*
rng1Base
,
ptrdiff_t
rng1Stride
,
const
T
*
rng2Base
,
ptrdiff_t
rng2Stride
,
u8
*
dstBase
,
ptrdiff_t
dstStride
)
{
typedef
typename
internal
::
VecTraits
<
T
>::
vec128
vec128
;
typedef
typename
internal
::
VecTraits
<
T
>::
unsign
::
vec128
uvec128
;
Size2D
size
(
_size
);
if
(
srcStride
==
dstStride
&&
srcStride
==
rng1Stride
&&
srcStride
==
rng2Stride
&&
srcStride
==
(
ptrdiff_t
)(
size
.
width
))
{
size
.
width
*=
size
.
height
;
size
.
height
=
1
;
}
const
size_t
width
=
size
.
width
&
~
(
32
/
sizeof
(
T
)
-
1
);
for
(
size_t
j
=
0
;
j
<
size
.
height
;
++
j
)
{
const
T
*
src
=
internal
::
getRowPtr
(
srcBase
,
srcStride
,
j
);
const
T
*
rng1
=
internal
::
getRowPtr
(
rng1Base
,
rng1Stride
,
j
);
const
T
*
rng2
=
internal
::
getRowPtr
(
rng2Base
,
rng2Stride
,
j
);
u8
*
dst
=
internal
::
getRowPtr
(
dstBase
,
dstStride
,
j
);
size_t
i
=
0
;
for
(
;
i
<
width
;
i
+=
32
/
sizeof
(
T
)
)
{
internal
::
prefetch
(
src
+
i
);
internal
::
prefetch
(
rng1
+
i
);
internal
::
prefetch
(
rng2
+
i
);
vec128
vs
=
internal
::
vld1q
(
src
+
i
);
vec128
vr1
=
internal
::
vld1q
(
rng1
+
i
);
vec128
vr2
=
internal
::
vld1q
(
rng2
+
i
);
uvec128
vd1
=
internal
::
vandq
(
internal
::
vcgeq
(
vs
,
vr1
),
internal
::
vcgeq
(
vr2
,
vs
));
vs
=
internal
::
vld1q
(
src
+
i
+
16
/
sizeof
(
T
));
vr1
=
internal
::
vld1q
(
rng1
+
i
+
16
/
sizeof
(
T
));
vr2
=
internal
::
vld1q
(
rng2
+
i
+
16
/
sizeof
(
T
));
uvec128
vd2
=
internal
::
vandq
(
internal
::
vcgeq
(
vs
,
vr1
),
internal
::
vcgeq
(
vr2
,
vs
));
vnst
(
dst
+
i
,
vd1
,
vd2
);
}
vtail
<
T
,
sizeof
(
T
)
>::
inRange
(
src
,
rng1
,
rng2
,
dst
,
i
,
size
.
width
);
for
(
;
i
<
size
.
width
;
i
++
)
dst
[
i
]
=
(
u8
)(
-
(
rng1
[
i
]
<=
src
[
i
]
&&
src
[
i
]
<=
rng2
[
i
]));
}
}
}
#define INRANGEFUNC(T) \
void inRange(const Size2D &_size, \
const T * srcBase, ptrdiff_t srcStride, \
const T * rng1Base, ptrdiff_t rng1Stride, \
const T * rng2Base, ptrdiff_t rng2Stride, \
u8 * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
inRangeCheck(_size, srcBase, srcStride, \
rng1Base, rng1Stride, rng2Base, rng2Stride, \
dstBase, dstStride); \
}
#else
#define INRANGEFUNC(T) \
void inRange(const Size2D &, \
const T *, ptrdiff_t, \
const T *, ptrdiff_t, \
const T *, ptrdiff_t, \
u8 *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
INRANGEFUNC
(
u8
)
INRANGEFUNC
(
s8
)
INRANGEFUNC
(
u16
)
INRANGEFUNC
(
s16
)
INRANGEFUNC
(
s32
)
INRANGEFUNC
(
f32
)
}
// namespace CAROTENE_NS
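A short usage sketch for the macro-generated inRange overloads, not part of the commit; the header name and Size2D constructor are assumptions. Per the scalar fallback above, the destination receives 0xFF where rng1[i] <= src[i] <= rng2[i] and 0 elsewhere.

#include <vector>
#include "carotene/functions.hpp" // assumed public header name

void inRangeExample()
{
    using namespace CAROTENE_NS;
    Size2D size(640, 480);                        // assumed (width, height) ctor
    const size_t n = size.width * size.height;
    std::vector<u8> src(n), lower(n, 10), upper(n, 200), dst(n);
    // all strides in bytes; u8 rows packed
    inRange(size, src.data(), (ptrdiff_t)size.width,
            lower.data(), (ptrdiff_t)size.width,
            upper.data(), (ptrdiff_t)size.width,
            dst.data(), (ptrdiff_t)size.width);
}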
opencv/3rdparty/carotene/src/integral.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {

void integral(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u32 * sumBase, ptrdiff_t sumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    uint32x4_t v_zero = vmovq_n_u32(0u);

    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);

    uint32x4_t prev = v_zero;
    size_t j = 0u;
    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sum + j);
        internal::prefetch(src + j);

        uint8x8_t el8shr0 = vld1_u8(src + j);
        uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
        uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
        uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));

        uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
        uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
        uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);

        uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));

        uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
        uint32x4_t vsumh = vaddw_u16(prev, el4h);

        vst1q_u32(sum + j, vsuml);
        vst1q_u32(sum + j + 4, vsumh);

        prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
    }
    for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
        sum[j] = (v += src[j]);

    // the others
    for (size_t i = 1; i < size.height; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
        sum = internal::getRowPtr(sumBase, sumStride, i);

        prev = v_zero;
        j = 0u;
        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sum + j);
            internal::prefetch(src + j);

            uint32x4_t vsuml = vld1q_u32(prevSum + j);
            uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);

            uint8x8_t el8shr0 = vld1_u8(src + j);
            uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
            uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
            uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));

            vsuml = vaddq_u32(vsuml, prev);
            vsumh = vaddq_u32(vsumh, prev);

            uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
            uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
            uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);

            uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));

            vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
            vsumh = vaddw_u16(vsumh, el4h);

            vst1q_u32(sum + j, vsuml);
            vst1q_u32(sum + j + 4, vsumh);

            prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
        }
        for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
            sum[j] = (v += src[j]) + prevSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sumBase;
    (void)sumStride;
#endif
}

void sqrIntegral(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 f64 * sqsumBase, ptrdiff_t sqsumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    uint16x8_t v_zero8 = vmovq_n_u16(0u);

    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);

    double prev = 0.;
    size_t j = 0u;
    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sqsum + j);
        internal::prefetch(src + j);

        uint8x8_t vsrc = vld1_u8(src + j);

        uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
        uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);

        uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
        uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));

        uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);

        uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
        uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
        uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));

        u32 buf[8];
        vst1_u32(buf, vget_low_u32(el8shr01l));
        vst1_u32(buf + 2, el2l);
        vst1_u32(buf + 4, el2hl);
        vst1_u32(buf + 6, el2hh);
        for (u32 k = 0; k < 8; k++)
            sqsum[j + k] = prev + buf[k];
        prev += buf[7];
    }
    for (; j < size.width; ++j)
        sqsum[j] = (prev += src[j] * src[j]);

    // the others
    for (size_t i = 1; i < size.height; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
        sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);

        prev = 0.;
        j = 0u;
        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sqsum + j);
            internal::prefetch(src + j);

            uint8x8_t vsrc = vld1_u8(src + j);

            uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
            uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);

            uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
            uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));

            uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);

            uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
            uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
            uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));

            u32 buf[8];
            vst1_u32(buf, vget_low_u32(el8shr01l));
            vst1_u32(buf + 2, el2l);
            vst1_u32(buf + 4, el2hl);
            vst1_u32(buf + 6, el2hh);
            for (u32 k = 0; k < 8; k++)
                sqsum[j + k] = prev + prevSqSum[j + k] + buf[k];
            prev += buf[7];
        }
        for (; j < size.width; ++j)
            sqsum[j] = (prev += src[j] * src[j]) + prevSqSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sqsumBase;
    (void)sqsumStride;
#endif
}

} // namespace CAROTENE_NS
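Note (an illustration added here, not part of the committed file): both loops above vectorize the usual integral-image recurrence, where every output element is the running sum of the current row plus the already-computed element directly above it. A minimal scalar sketch of the same recurrence, with raw byte-stride arithmetic standing in for internal::getRowPtr:

static void integralScalarSketch(size_t height, size_t width,
                                 const unsigned char * src, ptrdiff_t srcStride,
                                 unsigned int * sum, ptrdiff_t sumStride)
{
    for (size_t y = 0; y < height; ++y)
    {
        const unsigned char * srow = src + y * srcStride;
        unsigned int * drow = (unsigned int *)((char *)sum + y * sumStride);
        const unsigned int * prow =
            y ? (const unsigned int *)((char *)sum + (y - 1) * sumStride) : 0;
        unsigned int rowSum = 0;
        for (size_t x = 0; x < width; ++x)
        {
            rowSum += srow[x];                          // running sum of this row
            drow[x] = rowSum + (prow ? prow[x] : 0u);   // plus the row above
        }
    }
}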
opencv/3rdparty/carotene/src/intrinsics.hpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_INTRINSICS_HPP
#define CAROTENE_INTRINSICS_HPP
#include <carotene/definitions.hpp>
#include <arm_neon.h>
namespace CAROTENE_NS { namespace internal {

/////////////// Custom NEON intrinsics ///////////////////

// calculate reciprocal value
inline float32x4_t vrecpq_f32(float32x4_t val)
{
    float32x4_t reciprocal = vrecpeq_f32(val);
    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
    return reciprocal;
}

inline float32x2_t vrecp_f32(float32x2_t val)
{
    float32x2_t reciprocal = vrecpe_f32(val);
    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
    return reciprocal;
}

// calculate reciprocal square root value
inline float32x4_t vrsqrtq_f32(float32x4_t val)
{
    float32x4_t e = vrsqrteq_f32(val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
    return e;
}

inline float32x2_t vrsqrt_f32(float32x2_t val)
{
    float32x2_t e = vrsqrte_f32(val);
    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
    return e;
}

// calculate sqrt value
inline float32x4_t vsqrtq_f32(float32x4_t val)
{
    return vrecpq_f32(vrsqrtq_f32(val));
}

inline float32x2_t vsqrt_f32(float32x2_t val)
{
    return vrecp_f32(vrsqrt_f32(val));
}

// table lookup with the table in a 128-bit register
inline uint8x8_t vqtbl1_u8(uint8x16_t a, uint8x8_t b)
{
#ifdef __aarch64__
    // AArch64 supports this natively
    return ::vqtbl1_u8(a, b);
#else
    union { uint8x16_t v; uint8x8x2_t w; } u = { a };
    return vtbl2_u8(u.w, b);
#endif
}

} } // namespace CAROTENE_NS::internal

#endif
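A short note added for illustration (not part of the committed header): these helpers refine NEON's roughly 8-bit hardware estimates with two Newton-Raphson steps. vrecpsq_f32(v, r) returns 2 - v*r, so each refinement line computes r' = r*(2 - v*r); vrsqrtsq_f32(e*e, v) returns (3 - v*e*e)/2, giving e' = e*(3 - v*e*e)/2; and vsqrtq_f32 composes the two as sqrt(v) = 1/(1/sqrt(v)), so no divide or sqrt instruction is needed. A hypothetical caller:

#include <arm_neon.h>

float32x4_t four_sqrts()
{
    float32x4_t v = vdupq_n_f32(2.0f);
    return CAROTENE_NS::internal::vsqrtq_f32(v);  // ~1.4142135 in every lane
}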
opencv/3rdparty/carotene/src/laplacian.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
namespace CAROTENE_NS {

bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
{
    return isSupportedConfiguration() && size.width >= 8 &&
        (border == BORDER_MODE_CONSTANT ||
            border == BORDER_MODE_REPLICATE);
}

void Laplacian3x3(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase, ptrdiff_t dstStride,
                  BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
#ifdef CAROTENE_NEON
    const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);

    uint8x8_t vsub;
    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL :
                           internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL :
                           internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);

                currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
            }

            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value
            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x3;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));

                vsub = x1;
                continue;
            }

            // combine 3 "shifted" vectors
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);

            // and add them
            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));

            int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
                                      vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
            uint8x8_t it0 = vqmovun_s16(tt0);
            vst1_u8(drow + x - 8, it0);

            vsub = x1;
        }

        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue * 3;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + srow1[x] + srow0[x];
            }
            else
            {
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        srow1[x + 1] +
                        (srow0 ? srow0[x + 1] : borderValue);
            }

            s32 val = (prevx + currx + nextx) - 9 * srow1[x];
            drow[x] = internal::saturate_cast<u8>((s32)val);

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}

bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
{
    return isSupportedConfiguration() &&
        size.width >= 8 && size.height >= 1 &&
        (border == BORDER_MODE_CONSTANT ||
            border == BORDER_MODE_REFLECT ||
            border == BORDER_MODE_REFLECT101 ||
            border == BORDER_MODE_REPLICATE);
}

void Laplacian1OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4, borderValue);
        tmp = &_tmp[2];
    }

    for (ptrdiff_t y = 0; y < rows; y++)
    {
        const u8 * v0 = 0;
        const u8 * v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * v2 = 0;
        // make border
        if (border == BORDER_MODE_REFLECT101)
        {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y - 1 : y + 1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0);
        }
        else if (border == BORDER_MODE_CONSTANT)
        {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y - 1) : tmp;
            v2 = y < rows - 1 ? internal::getRowPtr(srcBase, srcStride, y + 1) : tmp;
        }
        else
        {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y - 1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows - 1 ? y + 1 : rows > 0 ? rows - 1 : 0);
        }
        s16 * drow = internal::getRowPtr(dstBase, dstStride, y);

        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t t0, t2;
        uint8x8_t xx0 = vmov_n_u8(0x0);
        uint8x8_t xx1 = vmov_n_u8(0x0);
        uint8x8_t xx2 = vmov_n_u8(0x0);

        ptrdiff_t x = 0;
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for ( ; x <= bcols; x += 8)
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);

            if (x)
            {
                xx0 = xx1;
                xx1 = xx2;
            }
            else
            {
                xx1 = x1;
                // make border
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 0), x1, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    xx1 = vset_lane_u8(borderValue, x1, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 1), x1, 7);
                }
            }
            xx2 = x1;

            if (x)
            {
                tcurr = tnext;
            }
            tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
                              vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
            if (!x)
            {
                tcurr = tnext;
                continue;
            }
            t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
            t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
            t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
            vst1q_s16(drow + x - 8, t0);
        }
        x -= 8;
        if (x == cols)
        {
            x--;
        }
        for ( ; x < cols; x++)
        {
            s16 nextx;
            s16 prevx;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v1[0] : v1[x - 1];
                nextx = x == cols - 1 ? v1[x] : v1[x + 1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v1[1] : v1[x - 1];
                nextx = x == cols - 1 ? v1[x - 1] : v1[x + 1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v1[x - 1];
                nextx = x == cols - 1 ? borderValue : v1[x + 1];
            }
            *(drow + x) = prevx + nextx - 4 * v1[x] + v0[x] + v2[x];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}

void Laplacian3OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4, borderValue);
        tmp = &_tmp[2];
    }

    for (ptrdiff_t y = 0; y < rows; y++)
    {
        const u8 * v0 = 0;
        const u8 * v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * v2 = 0;
        // make border
        if (border == BORDER_MODE_REFLECT101)
        {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y - 1 : y + 1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0);
        }
        else if (border == BORDER_MODE_CONSTANT)
        {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y - 1) : tmp;
            v2 = y < rows - 1 ? internal::getRowPtr(srcBase, srcStride, y + 1) : tmp;
        }
        else
        {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y - 1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows - 1 ? y + 1 : rows > 0 ? rows - 1 : 0);
        }
        s16 * drow = internal::getRowPtr(dstBase, dstStride, y);

        int16x8_t tprev = vmovq_n_s16(0x0);
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t tc = vmovq_n_s16(0x0);
        int16x8_t t0, t2, tcnext;

        ptrdiff_t x = 0;
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for ( ; x <= bcols; x += 8)
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);

            tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));

            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }
            tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
            if (!x)
            {
                tcurr = tnext;
                tc = tcnext;

                // make border
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0), tcurr, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1), tcurr, 7);
                }
                continue;
            }
            t0 = vextq_s16(tprev, tcurr, 7);
            t2 = vextq_s16(tcurr, tnext, 1);
            t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
            tc = tcnext;
            t0 = vshlq_n_s16(t0, 1);
            vst1q_s16(drow + x - 8, t0);
        }
        x -= 8;
        if (x == cols)
        {
            x--;
        }
        for ( ; x < cols; x++)
        {
            s16 nextx, nextx2;
            s16 prevx, prevx2;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v0[0] : v0[x - 1];
                prevx2 = x == 0 ? v2[0] : v2[x - 1];
                nextx = x == cols - 1 ? v0[x] : v0[x + 1];
                nextx2 = x == cols - 1 ? v2[x] : v2[x + 1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v0[1] : v0[x - 1];
                prevx2 = x == 0 ? v2[1] : v2[x - 1];
                nextx = x == cols - 1 ? v0[x - 1] : v0[x + 1];
                nextx2 = x == cols - 1 ? v2[x - 1] : v2[x + 1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v0[x - 1];
                prevx2 = x == 0 ? borderValue : v2[x - 1];
                nextx = x == cols - 1 ? borderValue : v0[x + 1];
                nextx2 = x == cols - 1 ? borderValue : v2[x + 1];
            }
            s16 res = prevx + nextx - 4 * v1[x] + prevx2 + nextx2;
            *(drow + x) = 2 * res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}

void Laplacian5OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4, borderValue);
        tmp = &_tmp[2];
    }

    for (ptrdiff_t y = 0; y < rows; y++)
    {
        const u8 * v0 = 0;
        const u8 * v1 = 0;
        const u8 * v2 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * v3 = 0;
        const u8 * v4 = 0;
        // make border
        if (border == BORDER_MODE_REPLICATE)
        {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y - 2 : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y - 1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows - 1 ? y + 1 : rows > 0 ? rows - 1 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows - 2 ? y + 2 : rows > 0 ? rows - 1 : 0);
        }
        else if (border == BORDER_MODE_REFLECT)
        {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y - 2 : rows > 1 ? 1 - y : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y - 1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows - 1 ? y + 1 : rows > 0 ? rows - 1 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows - 2 ? y + 2 : rows > 1 ? 2 * rows - (y + 3) : 0);
        }
        else if (border == BORDER_MODE_REFLECT101)
        {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y - 2 : rows > 2 - y ? 2 - y : 0); ///check
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y - 1 : rows > 1 ? 1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows - 1 ? y + 1 : rows > 1 ? rows - 2 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows - 2 ? y + 2 : rows > 2 ? 2 * rows - (y + 4) : 0); ///bad if rows=2 y=1 rows - 4 + (2,1)
        }
        else if (border == BORDER_MODE_CONSTANT)
        {
            v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y - 2) : tmp;
            v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y - 1) : tmp;
            v3 = y < rows - 1 ? internal::getRowPtr(srcBase, srcStride, y + 1) : tmp;
            v4 = y < rows - 2 ? internal::getRowPtr(srcBase, srcStride, y + 2) : tmp;
        }
        s16 * drow = internal::getRowPtr(dstBase, dstStride, y);

        int16x8_t tnext, tc, t0;
        int16x8_t tnext2, tnext3;
        int16x8_t tnext1Old, tnext2Old, tnext3Old;
        int16x8_t tnext4OldOldOld, tnext5OldOldOld;

        int16x8_t tcurr1 = vmovq_n_s16(0x0);
        int16x8_t tnext1 = vmovq_n_s16(0x0);
        int16x8_t tprev1 = vmovq_n_s16(0x0);
        int16x8_t tpprev1 = vmovq_n_s16(0x0);
        int16x8_t tppprev1 = vmovq_n_s16(0x0);
        int16x8_t tnext4Old = vmovq_n_s16(0x0);
        int16x8_t tnext5Old = vmovq_n_s16(0x0);
        int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext5OldOld = vmovq_n_s16(0x0);

        // do vertical convolution
        ptrdiff_t x = 0;
        const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
        for ( ; x <= bcols; x += 8)
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);
            internal::prefetch(v3 + x);
            internal::prefetch(v4 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            uint8x8_t x3 = vld1_u8(v3 + x);
            uint8x8_t x4 = vld1_u8(v4 + x);
            if (x)
            {
                tcurr1 = tnext1;
            }

            tnext4OldOldOld = tnext4Old;
            tnext5OldOldOld = tnext5Old;
            tnext1Old = tnext1OldOld;
            tnext2Old = tnext2OldOld;
            tnext3Old = tnext3OldOld;
            tnext4Old = tnext4OldOld;
            tnext5Old = tnext5OldOld;

            tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2), vaddl_u8(x2, x1)));
            tnext3 = vshlq_n_s16(tnext3, 1);

            tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
            tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
            tnext2 = vsubq_s16(tc, tnext);

            tnext1 = vaddq_s16(tnext3, tnext2);
            // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4

            tnext2 = vshlq_n_s16(tnext2, 1);
            // tnext2 = 2*x4 - 4*x2 + 2*x0

            tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
            // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4

            tnext1OldOld = tnext1;
            tnext2OldOld = tnext2;
            tnext3OldOld = tnext3;
            tnext4OldOld = tnext2;
            tnext5OldOld = tnext1;

            if (x)
            {
                tnext1 = vextq_s16(tnext1Old, tnext1, 2);
                tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
                tprev1 = tnext3Old;
                if (x != 8)
                {
                    tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
                    tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
                }
            }
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_REPLICATE)
                {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1), tpprev1, 0);
                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2), tprev1, 0);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2), tprev1, 1);
                }
                else if (border == BORDER_MODE_REFLECT)
                {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1), tpprev1, 0);
                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3), tprev1, 0);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2), tprev1, 1);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2), tpprev1, 0);
                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3), tprev1, 1);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4), tprev1, 0);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
                    tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
                }
                tppprev1 = tprev1;
                continue;
            }

            t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
            t0 = vaddq_s16(t0, t0);
            vst1q_s16(drow + x - 8, t0);
        }
        x -= 8;
        if (x >= cols - 1)
            x = cols - 2;

        s16 pprevx = 0;
        s16 prevx = 0;
        s16 nextx = 0;
        s16 nnextx = 0;

        for ( ; x < cols; x++)
        {
            if (x == 0)
            {
                // make border
                if (border == BORDER_MODE_REPLICATE)
                {
                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
                }
                else if (border == BORDER_MODE_REFLECT)
                {
                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
                    prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    pprevx = 8 * borderValue;
                    prevx = 0;
                }
            }
            else if (x == 1)
            {
                // make border
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    pprevx = 8 * borderValue;
                }
                prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
            }
            else
            {
                pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
                prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
            }
            s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
            if (x == cols - 1)
            {
                // make border
                if (border == BORDER_MODE_REPLICATE)
                {
                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
                }
                else if (border == BORDER_MODE_REFLECT)
                {
                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
                    nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
                    nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    nextx = 0;
                    nnextx = 8 * borderValue;
                }
            }
            else if (x == cols - 2)
            {
                // make border
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    nnextx = 8 * borderValue;
                }
                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
            }
            else
            {
                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
                nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
            }
            s16 res = pprevx + prevx + currx + nextx + nnextx;
            *(drow + x) = 2 * res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}

} // namespace CAROTENE_NS
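For reference (an illustration, not part of the committed file): the scalar tails spell out the kernels these functions apply. Laplacian1OpenCV, for example, computes OpenCV's ksize=1 kernel

    0  1  0
    1 -4  1
    0  1  0

which is exactly prevx + nextx - 4*v1[x] + v0[x] + v2[x]. A hypothetical one-pixel reference away from the borders:

static short laplacian1At(const unsigned char * r0,   // row above
                          const unsigned char * r1,   // current row
                          const unsigned char * r2,   // row below
                          ptrdiff_t x)
{
    return (short)(r1[x - 1] + r1[x + 1] - 4 * r1[x] + r0[x] + r2[x]);
}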
opencv/3rdparty/carotene/src/magnitude.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cmath>
namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace {

struct Magnitude
{
    typedef s16 type;

    void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
                     int16x8_t & v_dst) const
    {
        int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
        float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
                                       vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
        v_src0_p = vget_high_s16(v_src0);
        v_src1_p = vget_high_s16(v_src1);
        float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
                                       vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));

        int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
        int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));

        v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
    }

    void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
                     int16x4_t & v_dst) const
    {
        float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
                                      vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
        int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
        v_dst = vqmovn_s32(v_sqrt);
    }

    void operator() (const short * src0, const short * src1, short * dst) const
    {
        f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
        dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
    }
};

struct MagnitudeF32
{
    typedef f32 type;

    void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
                     float32x4_t & v_dst) const
    {
        v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0),
                                               vmulq_f32(v_src1, v_src1)));
    }

    void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
                     float32x2_t & v_dst) const
    {
        v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0),
                                             vmul_f32(v_src1, v_src1)));
    }

    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
    {
        dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
    }
};

} // namespace

#endif

void magnitude(const Size2D &size,
               const s16 * src0Base, ptrdiff_t src0Stride,
               const s16 * src1Base, ptrdiff_t src1Stride,
               s16 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         Magnitude());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void magnitude(const Size2D &size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride,
               f32 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         MagnitudeF32());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}

} // namespace CAROTENE_NS
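As an aside (illustrative only, not part of the commit): per element both overloads compute dst = sqrt(src0^2 + src1^2) via the vsqrtq_f32 helper from intrinsics.hpp; the s16 overload widens through f32 and saturates the truncated result back to s16. A scalar equivalent of one s16 element (sqrtf comes from the <cmath> include above):

static short magnitudeAt(short gx, short gy)
{
    float fx = (float)gx, fy = (float)gy;
    int m = (int)sqrtf(fx * fx + fy * fy);       // truncate, as in the scalar tail
    return m > 32767 ? (short)32767 : (short)m;  // saturate like saturate_cast<s16>
}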
opencv/3rdparty/carotene/src/meanstddev.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cmath>
namespace CAROTENE_NS {

void meanStdDev(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    f64 fsum = 0.0f, fsqsum = 0.0f;
    sqsum(size, srcBase, srcStride, &fsum, &fsqsum, 1);

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}

void meanStdDev(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
    f64 fsum = 0.0f, fsqsum = 0.0f;

    f32 arsum[8];
    uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
    float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        while (j < roiw4)
        {
            size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
            v_sum = v_zero;
            v_sqsum = v_zero_f;

            for ( ; j + 16 < blockSize ; j += 16)
            {
                internal::prefetch(src + j);
                uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

                // 0
                uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
                uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
                float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);

                // 1
                v_srclo = vmovl_u16(vget_low_u16(v_src1));
                v_srchi = vmovl_u16(vget_high_u16(v_src1));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                v_srclo_f = vcvtq_f32_u32(v_srclo);
                v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
            }

            for ( ; j < blockSize; j += 4)
            {
                uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
                float32x4_t v_src_f = vcvtq_f32_u32(v_src);
                v_sum = vaddq_u32(v_sum, v_src);
                v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
            }

            vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
            vst1q_f32(arsum + 4, v_sqsum);

            fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
            fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
        }

        // collect a few last elements in the current row
        for ( ; j < size.width; ++j)
        {
            f32 srcval = src[j];
            fsum += srcval;
            fsqsum += srcval * srcval;
        }
    }

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}

} // namespace CAROTENE_NS
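Worth noting (an added sketch, not part of the committed file): both overloads reduce the image to sum(x) and sum(x*x) and then apply the one-pass identity stddev = sqrt(E[x^2] - E[x]^2), clamped at zero against floating-point cancellation. The scalar shape of the same computation (sqrt from the <cmath> include above):

static void meanStdDevSketch(const unsigned char * p, size_t n,
                             float * pMean, float * pStdDev)
{
    double s = 0.0, sq = 0.0;
    for (size_t i = 0; i < n; ++i)
    {
        s += p[i];
        sq += (double)p[i] * p[i];
    }
    double mean = s / n;
    double var = sq / n - mean * mean;           // E[x^2] - E[x]^2
    *pMean = (float)mean;
    *pStdDev = (float)sqrt(var > 0.0 ? var : 0.0);
}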
opencv/3rdparty/carotene/src/median_filter.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
/*
* The code here is based on the code in
* <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in public domain.
* See also <http://ndevilla.free.fr/median/median/index.html>.
*/
namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace {

uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
{
    u8 buf[16 + 8];
    vst1q_u8(buf + cn, r);
    for (u32 i = 0; i < cn; ++i)
        buf[i] = buf[cn + i];
    return vld1q_u8(buf);
}

uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
{
    u8 buf[8 + 8];
    vst1_u8(buf, r);
    for (u32 i = 0; i < cn; ++i)
        buf[8 + i] = buf[8 - cn + i];
    return vld1_u8(buf + cn);
}

} // namespace
//o------^-------^-----------------------------o 0
// | |
//o--^---v---^---|-------^---------------------o 1
// | | | |
//o--v-------v---|-------|-^-------^-------^---o 2
// | | | | |
//o------^-------v-----^-|-|-------|-------|---o 3
// | | | | | |
//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
// | | | | | | |
//o--v-------v---^-|---|---v---|-------|-------o 5
// | | | | |
//o------^-------|-|---v-------|-------v-------o 6
// | | | |
//o--^---v---^---|-v-----------v---------------o 7
// | | |
//o--v-------v---v-----------------------------o 8
#define ELT(num, level) v ## num ## _lv ## level
#define PIX_SORT(a, alvl, b, blvl, newlvl) \
PIX_MIN(a, alvl, b, blvl, newlvl); \
PIX_MAX(a, alvl, b, blvl, newlvl);
#define SORT9 \
PIX_SORT(1, 00, 2, 00, 01); \
PIX_SORT(4, 00, 5, 00, 02); \
PIX_SORT(7, 00, 8, 00, 03); \
PIX_SORT(0, 00, 1, 01, 04); \
PIX_SORT(3, 00, 4, 02, 05); \
PIX_SORT(6, 00, 7, 03, 06); \
PIX_SORT(1, 04, 2, 01, 07); \
PIX_SORT(4, 05, 5, 02, 08); \
PIX_SORT(7, 06, 8, 03, 09); \
PIX_MAX (0, 04, 3, 05, 10); \
PIX_MIN (5, 08, 8, 09, 11); \
PIX_SORT(4, 08, 7, 09, 12); \
PIX_MAX (3, 10, 6, 06, 13); \
PIX_MAX (1, 07, 4, 12, 14); \
PIX_MIN (2, 07, 5, 11, 15); \
PIX_MIN (4, 14, 7, 12, 16); \
PIX_SORT(4, 16, 2, 15, 17); \
PIX_MAX (6, 13, 4, 17, 18); \
PIX_MIN (4, 18, 2, 17, 19);
#endif
bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
{
    return isSupportedConfiguration() && size.width >= 16 + numChannels &&
           numChannels <= 8;
}

void medianFilter3x3(const Size2D &size, u32 numChannels,
                     const u8 *srcBase, ptrdiff_t srcStride,
                     const Margin &srcMargin,
                     u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
#ifdef CAROTENE_NEON
    u32 cn = numChannels;
    size_t colsn = size.width * cn;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
        const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
        const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
        u8* pdst = internal::getRowPtr(dstBase, dstStride, i);

        size_t j = 0;
        {
            uint8x16_t v3_lv00 = vld1q_u8(psrc0);
            uint8x16_t v4_lv00 = vld1q_u8(psrc1);
            uint8x16_t v5_lv00 = vld1q_u8(psrc2);

            uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
            uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
            uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);

            uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
            uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
            uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);

            goto medianBlur3x3_mainBody;

            for (; j < colsn - 16; j += 16)
            {
                internal::prefetch(psrc0 + j);
                internal::prefetch(psrc1 + j);
                internal::prefetch(psrc2 + j);

                v0_lv00 = vld1q_u8(psrc0 + j - cn);
                v1_lv00 = vld1q_u8(psrc1 + j - cn);
                v2_lv00 = vld1q_u8(psrc2 + j - cn);
                v3_lv00 = vld1q_u8(psrc0 + j);
                v4_lv00 = vld1q_u8(psrc1 + j);
                v5_lv00 = vld1q_u8(psrc2 + j);
                v6_lv00 = vld1q_u8(psrc0 + j + cn);
                v7_lv00 = vld1q_u8(psrc1 + j + cn);
                v8_lv00 = vld1q_u8(psrc2 + j + cn);

medianBlur3x3_mainBody:

#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
                SORT9;
#undef PIX_MAX
#undef PIX_MIN

                vst1q_u8(pdst + j, v4_lv19);
            }
        }

        {
            size_t k = colsn - 8;
            uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
            uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
            uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
            uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
            uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
            uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);

            uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
            uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
            uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);

            goto medianBlur3x3_tailBody;

            for (; k >= j - 8; k -= 8)
            {
                v0_lv00 = vld1_u8(psrc0 + k - cn);
                v1_lv00 = vld1_u8(psrc1 + k - cn);
                v2_lv00 = vld1_u8(psrc2 + k - cn);
                v3_lv00 = vld1_u8(psrc0 + k);
                v4_lv00 = vld1_u8(psrc1 + k);
                v5_lv00 = vld1_u8(psrc2 + k);
                v6_lv00 = vld1_u8(psrc0 + k + cn);
                v7_lv00 = vld1_u8(psrc1 + k + cn);
                v8_lv00 = vld1_u8(psrc2 + k + cn);

medianBlur3x3_tailBody:

#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
                SORT9;
#undef PIX_MAX
#undef PIX_MIN

                vst1_u8(pdst + k, v4_lv19);
            }
        }
    }
#else
    (void)size;
    (void)numChannels;
    (void)srcBase;
    (void)srcStride;
    (void)srcMargin;
    (void)dstBase;
    (void)dstStride;
#endif
}

} // namespace CAROTENE_NS
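For orientation (an added sketch, not part of the committed file): SORT9 is the 19-operation median-of-9 network from the Devillard page referenced at the top of this file, and the diagram above draws its comparator wiring. Because PIX_MIN/PIX_MAX map to lane-wise vmin/vmax, one SORT9 expansion yields 16 medians (main body) or 8 (tail) at once, with the result left in v4_lv19. The same network for a single pixel:

static unsigned char median9(unsigned char p[9])
{
#define SWAP(a, b) do { unsigned char lo = p[a] < p[b] ? p[a] : p[b], \
                        hi = p[a] < p[b] ? p[b] : p[a]; \
                        p[a] = lo; p[b] = hi; } while (0)
    SWAP(1, 2); SWAP(4, 5); SWAP(7, 8);
    SWAP(0, 1); SWAP(3, 4); SWAP(6, 7);
    SWAP(1, 2); SWAP(4, 5); SWAP(7, 8);
    p[3] = p[0] > p[3] ? p[0] : p[3];   // PIX_MAX(0, 3)
    p[5] = p[5] < p[8] ? p[5] : p[8];   // PIX_MIN(5, 8)
    SWAP(4, 7);                         // PIX_SORT(4, 7)
    p[6] = p[3] > p[6] ? p[3] : p[6];   // PIX_MAX(3, 6)
    p[4] = p[1] > p[4] ? p[1] : p[4];   // PIX_MAX(1, 4)
    p[2] = p[2] < p[5] ? p[2] : p[5];   // PIX_MIN(2, 5)
    p[4] = p[4] < p[7] ? p[4] : p[7];   // PIX_MIN(4, 7)
    SWAP(4, 2);                         // PIX_SORT(4, 2)
#undef SWAP
    p[4] = p[6] > p[4] ? p[6] : p[4];   // PIX_MAX(6, 4)
    return p[4] < p[2] ? p[4] : p[2];   // PIX_MIN(4, 2) is the median
}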
opencv/3rdparty/carotene/src/min_max.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace {

template <typename T>
struct Min
{
    typedef T type;

    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
                     const typename internal::VecTraits<T>::vec128 & v_src1,
                     typename internal::VecTraits<T>::vec128 & v_dst) const
    {
        v_dst = internal::vminq(v_src0, v_src1);
    }

    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
                     const typename internal::VecTraits<T>::vec64 & v_src1,
                     typename internal::VecTraits<T>::vec64 & v_dst) const
    {
        v_dst = internal::vmin(v_src0, v_src1);
    }

    void operator() (const T * src0, const T * src1, T * dst) const
    {
        dst[0] = std::min(src0[0], src1[0]);
    }
};

template <typename T>
struct Max
{
    typedef T type;

    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
                     const typename internal::VecTraits<T>::vec128 & v_src1,
                     typename internal::VecTraits<T>::vec128 & v_dst) const
    {
        v_dst = internal::vmaxq(v_src0, v_src1);
    }

    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
                     const typename internal::VecTraits<T>::vec64 & v_src1,
                     typename internal::VecTraits<T>::vec64 & v_dst) const
    {
        v_dst = internal::vmax(v_src0, v_src1);
    }

    void operator() (const T * src0, const T * src1, T * dst) const
    {
        dst[0] = std::max(src0[0], src1[0]);
    }
};

} // namespace
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, op<type>()); \
}
#else
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
IMPL_MINMAX(u8)
IMPL_MINMAX(s8)
IMPL_MINMAX(u16)
IMPL_MINMAX(s16)
IMPL_MINMAX(u32)
IMPL_MINMAX(s32)
IMPL_MINMAX(f32)

} // namespace CAROTENE_NS
opencv/3rdparty/carotene/src/minmaxloc.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <limits>
namespace
CAROTENE_NS
{
#ifdef CAROTENE_NEON
namespace
{
template
<
typename
T
>
void
minMaxVals
(
const
Size2D
&
size
,
const
T
*
srcBase
,
ptrdiff_t
srcStride
,
T
*
pMinVal
,
T
*
pMaxVal
)
{
using
namespace
internal
;
typedef
typename
VecTraits
<
T
>::
vec128
vec128
;
typedef
typename
VecTraits
<
T
>::
vec64
vec64
;
u32
step_base
=
32
/
sizeof
(
T
),
step_tail
=
8
/
sizeof
(
T
);
size_t
roiw_base
=
size
.
width
>=
(
step_base
-
1
)
?
size
.
width
-
step_base
+
1
:
0
;
size_t
roiw_tail
=
size
.
width
>=
(
step_tail
-
1
)
?
size
.
width
-
step_tail
+
1
:
0
;
T
maxVal
=
std
::
numeric_limits
<
T
>::
min
();
T
minVal
=
std
::
numeric_limits
<
T
>::
max
();
vec128
v_min_base
=
vdupq_n
(
minVal
),
v_max_base
=
vdupq_n
(
maxVal
);
vec64
v_min_tail
=
vdup_n
(
minVal
),
v_max_tail
=
vdup_n
(
maxVal
);
for
(
size_t
i
=
0
;
i
<
size
.
height
;
++
i
)
{
const
T
*
src
=
getRowPtr
(
srcBase
,
srcStride
,
i
);
size_t
j
=
0
;
for
(;
j
<
roiw_base
;
j
+=
step_base
)
{
prefetch
(
src
+
j
);
vec128
v_src0
=
vld1q
(
src
+
j
),
v_src1
=
vld1q
(
src
+
j
+
16
/
sizeof
(
T
));
v_min_base
=
vminq
(
v_min_base
,
v_src0
);
v_max_base
=
vmaxq
(
v_max_base
,
v_src0
);
v_min_base
=
vminq
(
v_min_base
,
v_src1
);
v_max_base
=
vmaxq
(
v_max_base
,
v_src1
);
}
for
(;
j
<
roiw_tail
;
j
+=
step_tail
)
{
vec64
v_src0
=
vld1
(
src
+
j
);
v_min_tail
=
vmin
(
v_min_tail
,
v_src0
);
v_max_tail
=
vmax
(
v_max_tail
,
v_src0
);
}
for
(;
j
<
size
.
width
;
j
++
)
{
T
srcval
=
src
[
j
];
minVal
=
std
::
min
(
srcval
,
minVal
);
maxVal
=
std
::
max
(
srcval
,
maxVal
);
}
}
// collect min & max values
T
ar
[
16
/
sizeof
(
T
)];
vst1q
(
ar
,
vcombine
(
vmin
(
v_min_tail
,
vmin
(
vget_low
(
v_min_base
),
vget_high
(
v_min_base
))),
vmax
(
v_max_tail
,
vmax
(
vget_low
(
v_max_base
),
vget_high
(
v_max_base
)))));
for
(
size_t
x
=
0
;
x
<
8u
/
sizeof
(
T
);
++
x
)
{
minVal
=
std
::
min
(
minVal
,
ar
[
x
]);
maxVal
=
std
::
max
(
maxVal
,
ar
[
x
+
8
/
sizeof
(
T
)]);
}
if
(
pMaxVal
)
*
pMaxVal
=
maxVal
;
if
(
pMinVal
)
*
pMinVal
=
minVal
;
}
}
// namespace
#endif
void
minMaxVals
(
const
Size2D
&
size
,
const
u8
*
srcBase
,
ptrdiff_t
srcStride
,
u8
*
pMinVal
,
u8
*
pMaxVal
)
{
internal
::
assertSupportedConfiguration
();
#ifdef CAROTENE_NEON
minMaxVals
<
u8
>
(
size
,
srcBase
,
srcStride
,
pMinVal
,
pMaxVal
);
#else
(
void
)
size
;
(
void
)
srcBase
;
(
void
)
srcStride
;
(
void
)
pMinVal
;
(
void
)
pMaxVal
;
#endif
}
void minMaxVals(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * pMinVal, s16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<s16>(size,
                    srcBase, srcStride,
                    pMinVal, pMaxVal);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#endif
}

void minMaxVals(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * pMinVal, u16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<u16>(size,
                    srcBase, srcStride,
                    pMinVal, pMaxVal);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#endif
}

void minMaxVals(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * pMinVal, s32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<s32>(size,
                    srcBase, srcStride,
                    pMinVal, pMaxVal);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#endif
}

void minMaxVals(const Size2D &size,
                const u32 * srcBase, ptrdiff_t srcStride,
                u32 * pMinVal, u32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minMaxVals<u32>(size,
                    srcBase, srcStride,
                    pMinVal, pMaxVal);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#endif
}
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for (size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr(srcBase, srcStride, l);
        if (size.width >= 16)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (4 - 1);
            for (size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                float32x4_t n_min = vdupq_n_f32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t n_max = vdupq_n_f32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);

                for (; i < bound; i += 4)
                {
                    internal::prefetch(src + i);
                    float32x4_t line = vld1q_f32(src + i);

                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);

                    n_min = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);

                    // idx[] += 4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }

                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];

                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                if (minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if (maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        for (; i < size.width; ++i)
        {
            float val = src[i];
            if (val < minVal)
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if (val > maxVal)
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
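// Usage sketch (hypothetical caller, not part of this file): locate the
// extrema of a packed single-channel f32 response map; `response`, `w`, `h`
// are assumed names. Since the comparisons are strict and lane ties prefer
// the smaller index, the first occurrence in row-major scan order wins.
//
//     f32 minV, maxV;
//     size_t minX, minY, maxX, maxY;
//     CAROTENE_NS::minMaxLoc(Size2D(w, h), response, w * sizeof(f32),
//                            minV, minX, minY, maxV, maxX, maxY);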
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               const u8 * maskBase, ptrdiff_t maskStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = std::numeric_limits<f32>::max();
    minCol = size.width;
    minRow = size.height;
    maxVal = -std::numeric_limits<f32>::max();
    maxCol = size.width;
    maxRow = size.height;
    for (size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr(srcBase, srcStride, l);
        const u8 * mask = internal::getRowPtr(maskBase, maskStride, l);
        if (size.width >= 16)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t uOne = vdupq_n_u32(1);
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (4 - 1);
            for (size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                float32x4_t n_min = vdupq_n_f32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t n_max = vdupq_n_f32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);

                for (; i < bound; i += 4)
                {
                    internal::prefetch(src + i);
                    internal::prefetch(mask + i);
                    float32x4_t line = vld1q_f32(src + i);
                    uint8x8_t maskLine = vld1_u8(mask + i);
                    uint32x4_t maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine)));
                    maskLine4 = vcgeq_u32(maskLine4, uOne);

                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);
                    minmask = vandq_u32(minmask, maskLine4);
                    maxmask = vandq_u32(maxmask, maskLine4);

                    n_min = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);

                    // idx[] += 4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }

                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];

                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                if (minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if (maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        for (; i < size.width; i++)
        {
            if (!mask[i])
                continue;
            f32 val = src[i];
            if (val < minVal)
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            if (val > maxVal)
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)maskBase;
    (void)maskStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
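// Note on the masked variant above: a pixel participates when its mask byte
// is nonzero (the vector path tests mask >= 1, the scalar tail skips zero
// bytes). If the mask rejects every pixel, the outputs keep their sentinel
// initialization (minCol/maxCol equal size.width, minRow/maxRow equal
// size.height), which a caller can use to detect the all-masked case.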
void minMaxLoc(const Size2D &size,
               const s32 * srcBase, ptrdiff_t srcStride,
               s32 &minVal, size_t &minCol, size_t &minRow,
               s32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for (size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const s32 * src = internal::getRowPtr(srcBase, srcStride, l);
        if (size.width >= 16)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (4 - 1);
            for (size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                int32x4_t n_min = vdupq_n_s32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                int32x4_t n_max = vdupq_n_s32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);

                for (; i < bound; i += 4)
                {
                    internal::prefetch(src + i);
                    int32x4_t line = vld1q_s32(src + i);

                    uint32x4_t minmask = vcltq_s32(line, n_min);
                    uint32x4_t maxmask = vcgtq_s32(line, n_max);

                    n_min = vbslq_s32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_s32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);

                    // idx[] += 4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }

                s32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];

                vst1q_s32(fmin, n_min);
                vst1q_s32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                for (s32 j = 1; j < 4; ++j)
                {
                    s32 minval = fmin[j];
                    s32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                if (minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if (maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        for (; i < size.width; ++i)
        {
            s32 val = src[i];
            if (val < minVal)
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if (val > maxVal)
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
void minMaxLoc(const Size2D &size,
               const s16 * srcBase, ptrdiff_t srcStride,
               s16 &minVal, size_t &minCol, size_t &minRow,
               s16 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for (size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const s16 * src = internal::getRowPtr(srcBase, srcStride, l);
        if (size.width >= 32)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c8 = vdupq_n_u32(8);
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (8 - 1);
            for (size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                int16x8_t n_min = vdupq_n_s16(minVal);
                uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
                int16x8_t n_max = vdupq_n_s16(maxVal);
                uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);

                for (; i < bound; i += 8)
                {
                    internal::prefetch(src + i);
                    int16x8_t line = vld1q_s16(src + i);

                    uint16x8_t minmask = vcltq_s16(line, n_min);
                    uint16x8_t maxmask = vcgtq_s16(line, n_max);

                    n_min = vbslq_s16(minmask, line, n_min);
                    uint16x4_t minml = vget_low_u16(minmask);
                    uint16x4_t minmh = vget_high_u16(minmask);
                    uint32x4_t minml2 = vmovl_u16(minml);
                    uint32x4_t minmh2 = vmovl_u16(minmh);
                    minml2 = vqshlq_n_u32(minml2, 31);
                    minmh2 = vqshlq_n_u32(minmh2, 31);
                    n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
                    n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);

                    n_max = vbslq_s16(maxmask, line, n_max);
                    uint16x4_t maxml = vget_low_u16(maxmask);
                    uint16x4_t maxmh = vget_high_u16(maxmask);
                    uint32x4_t maxml2 = vmovl_u16(maxml);
                    uint32x4_t maxmh2 = vmovl_u16(maxmh);
                    maxml2 = vqshlq_n_u32(maxml2, 31);
                    maxmh2 = vqshlq_n_u32(maxmh2, 31);
                    n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
                    n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);

                    // idx[] += 8
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
                }

                // fix high part of indexes
                uint32x4_t c4 = vdupq_n_u32((int32_t)4);
                n_minIdxh = vaddq_u32(n_minIdxh, c4);
                n_maxIdxh = vaddq_u32(n_maxIdxh, c4);

                s16 fmin[8], fmax[8];
                u32 fminIdx[8], fmaxIdx[8];

                vst1q_s16(fmin, n_min);
                vst1q_s16(fmax, n_max);
                vst1q_u32(fminIdx + 0, n_minIdxl);
                vst1q_u32(fmaxIdx + 0, n_maxIdxl);
                vst1q_u32(fminIdx + 4, n_minIdxh);
                vst1q_u32(fmaxIdx + 4, n_maxIdxh);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                for (s32 j = 1; j < 8; ++j)
                {
                    s16 minval = fmin[j];
                    s16 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                if (minIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if (maxIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        for (; i < size.width; ++i)
        {
            short val = src[i];
            if (val < minVal)
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if (val > maxVal)
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
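// Note on the 16-bit variants (above, and the u16 one below): comparison
// masks come out as 16-bit lanes, but the running indexes live in two
// uint32x4_t halves. Each mask half is widened with vmovl_u16 (a hit
// becomes 0x0000FFFF) and then shifted left by 31 with the saturating
// vqshlq_n_u32, which turns any nonzero lane into 0xFFFFFFFF, a proper
// all-ones select mask for vbslq_u32. The high half's indexes are offset
// by +4 after the loop because those lanes track elements 4..7 of each
// 8-element step.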
void minMaxLoc(const Size2D &size,
               const u16 * srcBase, ptrdiff_t srcStride,
               u16 &minVal, size_t &minCol, size_t &minRow,
               u16 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for (size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, l);
        if (size.width >= 32)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c8 = vdupq_n_u32(8);
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (8 - 1);
            for (size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                uint16x8_t n_min = vdupq_n_u16(minVal);
                uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
                uint16x8_t n_max = vdupq_n_u16(maxVal);
                uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);

                for (; i < bound; i += 8)
                {
                    internal::prefetch(src + i);
                    uint16x8_t line = vld1q_u16(src + i);

                    uint16x8_t minmask = vcltq_u16(line, n_min);
                    uint16x8_t maxmask = vcgtq_u16(line, n_max);

                    n_min = vbslq_u16(minmask, line, n_min);
                    uint16x4_t minml = vget_low_u16(minmask);
                    uint16x4_t minmh = vget_high_u16(minmask);
                    uint32x4_t minml2 = vmovl_u16(minml);
                    uint32x4_t minmh2 = vmovl_u16(minmh);
                    minml2 = vqshlq_n_u32(minml2, 31);
                    minmh2 = vqshlq_n_u32(minmh2, 31);
                    n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
                    n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);

                    n_max = vbslq_u16(maxmask, line, n_max);
                    uint16x4_t maxml = vget_low_u16(maxmask);
                    uint16x4_t maxmh = vget_high_u16(maxmask);
                    uint32x4_t maxml2 = vmovl_u16(maxml);
                    uint32x4_t maxmh2 = vmovl_u16(maxmh);
                    maxml2 = vqshlq_n_u32(maxml2, 31);
                    maxmh2 = vqshlq_n_u32(maxmh2, 31);
                    n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
                    n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);

                    // idx[] += 8
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
                }

                // fix high part of indexes
                uint32x4_t c4 = vdupq_n_u32(4);
                n_minIdxh = vaddq_u32(n_minIdxh, c4);
                n_maxIdxh = vaddq_u32(n_maxIdxh, c4);

                u16 fmin[8], fmax[8];
                u32 fminIdx[8], fmaxIdx[8];

                vst1q_u16(fmin, n_min);
                vst1q_u16(fmax, n_max);
                vst1q_u32(fminIdx + 0, n_minIdxl);
                vst1q_u32(fmaxIdx + 0, n_maxIdxl);
                vst1q_u32(fminIdx + 4, n_minIdxh);
                vst1q_u32(fmaxIdx + 4, n_maxIdxh);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                for (s32 j = 1; j < 8; ++j)
                {
                    u16 minval = fmin[j];
                    u16 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                if (minIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if (maxIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        for (; i < size.width; ++i)
        {
            u16 val = src[i];
            if (val < minVal)
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if (val > maxVal)
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
#ifdef CAROTENE_NEON

namespace
{

void minMaxLocBlock(const u8 * src, u32 len,
                    u8 &minVal, u16 &minIdx,
                    u8 &maxVal, u16 &maxIdx)
{
    u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    uint8x16_t n_min = vdupq_n_u8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    uint8x16_t n_max = vdupq_n_u8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16 = vdupq_n_u16(16);
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);

    s32 i = 0;
    s32 bound = len - (16 - 1);
    for (; i < bound; i += 16)
    {
        internal::prefetch(src + i);
        uint8x16_t line = vld1q_u8(src + i);

        uint8x16_t minmask = vcltq_u8(line, n_min);
        uint8x16_t maxmask = vcgtq_u8(line, n_max);

        n_min = vbslq_u8(minmask, line, n_min);
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);

        n_max = vbslq_u8(maxmask, line, n_max);
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);

        // idx[] += 16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }

    // fix high part of indexes
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);

    u8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];
    /*{
        uint8x8_t min_low = vget_low_u8(n_min);
        uint8x8_t min_high = vget_high_u8(n_min);
        uint8x8_t max_low = vget_low_u8(n_max);
        uint8x8_t max_high = vget_high_u8(n_max);
        uint8x8_t minmask = vclt_u8(min_low, min_high);
        uint8x8_t maxmask = vcgt_u8(max_low, max_high);
        uint8x8_t min2 = vbsl_u8(minmask, min_low, min_high);
        uint8x8_t max2 = vbsl_u8(maxmask, max_low, max_high);
        uint16x8_t minidxmask = vmovl_u8(minmask);
        uint16x8_t maxidxmask = vmovl_u8(maxmask);
        minidxmask = vqshlq_n_u16(minidxmask, 15);
        maxidxmask = vqshlq_n_u16(maxidxmask, 15);
        uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh);
        uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh);
        vst1_u8((uint8_t*)fmin, min2);
        vst1_u8((uint8_t*)fmax, max2);
        vst1q_u16((uint16_t*)(fminIdx), n_minIdx);
        vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx);
    }*/
    vst1q_u8(fmin, n_min);
    vst1q_u8(fmax, n_max);
    vst1q_u16(fminIdx + 0, n_minIdxl);
    vst1q_u16(fmaxIdx + 0, n_maxIdxl);
    vst1q_u16(fminIdx + 8, n_minIdxh);
    vst1q_u16(fmaxIdx + 8, n_maxIdxh);

    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];

    for (s32 j = 1; j < 16; ++j)
    {
        u8 minval = fmin[j];
        u8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }

    for (; i < (s32)len; ++i)
    {
        u8 val = src[i];
        if (val < minVal)
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if (val > maxVal)
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}
void minMaxLocBlock(const s8 * src, u32 len,
                    s8 &minVal, u16 &minIdx,
                    s8 &maxVal, u16 &maxIdx)
{
    u16 tmp0123[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    int8x16_t n_min = vdupq_n_s8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    int8x16_t n_max = vdupq_n_s8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16 = vdupq_n_u16(16);
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);

    s32 i = 0;
    s32 bound = len - (16 - 1);
    for (; i < bound; i += 16)
    {
        internal::prefetch(src + i);
        int8x16_t line = vld1q_s8(src + i);

        uint8x16_t minmask = vcltq_s8(line, n_min);
        uint8x16_t maxmask = vcgtq_s8(line, n_max);

        n_min = vbslq_s8(minmask, line, n_min);
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);

        n_max = vbslq_s8(maxmask, line, n_max);
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);

        // idx[] += 16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }

    // fix high part of indexes
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);

    s8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];
    vst1q_s8(fmin, n_min);
    vst1q_s8(fmax, n_max);
    vst1q_u16(fminIdx + 0, n_minIdxl);
    vst1q_u16(fmaxIdx + 0, n_maxIdxl);
    vst1q_u16(fminIdx + 8, n_minIdxh);
    vst1q_u16(fmaxIdx + 8, n_maxIdxh);

    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];

    for (s32 j = 1; j < 16; ++j)
    {
        s8 minval = fmin[j];
        s8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }

    for (; i < (s32)len; ++i)
    {
        s8 val = src[i];
        if (val < minVal)
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if (val > maxVal)
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}

} // namespace

#endif // CAROTENE_NEON

#define USHORT_BLOCK_MAX_SIZE (1 << 16)
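// The block helpers above keep element positions in 16-bit lanes, so one
// call can cover at most 1 << 16 = 65536 elements before a u16 index would
// wrap; the u8/s8 minMaxLoc implementations below therefore walk wide rows
// in blocks of this size and rebase each block's local u16 offset to an
// absolute column.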
void minMaxLoc(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 &minVal, size_t &minCol, size_t &minRow,
               u8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for (size_t l = 0; l < size.height; ++l)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, l);
        if (size.width > 128)
        {
            for (size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                u8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);
                if (locMinVal == 0 && locMaxVal == 255)
                {
                    minCol = blockStart + locMinIdx;
                    maxCol = blockStart + locMaxIdx;
                    minRow = l;
                    maxRow = l;
                    minVal = 0;
                    maxVal = 255;
                    return;
                }
                else
                {
                    if (locMinVal < minVal)
                    {
                        minCol = blockStart + locMinIdx;
                        minRow = l;
                        minVal = locMinVal;
                    }
                    if (locMaxVal > maxVal)
                    {
                        maxCol = blockStart + locMaxIdx;
                        maxRow = l;
                        maxVal = locMaxVal;
                    }
                }
            }
        }
        else
        {
            for (size_t i = 0; i < size.width; ++i)
            {
                u8 val = src[i];
                if (val < minVal)
                {
                    minVal = val;
                    minCol = i;
                    minRow = l;
                }
                else if (val > maxVal)
                {
                    maxVal = val;
                    maxCol = i;
                    maxRow = l;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
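// Usage sketch (hypothetical caller, not part of this file): `gray`, `w`,
// `h` are assumed names for a packed 8-bit image.
//
//     u8 minV, maxV;
//     size_t minX, minY, maxX, maxY;
//     CAROTENE_NS::minMaxLoc(Size2D(w, h), gray, w,
//                            minV, minX, minY, maxV, maxX, maxY);
//
// For u8 (and s8 below) the search returns early once a block has hit both
// saturation values (0 and 255, or -128 and 127), since no later pixel can
// improve either bound.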
void minMaxLoc(const Size2D &size,
               const s8 * srcBase, ptrdiff_t srcStride,
               s8 &minVal, size_t &minCol, size_t &minRow,
               s8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for (size_t l = 0; l < size.height; ++l)
    {
        const s8 * src = internal::getRowPtr(srcBase, srcStride, l);
        if (size.width > 128)
        {
            for (size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                s8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);
                if (locMinVal == -128 && locMaxVal == 127)
                {
                    minCol = blockStart + locMinIdx;
                    maxCol = blockStart + locMaxIdx;
                    minRow = l;
                    maxRow = l;
                    minVal = -128;
                    maxVal = 127;
                    return;
                }
                else
                {
                    if (locMinVal < minVal)
                    {
                        minCol = blockStart + locMinIdx;
                        minRow = l;
                        minVal = locMinVal;
                    }
                    if (locMaxVal > maxVal)
                    {
                        maxCol = blockStart + locMaxIdx;
                        maxRow = l;
                        maxVal = locMaxVal;
                    }
                }
            }
        }
        else
        {
            for (size_t i = 0; i < size.width; ++i)
            {
                s8 val = src[i];
                if (val < minVal)
                {
                    minVal = val;
                    minRow = l;
                    minCol = i;
                }
                else if (val > maxVal)
                {
                    maxVal = val;
                    maxRow = l;
                    maxCol = i;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}

} // namespace CAROTENE_NS
opencv/3rdparty/carotene/src/morph.cpp
0 → 100644
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <algorithm>
#include <limits>
#include <vector>
#include <cstring>
namespace CAROTENE_NS
{

bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
{
    return isSupportedConfiguration() &&
           size.width >= 16 &&
           (border == BORDER_MODE_CONSTANT ||
            border == BORDER_MODE_REPLICATE);
}
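// Usage sketch (hypothetical caller, not part of this file): the 3x3 fast
// path requires width >= 16 and a constant or replicate border, so callers
// are expected to probe support first and fall back otherwise; `roi`,
// `src`, `srcStep`, `dst`, `dstStep` are assumed names.
//
//     if (isMorph3x3Supported(roi, BORDER_MODE_REPLICATE))
//         erode3x3(roi, src, srcStep, dst, dstStep,
//                  BORDER_MODE_REPLICATE, 0);
//     // else: use the generic erode() further down, or a scalar fallback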
#ifdef CAROTENE_NEON

namespace
{

struct ErodeVecOp
{
    ErodeVecOp() : borderValue(0) {}

    ErodeVecOp(BORDER_MODE border, u8 borderValue_) : borderValue(borderValue_)
    {
        if (border == BORDER_MODE_REPLICATE)
            borderValue = std::numeric_limits<u8>::max();
    }

    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vminq_u8(a, b);
    }

    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmin_u8(a, b);
    }

    inline u8 operator()(u8 a, u8 b) const
    {
        return std::min(a, b);
    }

    u8 borderValue;
};

struct DilateVecOp
{
    DilateVecOp() : borderValue(0) {}

    DilateVecOp(BORDER_MODE border, u8 borderValue_) : borderValue(borderValue_)
    {
        if (border == BORDER_MODE_REPLICATE)
            borderValue = std::numeric_limits<u8>::min();
    }

    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vmaxq_u8(a, b);
    }

    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmax_u8(a, b);
    }

    inline u8 operator()(u8 a, u8 b) const
    {
        return std::max(a, b);
    }

    u8 borderValue;
};
template <typename VecOp>
void morph3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, const VecOp & vop)
{
    u8 borderValue = vop.borderValue;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    const uint8x16_t v_zero = vdupq_n_u8(0);
    const uint8x16_t v_border = vdupq_n_u8(borderValue);

    uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL :
                           internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL :
                           internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        u8 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 16)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
            uint8x16_t x1 = vld1q_u8(srow1 + x);
            uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 16 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = vop(srow1[x4], vop(srow2 ? srow2[x4] : borderValue,
                                               srow0 ? srow0[x4] : borderValue));

                currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
            }

            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value
            tnext = vop(vop(x0, x1), x2);

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));

                continue;
            }

            // combine 3 "shifted" vectors
            t0 = vextq_u8(tprev, tcurr, 15);
            t1 = tcurr;
            t2 = vextq_u8(tcurr, tnext, 1);

            // and add them
            t0 = vop(t0, vop(t1, t2));

            vst1q_u8(drow + x - 16, t0);
        }

        x -= 16;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
            }
            else
                nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
                                srow0 ? srow0[x + 1] : borderValue),
                            srow1[x + 1]);

            drow[x] = vop(prevx, vop(currx, nextx));

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
}

} // namespace

#endif
void erode3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
    morph3x3(size,
             srcBase, srcStride,
             dstBase, dstStride,
             border, ErodeVecOp(border, borderValue));
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}

void dilate3x3(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride,
               BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
    morph3x3(size,
             srcBase, srcStride,
             dstBase, dstStride,
             border, DilateVecOp(border, borderValue));
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
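// Usage sketch (hypothetical caller, not part of this file): one 3x3
// dilation pass over a packed 8-bit image; `src`, `dst`, `w`, `h` are
// assumed names. With BORDER_MODE_REPLICATE the borderValue argument is
// overridden inside the functor (255 for erode, 0 for dilate, the
// identities of min and max), so its value does not matter there.
//
//     dilate3x3(Size2D(w, h), src, w, dst, w, BORDER_MODE_REPLICATE, 0);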
#ifdef CAROTENE_NEON

namespace
{

template <class VecUpdate>
void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
{
    size_t i, j, k;
    size_t width16 = (width & -16) * cn;
    size_t width8 = (width & -8) * cn;
    width *= cn;

    if (ksize == 1)
    {
        for (i = 0; i < width; i++)
            dst[i] = src[i];
        return;
    }

    ksize = ksize * cn;
    VecUpdate updateOp;

    switch (cn)
    {
    case 1:
        for (i = 0; i < width16; i += 16)
        {
            const u8* sptr = src + i;
            uint8x16_t s = vld1q_u8(sptr);
            internal::prefetch(sptr);

            for (k = 1; k < ksize; ++k)
                s = updateOp(s, vld1q_u8(sptr + k));

            vst1q_u8(dst + i, s);
        }
        for (; i < width8; i += 8)
        {
            const u8* sptr = src + i;
            uint8x8_t s = vld1_u8(sptr);
            internal::prefetch(sptr);

            for (k = 1; k < ksize; ++k)
                s = updateOp(s, vld1_u8(sptr + k));

            vst1_u8(dst + i, s);
        }
        break;
    default:
        for (i = 0; i < width16; i += 16)
        {
            uint8x16_t s = vld1q_u8(src + i);
            internal::prefetch(src + i);

            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1q_u8(src + i + k));

            vst1q_u8(dst + i, s);
        }
        for (; i < width8; i += 8)
        {
            uint8x8_t s = vld1_u8(src + i);
            internal::prefetch(src + i);

            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1_u8(src + i + k));

            vst1_u8(dst + i, s);
        }
        break;
    }

    ptrdiff_t i0 = i;
    for (k = 0; k < (size_t)cn; k++, src++, dst++)
    {
        for (i = i0; i <= width - cn * 2; i += cn * 2)
        {
            const u8* s = src + i;
            u8 m = s[cn];
            for (j = cn * 2; j < ksize; j += cn)
                m = updateOp(m, s[j]);
            dst[i] = updateOp(m, s[0]);
            dst[i + cn] = updateOp(m, s[j]);
        }
        for ( ; i < width; i += cn)
        {
            const u8* s = src + i;
            u8 m = s[0];
            for (j = cn; j < ksize; j += cn)
                m = updateOp(m, s[j]);
            dst[i] = m;
        }
    }
}
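// Note on the row/column split: erosion and dilation with a full
// rectangular structuring element are separable. A min (or max) over a
// ksize.width x ksize.height window equals a horizontal pass of width
// ksize.width (MorphRow above) followed by a vertical pass of height
// ksize.height over the row results (MorphColumn below), cutting the work
// per pixel from roughly width*height comparisons to width + height.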
template <class VecUpdate>
void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
{
    size_t i, k;
    size_t width32 = width & -32;
    VecUpdate updateOp;

    uint8x16_t x0, x1, s0, s1;

    if (ksize == 3)
    {
        for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
        {
            for (i = 0; i < width32; i += 32)
            {
                const u8* sptr = src[1] + i;
                s0 = vld1q_u8(sptr);
                s1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);

                sptr = src[2] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);

                sptr = src[0] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + i, updateOp(s0, x0));
                vst1q_u8(dst + i + 16, updateOp(s1, x1));

                sptr = src[3] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
            }
            for (; i < width; i++)
            {
                u8 s = src[1][i];

                for (k = 2; k < ksize; k++)
                    s = updateOp(s, src[k][i]);

                dst[i] = updateOp(s, src[0][i]);
                dst[i + dststep] = updateOp(s, src[k][i]);
            }
        }
    }
    else if (ksize > 1)
        for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
        {
            for (i = 0; i < width32; i += 32)
            {
                const u8* sptr = src[1] + i;
                s0 = vld1q_u8(sptr);
                s1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);

                for (k = 2; k < ksize; k++)
                {
                    sptr = src[k] + i;
                    x0 = vld1q_u8(sptr);
                    x1 = vld1q_u8(sptr + 16);
                    internal::prefetch(sptr);
                    s0 = updateOp(s0, x0);
                    s1 = updateOp(s1, x1);
                }

                sptr = src[0] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + i, updateOp(s0, x0));
                vst1q_u8(dst + i + 16, updateOp(s1, x1));

                sptr = src[k] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
            }
            for (; i < width; i++)
            {
                u8 s = src[1][i];
                for (k = 2; k < ksize; k++)
                    s = updateOp(s, src[k][i]);
                dst[i] = updateOp(s, src[0][i]);
                dst[i + dststep] = updateOp(s, src[k][i]);
            }
        }

    for (; count > 0; count--, dst += dststep, src++)
    {
        for (i = 0; i < width32; i += 32)
        {
            const u8* sptr = src[0] + i;
            s0 = vld1q_u8(sptr);
            s1 = vld1q_u8(sptr + 16);
            internal::prefetch(sptr);

            for (k = 1; k < ksize; k++)
            {
                sptr = src[k] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);
            }

            vst1q_u8(dst + i, s0);
            vst1q_u8(dst + i + 16, s1);
        }
        for (; i < width; i++)
        {
            u8 s = src[0][i];
            for (k = 1; k < ksize; k++)
                s = updateOp(s, src[k][i]);
            dst[i] = s;
        }
    }
}
template <class Op>
inline void morphology(const Size2D &ssize, u32 cn,
                       const u8 * srcBase, ptrdiff_t srcStride,
                       u8 * dstBase, ptrdiff_t dstStride,
                       const Size2D &ksize,
                       size_t anchorX, size_t anchorY,
                       BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
                       const u8 * borderValues, Margin borderMargin)
{
    //Temporary buffers common for all iterations
    std::vector<u8> _srcRow(cn * (ssize.width + ksize.width - 1));
    u8* srcRow = &_srcRow[0];

    size_t bufRows = std::max<size_t>(ksize.height + 3,
                                      std::max<size_t>(anchorY, ksize.height - anchorY - 1) * 2 + 1);
    std::vector<u8*> _rows(bufRows);
    u8** rows = &_rows[0];

    // adjust swidthcn so that the used part of buffers stays compact in memory
    ptrdiff_t swidthcn = cn * ((ssize.width + 15) & -16); // cn * (aligned ssize.width size)
    std::vector<u8> _ringBuf(swidthcn * bufRows + 16);
    u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);

    size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
    std::vector<ptrdiff_t> _borderTab(borderLength);
    ptrdiff_t * borderTab = &_borderTab[0];

    std::vector<u8> _constBorderValue;
    std::vector<u8> _constBorderRow;
    u8 * constBorderValue = NULL;
    u8 * constBorderRow = NULL;

    if (rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT)
    {
        _constBorderValue.resize(borderLength);
        constBorderValue = &_constBorderValue[0];
        size_t i;
        for (i = 0; i < cn; i++)
            constBorderValue[i] = borderValues[i];
        for (; i < borderLength; i++)
            constBorderValue[i] = constBorderValue[i - cn];

        if (columnBorderType == BORDER_MODE_CONSTANT)
        {
            _constBorderRow.resize(cn * (ssize.width + ksize.width - 1 + 16));
            constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
            size_t N = (ssize.width + ksize.width - 1) * cn;
            for (i = 0; i < N; i += borderLength)
            {
                size_t n = std::min(borderLength, N - i);
                for (size_t j = 0; j < n; j++)
                    srcRow[i + j] = constBorderValue[j];
            }
            MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
        }
    }

    Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
                     ssize.height + borderMargin.top + borderMargin.bottom);

    ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
    ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);

    // recompute border tables
    if (dx1 > 0 || dx2 > 0)
    {
        if (rowBorderType == BORDER_MODE_CONSTANT)
        {
            memcpy(srcRow, &constBorderValue[0], dx1 * cn);
            memcpy(srcRow + (ssize.width + ksize.width - 1 - dx2) * cn, &constBorderValue[0], dx2 * cn);
        }
        else
        {
            ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;

            ptrdiff_t wholeWidth = wholeSize.width;

            ptrdiff_t i, j;
            for (i = 0; i < dx1; i++)
            {
                ptrdiff_t p0 = (internal::borderInterpolate(i - dx1, wholeWidth, rowBorderType) + xofs1) * cn;
                for (j = 0; j < (ptrdiff_t)cn; j++)
                    borderTab[i * cn + j] = p0 + j;
            }

            for (i = 0; i < dx2; i++)
            {
                ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1) * cn;
                for (j = 0; j < (ptrdiff_t)cn; j++)
                    borderTab[(i + dx1) * cn + j] = p0 + j;
            }
        }
    }

    ptrdiff_t startY, startY0, endY, rowCount;
    startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
    endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);

    const u8* src = srcBase + (startY - borderMargin.top) * srcStride;
    u8* dst = dstBase;

    ptrdiff_t width = ssize.width, kwidth = ksize.width;
    ptrdiff_t kheight = ksize.height, ay = anchorY;
    ptrdiff_t width1 = ssize.width + kwidth - 1;
    ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
    bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
    ptrdiff_t dy = 0, i = 0;

    src -= xofs1 * cn;
    ptrdiff_t count = endY - startY;
    rowCount = 0;

    for (;; dst += dstStride * i, dy += i)
    {
        ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for ( ; dcount-- > 0; src += srcStride)
        {
            ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
            u8* brow = ringBuf + bi * swidthcn;

            if ((size_t)(++rowCount) > bufRows)
            {
                --rowCount;
                ++startY;
            }

            memcpy(srcRow + dx1 * cn, src, (width1 - dx2 - dx1) * cn);

            if (makeBorder)
            {
                for (i = 0; i < (ptrdiff_t)(dx1 * cn); i++)
                    srcRow[i] = src[borderTab[i]];
                for (i = 0; i < (ptrdiff_t)(dx2 * cn); i++)
                    srcRow[i + (width1 - dx2) * cn] = src[borderTab[i + dx1 * cn]];
            }

            MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
        }

        ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
        for (i = 0; i < max_i; i++)
        {
            ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
                                                         wholeSize.height, columnBorderType);
            if (srcY < 0) // can happen only with constant border type
                rows[i] = constBorderRow;
            else
            {
                if (srcY >= startY + rowCount)
                    break;
                ptrdiff_t bi = (srcY - startY0) % bufRows;
                rows[i] = ringBuf + bi * swidthcn;
            }
        }
        if (i < kheight)
            break;
        i -= kheight - 1;
        MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width * cn, ksize.height);
    }
}

} // namespace

#endif // CAROTENE_NEON
void erode(const Size2D &ssize, u32 cn,
           const u8 * srcBase, ptrdiff_t srcStride,
           u8 * dstBase, ptrdiff_t dstStride,
           const Size2D &ksize,
           size_t anchorX, size_t anchorY,
           BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
           const u8 * borderValues, Margin borderMargin)
{
    internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 &&
                                           anchorX < ksize.width && anchorY < ksize.height);
#ifdef CAROTENE_NEON
    morphology<ErodeVecOp>(ssize, cn,
                           srcBase, srcStride,
                           dstBase, dstStride,
                           ksize, anchorX, anchorY,
                           rowBorderType, columnBorderType,
                           borderValues, borderMargin);
#else
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
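// Usage sketch (hypothetical caller, not part of this file): 5x5 erosion of
// a 3-channel interleaved u8 image with a centered anchor; `src`, `dst`,
// `w`, `h` are assumed names, and the sketch assumes a zero Margin means
// the ROI is the whole image (no pixels exist beyond it).
//
//     const u8 borderVals[3] = { 255, 255, 255 };
//     erode(Size2D(w, h), 3, src, w * 3, dst, w * 3,
//           Size2D(5, 5), 2, 2,
//           BORDER_MODE_REPLICATE, BORDER_MODE_REPLICATE,
//           borderVals, Margin());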
void dilate(const Size2D &ssize, u32 cn,
            const u8 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            const Size2D &ksize,
            size_t anchorX, size_t anchorY,
            BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
            const u8 * borderValues, Margin borderMargin)
{
    internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 &&
                                           anchorX < ksize.width && anchorY < ksize.height);
#ifdef CAROTENE_NEON
    morphology<DilateVecOp>(ssize, cn,
                            srcBase, srcStride,
                            dstBase, dstStride,
                            ksize, anchorX, anchorY,
                            rowBorderType, columnBorderType,
                            borderValues, borderMargin);
#else
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}

} // namespace CAROTENE_NS