Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
02684438
Commit
02684438
authored
Jul 13, 2022
by
carlushuang
Browse files
add BiasAddRelu
parent
4b448373
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
410 additions
and
4 deletions
+410
-4
example/cpu_02_conv2d_fwd_bias_relu_add/cpu_conv2d_fwd_bias_relu_add.cpp
...conv2d_fwd_bias_relu_add/cpu_conv2d_fwd_bias_relu_add.cpp
+117
-3
include/ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp
...nsor_operation/cpu/element/element_wise_operation_cpu.hpp
+45
-0
include/ck/tensor_operation/cpu/grid/gridwise_gemm_bias_activation_add_avx2.hpp
...ation/cpu/grid/gridwise_gemm_bias_activation_add_avx2.hpp
+4
-0
include/ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp
.../threadwise_tensor_slice_transfer_avx2_specialization.hpp
+1
-1
library/src/tensor_operation_instance/cpu/conv2d_fwd_bias_activation_add/device_conv2d_bias_activation_add_avx2_nhwc_kyxc_nhwk_instance.cpp
...nv2d_bias_activation_add_avx2_nhwc_kyxc_nhwk_instance.cpp
+81
-0
library/src/tensor_operation_instance/cpu/conv2d_fwd_bias_activation_add/device_conv2d_bias_activation_add_avx2_nhwc_kyxck8_nhwk_instance.cpp
...2d_bias_activation_add_avx2_nhwc_kyxck8_nhwk_instance.cpp
+81
-0
library/src/tensor_operation_instance/cpu/conv2d_fwd_bias_activation_add/device_conv2d_bias_activation_add_avx2_nhwc_yxck_nhwk_instance.cpp
...nv2d_bias_activation_add_avx2_nhwc_yxck_nhwk_instance.cpp
+81
-0
No files found.
example/cpu_02_conv2d_fwd_bias_relu_add/cpu_conv2d_fwd_bias_relu_add.cpp
View file @
02684438
...
@@ -19,7 +19,8 @@
...
@@ -19,7 +19,8 @@
#define TEST_FUSION_BIAS_RELU_ADD 0
#define TEST_FUSION_BIAS_RELU_ADD 0
#define TEST_FUSION_BIAS_RELU 1
#define TEST_FUSION_BIAS_RELU 1
#define TEST_FUSION_BIAS 2
#define TEST_FUSION_BIAS 2
#define TEST_FUSION TEST_FUSION_BIAS
#define TEST_FUSION_BIAS_ADD_RELU 3
#define TEST_FUSION TEST_FUSION_BIAS_ADD_RELU
#define TEST_LAYOUT_NHWC_KYXC_NHWK 0
#define TEST_LAYOUT_NHWC_KYXC_NHWK 0
#define TEST_LAYOUT_NHWC_KYXCK8_NHWK 1
#define TEST_LAYOUT_NHWC_KYXCK8_NHWK 1
...
@@ -39,6 +40,7 @@ using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
...
@@ -39,6 +40,7 @@ using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
AddAddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddAddRelu
;
// ------------------ nhwc-kyxc-nhwk
// ------------------ nhwc-kyxc-nhwk
void
add_device_conv2d_fwd_bias_relu_add_avx2_nhwc_kyxc_nhwk
(
void
add_device_conv2d_fwd_bias_relu_add_avx2_nhwc_kyxc_nhwk
(
...
@@ -71,6 +73,18 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_kyxc_nhwk_local_c(
...
@@ -71,6 +73,18 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_kyxc_nhwk_local_c(
void
add_device_conv2d_fwd_bias_avx2_nhwc_kyxc_nhwk_mt
(
void
add_device_conv2d_fwd_bias_avx2_nhwc_kyxc_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
Add
>>&
instances
);
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
Add
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk_local_c
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
// ------------------ nhwc-kcyxk8-nhwk
// ------------------ nhwc-kcyxk8-nhwk
void
add_device_conv2d_fwd_bias_relu_add_avx2_nhwc_kyxck8_nhwk
(
void
add_device_conv2d_fwd_bias_relu_add_avx2_nhwc_kyxck8_nhwk
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddReluAdd
>>&
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddReluAdd
>>&
...
@@ -102,6 +116,18 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_kyxck8_nhwk_local_c(
...
@@ -102,6 +116,18 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_kyxck8_nhwk_local_c(
void
add_device_conv2d_fwd_bias_avx2_nhwc_kyxck8_nhwk_mt
(
void
add_device_conv2d_fwd_bias_avx2_nhwc_kyxck8_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
Add
>>&
instances
);
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
Add
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk_local_c
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
// ------------------ nhwc-yxck-nhwk
// ------------------ nhwc-yxck-nhwk
void
add_device_conv2d_fwd_bias_relu_add_avx2_nhwc_yxck_nhwk
(
void
add_device_conv2d_fwd_bias_relu_add_avx2_nhwc_yxck_nhwk
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddReluAdd
>>&
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddReluAdd
>>&
...
@@ -133,6 +159,18 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_yxck_nhwk_local_c(
...
@@ -133,6 +159,18 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_yxck_nhwk_local_c(
void
add_device_conv2d_fwd_bias_avx2_nhwc_yxck_nhwk_mt
(
void
add_device_conv2d_fwd_bias_avx2_nhwc_yxck_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
Add
>>&
instances
);
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
Add
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk_local_c
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>>&
instances
);
}
// namespace device_conv2d_fwd_bias_activation_add_avx2_instance
}
// namespace device_conv2d_fwd_bias_activation_add_avx2_instance
}
// namespace device
}
// namespace device
}
// namespace cpu
}
// namespace cpu
...
@@ -147,6 +185,8 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::AddReluAdd;
...
@@ -147,6 +185,8 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::AddReluAdd;
using
OutElementOp
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
OutElementOp
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
#elif TEST_FUSION == TEST_FUSION_BIAS
#elif TEST_FUSION == TEST_FUSION_BIAS
using
OutElementOp
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
OutElementOp
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
#elif TEST_FUSION == TEST_FUSION_BIAS_ADD_RELU
using
OutElementOp
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddAddRelu
;
#endif
#endif
template
<
typename
T
>
template
<
typename
T
>
...
@@ -352,7 +392,7 @@ int main(int argc, char* argv[])
...
@@ -352,7 +392,7 @@ int main(int argc, char* argv[])
using
WeiDataType
=
decltype
(
wei_type
);
using
WeiDataType
=
decltype
(
wei_type
);
using
OutDataType
=
decltype
(
out_type
);
using
OutDataType
=
decltype
(
out_type
);
#if
TEST_FUSION == TEST_FUSION_BIAS_RELU_ADD
#if
(
TEST_FUSION == TEST_FUSION_BIAS_RELU_ADD
) || (TEST_FUSION == TEST_FUSION_BIAS_ADD_RELU)
using
ReferenceConvFwdInstance
=
using
ReferenceConvFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd_Bias_Activation_Add
<
InDataType
,
ck
::
tensor_operation
::
host
::
ReferenceConvFwd_Bias_Activation_Add
<
InDataType
,
WeiDataType
,
WeiDataType
,
...
@@ -497,7 +537,7 @@ int main(int argc, char* argv[])
...
@@ -497,7 +537,7 @@ int main(int argc, char* argv[])
wei_k_c_y_x
,
wei_k_c_y_x
,
out_n_k_ho_wo_host_result
,
out_n_k_ho_wo_host_result
,
bias
,
bias
,
#if
TEST_FUSION == TEST_FUSION_BIAS_RELU_ADD
#if
(
TEST_FUSION == TEST_FUSION_BIAS_RELU_ADD
) || (TEST_FUSION == TEST_FUSION_BIAS_ADD_RELU)
residual
,
residual
,
#endif
#endif
conv_filter_strides
,
conv_filter_strides
,
...
@@ -514,6 +554,7 @@ int main(int argc, char* argv[])
...
@@ -514,6 +554,7 @@ int main(int argc, char* argv[])
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
AddAddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddAddRelu
;
#if TEST_FUSION == TEST_FUSION_BIAS_RELU_ADD
#if TEST_FUSION == TEST_FUSION_BIAS_RELU_ADD
using
DeviceConvFwdNoOpPtr
=
ck
::
tensor_operation
::
cpu
::
device
::
using
DeviceConvFwdNoOpPtr
=
ck
::
tensor_operation
::
cpu
::
device
::
...
@@ -524,6 +565,9 @@ int main(int argc, char* argv[])
...
@@ -524,6 +565,9 @@ int main(int argc, char* argv[])
#elif TEST_FUSION == TEST_FUSION_BIAS
#elif TEST_FUSION == TEST_FUSION_BIAS
using
DeviceConvFwdNoOpPtr
=
ck
::
tensor_operation
::
cpu
::
device
::
using
DeviceConvFwdNoOpPtr
=
ck
::
tensor_operation
::
cpu
::
device
::
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
Add
>
;
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
Add
>
;
#elif TEST_FUSION == TEST_FUSION_BIAS_ADD_RELU
using
DeviceConvFwdNoOpPtr
=
ck
::
tensor_operation
::
cpu
::
device
::
DeviceConvFwdBiasActivationAddPtr
<
PassThrough
,
PassThrough
,
AddAddRelu
>
;
#endif
#endif
// add device Conv instances
// add device Conv instances
...
@@ -738,6 +782,76 @@ int main(int argc, char* argv[])
...
@@ -738,6 +782,76 @@ int main(int argc, char* argv[])
add_device_conv2d_fwd_bias_avx2_nhwc_yxck_nhwk_local_c
(
conv_ptrs
);
add_device_conv2d_fwd_bias_avx2_nhwc_yxck_nhwk_local_c
(
conv_ptrs
);
}
}
#endif
#endif
#elif TEST_FUSION == TEST_FUSION_BIAS_ADD_RELU
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXC_NHWK
if
(
omp_get_max_threads
()
>
1
)
{
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk_mt
(
conv_ptrs
);
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk
(
conv_ptrs
);
}
else
{
if
(
K
%
8
==
0
)
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk
(
conv_ptrs
);
else
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk_local_c
(
conv_ptrs
);
}
#endif
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXCK8_NHWK
if
(
omp_get_max_threads
()
>
1
)
{
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk_mt
(
conv_ptrs
);
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk
(
conv_ptrs
);
}
else
{
if
(
K
%
8
==
0
)
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk
(
conv_ptrs
);
else
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk_local_c
(
conv_ptrs
);
}
#endif
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_YXCK_NHWK
if
(
omp_get_max_threads
()
>
1
)
{
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk_mt
(
conv_ptrs
);
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk
(
conv_ptrs
);
}
else
{
if
(
K
%
8
==
0
)
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk
(
conv_ptrs
);
else
ck
::
tensor_operation
::
cpu
::
device
::
device_conv2d_fwd_bias_activation_add_avx2_instance
::
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk_local_c
(
conv_ptrs
);
}
#endif
#endif
#endif
}
}
...
...
include/ck/tensor_operation/cpu/element/element_wise_operation_cpu.hpp
View file @
02684438
...
@@ -172,6 +172,51 @@ struct AddReluAdd
...
@@ -172,6 +172,51 @@ struct AddReluAdd
static
constexpr
char
*
Name
()
{
return
"AddReluAdd"
;
}
static
constexpr
char
*
Name
()
{
return
"AddReluAdd"
;
}
};
};
struct
AddAddRelu
{
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
{
float
a
=
x0
+
x1
+
x2
;
y
=
a
>
0
?
a
:
0
;
}
void
operator
()(
float4_t
&
y
,
const
float4_t
&
x0
,
const
float4_t
&
x1
,
const
float4_t
&
x2
)
const
{
float4_t
a
=
_mm_add_ps
(
x0
,
x1
);
float4_t
b
=
_mm_add_ps
(
a
,
x2
);
y
=
_mm_max_ps
(
b
,
_mm_setzero_ps
());
}
void
operator
()(
float8_t
&
y
,
const
float8_t
&
x0
,
const
float8_t
&
x1
,
const
float8_t
&
x2
)
const
{
float8_t
a
=
_mm256_add_ps
(
x0
,
x1
);
float8_t
b
=
_mm256_add_ps
(
a
,
x2
);
y
=
_mm256_max_ps
(
b
,
_mm256_setzero_ps
());
}
float
Apply
(
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
{
float
a
=
x0
+
x1
+
x2
;
return
a
>
0
?
a
:
0
;
}
float4_t
Apply
(
const
float4_t
&
x0
,
const
float4_t
&
x1
,
const
float4_t
&
x2
)
const
{
float4_t
a
=
_mm_add_ps
(
x0
,
x1
);
float4_t
b
=
_mm_add_ps
(
a
,
x2
);
return
_mm_max_ps
(
b
,
_mm_setzero_ps
());
}
float8_t
Apply
(
const
float8_t
&
x0
,
const
float8_t
&
x1
,
const
float8_t
&
x2
)
const
{
float8_t
a
=
_mm256_add_ps
(
x0
,
x1
);
float8_t
b
=
_mm256_add_ps
(
a
,
x2
);
return
_mm256_max_ps
(
b
,
_mm256_setzero_ps
());
}
static
constexpr
char
*
Name
()
{
return
"AddAddRelu"
;
}
};
// Unary operators are usually called element-wisely before/after the reduction is executed on the
// Unary operators are usually called element-wisely before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
...
...
include/ck/tensor_operation/cpu/grid/gridwise_gemm_bias_activation_add_avx2.hpp
View file @
02684438
...
@@ -623,6 +623,10 @@ struct GridwiseGemmBiasActivationAddAvx2_MxN
...
@@ -623,6 +623,10 @@ struct GridwiseGemmBiasActivationAddAvx2_MxN
c_threadwise_copy
.
SetSrc2SliceOrigin
(
c_block_desc
,
c_threadwise_copy
.
SetSrc2SliceOrigin
(
c_block_desc
,
GetCIndex
(
i_mc
,
i_nc
));
GetCIndex
(
i_mc
,
i_nc
));
_mm_prefetch
(
reinterpret_cast
<
const
float
*>
(
c1_grid_buf
.
p_data_
)
+
c_threadwise_copy
.
src2_offset
,
_MM_HINT_T1
);
if
constexpr
(
!
UseCLocalBuffer
)
if
constexpr
(
!
UseCLocalBuffer
)
{
{
c_threadwise_copy
.
SetSrcSliceOrigin
(
c_block_desc
,
c_threadwise_copy
.
SetSrcSliceOrigin
(
c_block_desc
,
...
...
include/ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp
View file @
02684438
...
@@ -2403,7 +2403,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
...
@@ -2403,7 +2403,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
void
MoveDstSliceWindow
(
const
DstDesc
&
,
const
Index
&
)
{}
void
MoveDstSliceWindow
(
const
DstDesc
&
,
const
Index
&
)
{}
private:
//
private:
const
ElementwiseOperation
element_op_
;
const
ElementwiseOperation
element_op_
;
intptr_t
i_dst_gemm_m
;
intptr_t
i_dst_gemm_m
;
...
...
library/src/tensor_operation_instance/cpu/conv2d_fwd_bias_activation_add/device_conv2d_bias_activation_add_avx2_nhwc_kyxc_nhwk_instance.cpp
View file @
02684438
...
@@ -25,6 +25,7 @@ using PT = ck::tensor_operation::cpu::element_wise::PassThrough;
...
@@ -25,6 +25,7 @@ using PT = ck::tensor_operation::cpu::element_wise::PassThrough;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
AddAddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddAddRelu
;
static
constexpr
auto
ConvFwdDefault
=
static
constexpr
auto
ConvFwdDefault
=
ck
::
tensor_operation
::
cpu
::
device
::
ConvolutionForwardSpecialization_t
::
Default
;
ck
::
tensor_operation
::
cpu
::
device
::
ConvolutionForwardSpecialization_t
::
Default
;
...
@@ -299,6 +300,86 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_kyxc_nhwk_mt(
...
@@ -299,6 +300,86 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_kyxc_nhwk_mt(
));
));
}
}
/****************************************************************************************************/
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
64
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
false
,
true
,
true
,
false
)
// clang-format on
));
}
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk_local_c
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
64
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
true
,
true
,
true
,
false
)
// clang-format on
));
}
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxc_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
24
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
32
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
40
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
48
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
48
,
48
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
56
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
16
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
16
,
256
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
32
,
256
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
96
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
96
,
64
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
120
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
120
,
64
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXC_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
true
,
true
,
true
,
false
)
// clang-format on
));
}
}
// namespace device_conv2d_fwd_bias_activation_add_avx2_instance
}
// namespace device_conv2d_fwd_bias_activation_add_avx2_instance
}
// namespace device
}
// namespace device
}
// namespace cpu
}
// namespace cpu
...
...
library/src/tensor_operation_instance/cpu/conv2d_fwd_bias_activation_add/device_conv2d_bias_activation_add_avx2_nhwc_kyxck8_nhwk_instance.cpp
View file @
02684438
...
@@ -25,6 +25,7 @@ using PT = ck::tensor_operation::cpu::element_wise::PassThrough;
...
@@ -25,6 +25,7 @@ using PT = ck::tensor_operation::cpu::element_wise::PassThrough;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
AddAddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddAddRelu
;
static
constexpr
auto
ConvFwdDefault
=
static
constexpr
auto
ConvFwdDefault
=
ck
::
tensor_operation
::
cpu
::
device
::
ConvolutionForwardSpecialization_t
::
Default
;
ck
::
tensor_operation
::
cpu
::
device
::
ConvolutionForwardSpecialization_t
::
Default
;
...
@@ -299,6 +300,86 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_kyxck8_nhwk_mt(
...
@@ -299,6 +300,86 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_kyxck8_nhwk_mt(
));
));
}
}
/****************************************************************************************************/
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
64
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
false
,
true
,
true
,
false
)
// clang-format on
));
}
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk_local_c
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
64
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
true
,
true
,
true
,
false
)
// clang-format on
));
}
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_kyxck8_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
24
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
32
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
40
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
48
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
48
,
48
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
56
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
16
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
16
,
256
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
32
,
256
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
96
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
96
,
64
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
120
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
120
,
64
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_KYXCK8_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
true
,
true
,
true
,
false
)
// clang-format on
));
}
}
// namespace device_conv2d_fwd_bias_activation_add_avx2_instance
}
// namespace device_conv2d_fwd_bias_activation_add_avx2_instance
}
// namespace device
}
// namespace device
}
// namespace cpu
}
// namespace cpu
...
...
library/src/tensor_operation_instance/cpu/conv2d_fwd_bias_activation_add/device_conv2d_bias_activation_add_avx2_nhwc_yxck_nhwk_instance.cpp
View file @
02684438
...
@@ -24,6 +24,7 @@ using PT = ck::tensor_operation::cpu::element_wise::PassThrough;
...
@@ -24,6 +24,7 @@ using PT = ck::tensor_operation::cpu::element_wise::PassThrough;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddReluAdd
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddReluAdd
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
AddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddRelu
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
Add
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
Add
;
using
AddAddRelu
=
ck
::
tensor_operation
::
cpu
::
element_wise
::
AddAddRelu
;
static
constexpr
auto
ConvFwdDefault
=
static
constexpr
auto
ConvFwdDefault
=
ck
::
tensor_operation
::
cpu
::
device
::
ConvolutionForwardSpecialization_t
::
Default
;
ck
::
tensor_operation
::
cpu
::
device
::
ConvolutionForwardSpecialization_t
::
Default
;
...
@@ -298,6 +299,86 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_yxck_nhwk_mt(
...
@@ -298,6 +299,86 @@ void add_device_conv2d_fwd_bias_avx2_nhwc_yxck_nhwk_mt(
));
));
}
}
/****************************************************************************************************/
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
64
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
false
,
true
,
true
,
false
)
// clang-format on
));
}
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk_local_c
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
64
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
true
,
true
,
true
,
false
)
// clang-format on
));
}
void
add_device_conv2d_fwd_bias_add_relu_avx2_nhwc_yxck_nhwk_mt
(
std
::
vector
<
DeviceConvFwdBiasActivationAddPtr
<
PT
,
PT
,
AddAddRelu
>>&
instances
)
{
ck
::
tensor_operation
::
device
::
instance
::
add_device_operation_instances
(
instances
,
std
::
make_tuple
(
// clang-format off
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
24
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
32
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
40
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
48
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
48
,
48
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
56
,
24
,
256
,
4
,
24
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
16
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
16
,
256
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
72
,
32
,
256
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
96
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
96
,
64
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
120
,
32
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
120
,
64
,
128
,
6
,
16
,
false
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
256
,
128
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
128
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
240
,
128
,
4
,
24
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
512
,
256
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
768
,
320
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
896
,
352
,
128
,
6
,
16
,
true
,
true
,
true
,
false
),
DEVICE_CONV2D_FWD_BAA_AVX2_NHWC_YXCK_NHWK_F32
(
PT
,
PT
,
AddAddRelu
,
1024
,
416
,
128
,
6
,
16
,
true
,
true
,
true
,
false
)
// clang-format on
));
}
}
// namespace device_conv2d_fwd_bias_activation_add_avx2_instance
}
// namespace device_conv2d_fwd_bias_activation_add_avx2_instance
}
// namespace device
}
// namespace device
}
// namespace cpu
}
// namespace cpu
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment