gaoqiong / composable_kernel

Commit f29a5350, authored Jun 20, 2022 by carlushuang

    add direct-conv first version

Parent: 19a6cc89

Showing 6 changed files with 1712 additions and 3 deletions (+1712 -3)
  example/cpu_01_conv2d_fwd/cpu_conv2d_fwd.cpp                                                                      +9    -1
  example/cpu_02_conv2d_fwd_bias_relu_add/cpu_conv2d_fwd_bias_relu_add.cpp                                          +3    -1
  include/ck/tensor_operation/cpu/device/device_convnd_direct_fwd_avx2_nhwc_kyxck8_nhwk.hpp                         +949  -0
  include/ck/tensor_operation/cpu/grid/gridwise_direct_conv_avx2.hpp                                                +676  -0
  include/ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp                                                   +1    -1
  library/src/tensor_operation_instance/cpu/conv2d_fwd/device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk_instance.cpp  +74   -0
example/cpu_01_conv2d_fwd/cpu_conv2d_fwd.cpp

@@ -10,6 +10,7 @@
 #include "reference_conv_fwd.hpp"
 #include "element_wise_operation_cpu.hpp"
 #include "dynamic_buffer_cpu.hpp"
+#include "envvar.hpp"
 #include <omp.h>

 #define AVX2_DATA_ALIGNMENT 32

@@ -92,6 +93,10 @@ void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_local_c_relu(
 void add_device_conv2d_fwd_avx2_nhwc_yxck_nhwk_mt_relu(
     std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);

+// ------------------ direct-conv nhwc-kcyxk8-nhwk
+void add_device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+
 } // namespace device_conv2d_fwd_avx2_instance
 } // namespace device
 } // namespace cpu

@@ -501,6 +506,8 @@ int main(int argc, char* argv[])
         ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
             add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c(conv_ptrs);
     }
+    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
+        add_device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk(conv_ptrs);
 #endif
 #if TEST_FUSION == TEST_FUSION_RELU
     if(omp_get_max_threads() > 1)

@@ -571,6 +578,7 @@ int main(int argc, char* argv[])
     double fastest_kernel_time      = std::numeric_limits<double>::max();
     std::string fastest_kernel_name = "";
     double fastest_kernel_gflops    = 0;
+    int loop = ck::getenv_int("CK_LOOP", 10);
     for(auto& conv_ptr : conv_ptrs)
     {
         auto argument_ptr = conv_ptr->MakeArgumentPointer(

@@ -594,7 +602,7 @@ int main(int argc, char* argv[])
         if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             auto invoker_ptr = conv_ptr->MakeInvokerPointer();
-            double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, 10);
+            double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, loop);

             double total_flop = static_cast<double>(2) * N * C * Ho * Wo * K * Y * X;
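Note on the new envvar.hpp dependency: both examples now read the benchmark repeat count from the CK_LOOP environment variable via ck::getenv_int, falling back to 10. The helper's implementation is not part of this diff; below is only a minimal sketch of what such a helper could look like, assuming it wraps std::getenv with integer parsing (the ck_sketch namespace is hypothetical and is not the CK API).

    // Hypothetical sketch of a getenv_int-style helper (the real envvar.hpp may differ).
    #include <cstdlib>
    #include <string>

    namespace ck_sketch {

    // Return the integer value of environment variable `name`,
    // or `default_value` when it is unset or not a valid integer.
    inline int getenv_int(const char* name, int default_value)
    {
        const char* v = std::getenv(name);
        if(v == nullptr)
            return default_value;
        try
        {
            return std::stoi(std::string(v));
        }
        catch(...)
        {
            return default_value;
        }
    }

    } // namespace ck_sketch

    int main()
    {
        // Mirrors the example code: running with CK_LOOP=50 would set loop = 50.
        int loop = ck_sketch::getenv_int("CK_LOOP", 10);
        return loop > 0 ? 0 : 1;
    }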
example/cpu_02_conv2d_fwd_bias_relu_add/cpu_conv2d_fwd_bias_relu_add.cpp

@@ -11,6 +11,7 @@
 #include "reference_conv_fwd_bias_activation.hpp"
 #include "element_wise_operation_cpu.hpp"
 #include "dynamic_buffer_cpu.hpp"
+#include "envvar.hpp"
 #include <omp.h>

 #define AVX2_DATA_ALIGNMENT 32

@@ -713,6 +714,7 @@ int main(int argc, char* argv[])
     double fastest_kernel_time      = std::numeric_limits<double>::max();
     std::string fastest_kernel_name = "";
     double fastest_kernel_gflops    = 0;
+    int loop = ck::getenv_int("CK_LOOP", 10);
     for(auto& conv_ptr : conv_ptrs)
     {
         auto argument_ptr = conv_ptr->MakeArgumentPointer(

@@ -738,7 +740,7 @@ int main(int argc, char* argv[])
         if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             auto invoker_ptr = conv_ptr->MakeInvokerPointer();
-            double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, 10);
+            double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, loop);

             double total_flop = static_cast<double>(2) * N * C * Ho * Wo * K * Y * X;
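Both benchmark loops compute total_flop = 2 * N * C * Ho * Wo * K * Y * X, i.e. one multiply and one add per MAC of the forward convolution. A small standalone check of that arithmetic follows; the GFLOPS conversion assumes Run() reports time in milliseconds, which this diff does not show, and the layer shape is an arbitrary example.

    // Standalone check of the FLOP count used by the examples.
    // Assumption (not shown in the diff): Run() reports elapsed time in milliseconds.
    #include <cstdio>

    int main()
    {
        // Hypothetical layer: N=128, C=192, K=256, 3x3 filter, 28x28 output.
        const double N = 128, C = 192, K = 256, Y = 3, X = 3, Ho = 28, Wo = 28;

        const double total_flop = 2.0 * N * C * Ho * Wo * K * Y * X; // ~8.88e10 FLOP
        const double time_ms    = 444.0;                             // example measurement
        const double gflops     = total_flop / (time_ms * 1.0e6);    // ~200 GFLOP/s

        std::printf("total_flop = %.3e, gflops = %.1f\n", total_flop, gflops);
        return 0;
    }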
include/ck/tensor_operation/cpu/device/device_convnd_direct_fwd_avx2_nhwc_kyxck8_nhwk.hpp

new file (0 → 100644), +949 lines; diff collapsed in this view.
include/ck/tensor_operation/cpu/grid/gridwise_direct_conv_avx2.hpp

new file (0 → 100644), +676 lines; diff collapsed in this view.
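The two collapsed headers hold the new direct-convolution device op and its AVX2 gridwise kernel. Based only on the naming (NHWC input, KYXCK8 weight, NHWK output), the weight tensor appears to be pre-packed with K split into blocks of 8, matching the 8 float lanes of an AVX2 register. The sketch below is purely a hypothetical illustration of such a re-pack inferred from the layout name; it is not taken from the collapsed files and the real packing may differ.

    // Hypothetical KYXCK8 weight re-pack (assumption from the layout name only).
    #include <cstddef>
    #include <vector>

    // src: KYXC layout, dimensions [K][Y][X][C]
    // dst: KYXCK8 layout, dimensions [K/8][Y][X][C][8], K assumed divisible by 8
    inline void pack_kyxc_to_kyxck8(const std::vector<float>& src,
                                    std::vector<float>& dst,
                                    std::size_t K, std::size_t Y, std::size_t X, std::size_t C)
    {
        dst.resize(K * Y * X * C);
        for(std::size_t k = 0; k < K; ++k)
            for(std::size_t y = 0; y < Y; ++y)
                for(std::size_t x = 0; x < X; ++x)
                    for(std::size_t c = 0; c < C; ++c)
                    {
                        const std::size_t src_idx = ((k * Y + y) * X + x) * C + c;
                        const std::size_t k0 = k / 8, k8 = k % 8;
                        const std::size_t dst_idx = (((k0 * Y + y) * X + x) * C + c) * 8 + k8;
                        dst[dst_idx] = src[src_idx];
                    }
    }

    int main()
    {
        const std::size_t K = 16, Y = 3, X = 3, C = 8;
        std::vector<float> src(K * Y * X * C, 1.0f), dst;
        pack_kyxc_to_kyxck8(src, dst, K, Y, X, C);
        return dst.size() == src.size() ? 0 : 1;
    }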
include/ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp

@@ -855,7 +855,7 @@ struct ThreadwiseGemmAvx2_MxN_4x24
         ".if m_TransA != 0\n"
         "    lea m_ABytes(%%rax), %%rax\n"
-        ".if m_Mr > 3\n lea m_ABytes(%%r8), %%r8\n .endif\n"
+        ".if m_Mr > 2\n lea m_ABytes(%%r8), %%r8\n .endif\n"
         ".else\n"
         "    lea (%%rax, %%rcx, 1), %%rax\n"
         "    lea (%%r8, %%rcx, 1), %%r8\n"
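The touched line sits in the A-pointer update of the 4x24 AVX2 micro-kernel's inline assembly: when A is transposed, the row pointers in %%rax and %%r8 are advanced by m_ABytes, guarded by assembler .if directives on the row count m_Mr. As a rough C-level picture of that guard logic (the names m_Mr, m_ABytes, and the two-pointer scheme come from the snippet above; everything else, including the pairing of p_a0/p_a1 with %%rax/%%r8, is an assumption about the kernel's structure):

    // Rough C-level sketch of the guarded pointer advance shown in the asm snippet.
    // Assumption: p_a0 corresponds to %%rax, p_a1 to %%r8, lda_bytes to %%rcx.
    template <int Mr, bool TransA>
    inline void advance_a_pointers(const char*& p_a0, const char*& p_a1,
                                   long a_bytes /*m_ABytes*/, long lda_bytes /*%%rcx*/)
    {
        if(TransA)
        {
            p_a0 += a_bytes;     // lea m_ABytes(%%rax), %%rax
            if(Mr > 2)           // the guard changed by this commit (was Mr > 3)
                p_a1 += a_bytes; // lea m_ABytes(%%r8), %%r8
        }
        else
        {
            p_a0 += lda_bytes;   // lea (%%rax, %%rcx, 1), %%rax
            p_a1 += lda_bytes;   // lea (%%r8, %%rcx, 1), %%r8
        }
    }

    int main()
    {
        const char buf[64] = {};
        const char* a0 = buf;
        const char* a1 = buf;
        advance_a_pointers<4, true>(a0, a1, 16, 8); // both pointers advance by 16 bytes
        return (a0 == buf + 16 && a1 == buf + 16) ? 0 : 1;
    }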
library/src/tensor_operation_instance/cpu/conv2d_fwd/device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk_instance.cpp

new file (0 → 100644), +74 lines:
#include <stdlib.h>
#include <utility>
#include "config.hpp"
#include "convolution_forward_specialization_cpu.hpp"
#include "device_convnd_direct_fwd_avx2_nhwc_kyxck8_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {

using InType  = float;
using WeiType = float;
using OutType = float;
using AccType = float;

using InLayout  = ck::tensor_layout::gemm::RowMajor;    // NHWC
using WeiLayout = ck::tensor_layout::gemm::ColumnMajor; // KYXCK8

static constexpr bool NonTemporalStore = false;

using PT   = ck::tensor_operation::cpu::element_wise::PassThrough;
using Relu = ck::tensor_operation::cpu::element_wise::Relu;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Default;

static constexpr auto ConvFwd1x1P0 =
    ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;

static constexpr auto ConvFwd1x1S1P0 =
    ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;

static constexpr auto DefaultGemmKLoop =
    ck::tensor_operation::cpu::device::ConvolutionForwardGemmKSpecialization_t::DefaultGemmKLoop;

static constexpr auto GemmKLoopOverC =
    ck::tensor_operation::cpu::device::ConvolutionForwardGemmKSpecialization_t::NHWC_GemmKLoopOverC;

static constexpr auto LoopOver_MNK = ck::tensor_operation::cpu::device::LoopOver_MNK;
static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver_MKN;
// clang-format off
#define DEVICE_CONV2D_FWD_AVX2_NHWC_KYXCK8_NHWK_F32(a_elem_op, b_elem_op, c_elem_op, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, c_local_buf) \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MNK}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MNK}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, DefaultGemmKLoop, LoopOver_MNK}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, 2, m_per_thread, n_per_thread, false, false, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MNK}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, false, c_local_buf>({m_per_block, n_per_block, k_per_block, DefaultGemmKLoop, LoopOver_MNK}), \
\
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MKN}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MKN}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, true, c_local_buf>({m_per_block, n_per_block, k_per_block, DefaultGemmKLoop, LoopOver_MKN}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, 2, m_per_thread, n_per_thread, false, false, c_local_buf>({m_per_block, n_per_block, k_per_block, GemmKLoopOverC , LoopOver_MKN}), \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, 2, m_per_thread, n_per_thread, true, false, c_local_buf>({m_per_block, n_per_block, k_per_block, DefaultGemmKLoop, LoopOver_MKN})
// clang-format on
void add_device_conv2d_direct_fwd_avx2_nhwc_kyxck8_nhwk(
    std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
    ck::tensor_operation::device::add_device_operation_instances(
        instances,
        std::make_tuple(
            // clang-format off
            DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K<
                float, float, float, PT, PT, PT, ConvFwdDefault, 2, 4, 24, false, false, false>(
                {0, 0, 0, DefaultGemmKLoop, LoopOver_MKN})
            // clang-format on
            ));
}

} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
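The registration helper above follows the usual composable_kernel pattern: each tuple element is a concrete instance object, and add_device_operation_instances copies them into the caller's vector of DeviceConvFwdPtr, which the examples then enumerate and benchmark. The toy program below only illustrates that tuple-expansion pattern; the names ToyOp, ToyInstance, OpPtr, and add_toy_operation_instances are made up for the sketch and are not the CK API.

    // Toy illustration of tuple-based instance registration (hypothetical names).
    #include <memory>
    #include <tuple>
    #include <utility>
    #include <vector>

    struct ToyOp
    {
        virtual ~ToyOp()                 = default;
        virtual const char* Name() const = 0;
    };
    using OpPtr = std::unique_ptr<ToyOp>;

    struct ToyInstance : ToyOp
    {
        const char* Name() const override { return "toy_direct_conv_4x24"; }
    };

    // Copy every tuple element into the caller's instance vector, mirroring
    // add_device_operation_instances(instances, std::make_tuple(...)).
    template <typename... Ops>
    void add_toy_operation_instances(std::vector<OpPtr>& instances, std::tuple<Ops...> ops)
    {
        std::apply(
            [&](auto&&... op) {
                (instances.push_back(std::make_unique<std::decay_t<decltype(op)>>(
                     std::forward<decltype(op)>(op))),
                 ...);
            },
            std::move(ops));
    }

    int main()
    {
        std::vector<OpPtr> instances;
        add_toy_operation_instances(instances, std::make_tuple(ToyInstance{}));
        // instances now holds one entry, analogous to the single 4x24 direct-conv instance above.
        return instances.empty() ? 1 : 0;
    }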