gaoqiong / composable_kernel

Commit 505194d7, authored May 18, 2022 by carlushuang

rename to example

parent ffb13372
Showing 7 changed files with 578 additions and 844 deletions.
example/CMakeLists.txt (+7, -0)
example/cpu_01_conv2d_fwd/CMakeLists.txt (+6, -0)
example/cpu_01_conv2d_fwd/cpu_conv2d_fwd.cpp (+565, -565)
test/CMakeLists.txt (+0, -2)
test/convnd_fwd_cpu/CMakeLists.txt (+0, -7)
test/cpu_threadwise_transfer/CMakeLists.txt (+0, -6)
test/cpu_threadwise_transfer/cpu_threadwise_transfer.cpp (+0, -264)
example/CMakeLists.txt

@@ -10,6 +10,11 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
     ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
     ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/device
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/grid
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/block
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/thread
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/element
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu

@@ -51,3 +56,5 @@ add_subdirectory(17_convnd_bwd_data_xdl)
 add_subdirectory(15_grouped_gemm)
 add_subdirectory(16_gemm_reduce)
 add_subdirectory(18_batched_gemm_reduce)
+add_subdirectory(cpu_01_conv2d_fwd)
example/cpu_01_conv2d_fwd/CMakeLists.txt (new file, mode 100644)

add_example_executable(example_cpu_conv2d_fwd cpu_conv2d_fwd.cpp)
target_link_libraries(example_cpu_conv2d_fwd PRIVATE device_conv2d_fwd_cpu_instance)
set_target_properties(example_cpu_conv2d_fwd PROPERTIES LINK_FLAGS "${OMP_LINK_FLAG}")
target_link_libraries(example_cpu_conv2d_fwd PRIVATE "${OMP_LIBRARY}")
target_compile_options(example_cpu_conv2d_fwd PRIVATE "${OMP_CXX_FLAG}")
test/convnd_fwd_cpu/conv2d_fwd_cpu.cpp → example/cpu_01_conv2d_fwd/cpu_conv2d_fwd.cpp (renamed)
#include <sstream>
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp"
#include "element_wise_operation_cpu.hpp"
#include "dynamic_buffer_cpu.hpp"
#include <omp.h>

#define AVX2_DATA_ALIGNMENT 32

#define TEST_FUSION_PASSTHROUGH 0
#define TEST_FUSION_RELU 1
#define TEST_FUSION TEST_FUSION_PASSTHROUGH

#define TEST_LAYOUT_NHWC_KYXC_NHWK 0
#define TEST_LAYOUT_NHWC_KYXCK8_NHWK 1
#define TEST_LAYOUT TEST_LAYOUT_NHWC_KYXCK8_NHWK

using F32 = float;
using F16 = ck::half_t;

namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {

using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using Relu        = ck::tensor_operation::cpu::element_wise::Relu;

void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_relu(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c_relu(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt_relu(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_relu(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c_relu(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);

void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt_relu(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);

} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck

using InElementOp  = ck::tensor_operation::cpu::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;

#if TEST_FUSION == TEST_FUSION_PASSTHROUGH
using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
#endif
#if TEST_FUSION == TEST_FUSION_RELU
using OutElementOp = ck::tensor_operation::cpu::element_wise::Relu;
#endif
template <typename T>
static bool
check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
{
    int error_count          = 0;
    float max_diff           = 1e-5;
    double square_difference = .0;
    double mag1              = .0;
    double mag2              = .0;
    for(int i = 0; i < ref.mData.size(); ++i)
    {
        double ri = (double)ref.mData[i];
        double pi = (double)result.mData[i];
        double d  = ri - pi;
        if(per_pixel_check)
        {
            if(max_diff < std::abs(d))
            {
                error_count++;
                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
                       i,
                       double(ref.mData[i]),
                       double(result.mData[i]),
                       d);
            }
        }
        square_difference += d * d;
        if(std::abs(mag1) < std::abs(ri))
            mag1 = ri;
        if(std::abs(mag2) < std::abs(pi))
            mag2 = pi;
    }
    double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
    if(computed_nrms >= nrms)
        printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %lf\n",
               computed_nrms,
               mag1,
               mag2,
               nrms);
    return computed_nrms < nrms && error_count == 0;
}
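
// check_out above compares against the host reference with a normalized RMS metric:
//   computed_nrms = sqrt(sum_i (ref_i - res_i)^2) / (sqrt(n) * mag),
// where mag is the largest absolute value seen in either tensor (clamped away from
// zero), so the tolerance scales with the data's magnitude. The optional per-pixel
// check additionally flags any single element that differs by more than 1e-5.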

float calculate_gflops() {}

template <typename T>
void transpose_kyxc_2_kyxc8k(Tensor<T>& dst,
                             const Tensor<T>& src,
                             ck::index_t K,
                             ck::index_t Y,
                             ck::index_t X,
                             ck::index_t C)
{
    ck::index_t batch = K / 8;
    ck::index_t row   = 8;
    ck::index_t col   = C * Y * X;
    for(auto i_b = 0; i_b < batch; i_b++)
    {
        for(auto i_r = 0; i_r < row; i_r++)
        {
            for(auto i_c = 0; i_c < col; i_c++)
            {
                ck::index_t src_idx = i_b * row * col + i_r * col + i_c;
                ck::index_t dst_idx = i_b * col * row + i_c * row + i_r;
                dst.mData[dst_idx]  = src.mData[src_idx];
            }
        }
    }
}
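
// transpose_kyxc_2_kyxc8k repacks KYXC weights into the KYXCK8 layout: K is split
// into K/8 groups of 8 output channels, and each group's 8 x (C*Y*X) block is
// transposed to (C*Y*X) x 8, so that 8 consecutive K values end up contiguous in
// memory for one 8-wide fp32 AVX2 vector load. K is assumed to be a multiple of 8
// in this helper.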

int main(int argc, char* argv[])
{
    int data_type   = 0;
    int init_method = 0;

    // Conv shape
    ck::index_t N               = 2;
    ck::index_t K               = 256;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t conv_stride_h   = 1;
    ck::index_t conv_stride_w   = 1;
    ck::index_t conv_dilation_h = 1;
    ck::index_t conv_dilation_w = 1;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 1)
    {
        data_type   = 0;
        init_method = 1;
    }
    else if(argc == 3)
    {
        data_type   = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
    }
    else if(argc == 18)
    {
        data_type       = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        N               = std::stoi(argv[3]);
        K               = std::stoi(argv[4]);
        C               = std::stoi(argv[5]);
        Y               = std::stoi(argv[6]);
        X               = std::stoi(argv[7]);
        Hi              = std::stoi(argv[8]);
        Wi              = std::stoi(argv[9]);
        conv_stride_h   = std::stoi(argv[10]);
        conv_stride_w   = std::stoi(argv[11]);
        conv_dilation_h = std::stoi(argv[12]);
        conv_dilation_w = std::stoi(argv[13]);
        in_left_pad_h   = std::stoi(argv[14]);
        in_left_pad_w   = std::stoi(argv[15]);
        in_right_pad_h  = std::stoi(argv[16]);
        in_right_pad_w  = std::stoi(argv[17]);
    }
    else
    {
        printf("arg1: data type (0=fp32, 1=fp16)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
    }
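
    // Example invocation with the full 17-argument form (fp32 data, integer init,
    // then N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx), e.g.:
    //   ./example_cpu_conv2d_fwd 0 1 2 256 192 3 3 71 71 1 1 1 1 1 1 1 1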

    auto Run = [&](auto input_type, auto wei_type, auto out_type) {
        using InDataType  = decltype(input_type);
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);

        using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                                      WeiDataType,
                                                                                      OutDataType,
                                                                                      InElementOp,
                                                                                      WeiElementOp,
                                                                                      OutElementOp>;

        const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
        const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;

        const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
        const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
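
        // YEff/XEff are the dilated filter extents, so the output size follows the
        // usual convolution formula Ho = (Hi + LPad + RPad - YEff) / Sy + 1; the
        // default 71x71 input, 3x3 filter, pad 1, stride 1, dilation 1 yields
        // Ho = Wo = 71.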

        const std::vector<ck::index_t> input_spatial_lengths{{Hi, Wi}};
        const std::vector<ck::index_t> filter_spatial_lengths{{Y, X}};
        const std::vector<ck::index_t> output_spatial_lengths{{Ho, Wo}};
        const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
        const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
        const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
        const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};

        auto f_host_tensor_descriptor =
            [](std::size_t N_, std::size_t C_, std::size_t H_, std::size_t W_) {
                return HostTensorDescriptor(
                    std::vector<std::size_t>({N_, C_, H_, W_}),
                    std::vector<std::size_t>({C_ * H_ * W_, 1, W_ * C_, C_}));
            };
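
        // The descriptor keeps logical NCHW dimension order, but the strides
        // (C*H*W, 1, W*C, C) place the data NHWC in memory, matching the layout
        // the nhwc AVX2 instances consume.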

        Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
        Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXCK8_NHWK
        Tensor<WeiDataType> wei_k_c_y_x_k8(
            f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
#endif
        Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
        Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));

        std::cout << "in (N, C, Hi, Wi): " << in_n_c_hi_wi.mDesc << std::endl;
        std::cout << "wei(K, C, Y, X): " << wei_k_c_y_x.mDesc << std::endl;
        std::cout << "out(N, K, Ho, Wo): " << out_n_k_ho_wo_host_result.mDesc << std::endl;
        std::cout << "LPad(H, W):" << in_left_pad_h << "," << in_left_pad_w
                  << ", RPad(H, W):" << in_right_pad_h << "," << in_right_pad_w
                  << ", Stride(H, W):" << conv_stride_h << ", " << conv_stride_w
                  << ", Dilation(H, W):" << conv_dilation_h << ", " << conv_dilation_w
                  << ", Threads:" << omp_get_max_threads() << std::endl;

        int per_pixel_check = 0;

        switch(init_method)
        {
        case 0:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
            per_pixel_check = 1;
            break;
        case 1:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
            // in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
            // wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
            per_pixel_check = 1;
            break;
        case 2:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
            break;
        case 3:
#define PACK_32(v24, v16, v8, v0) \
    (((v24 & 0xff) << 24) | ((v16 & 0xff) << 16) | ((v8 & 0xff) << 8) | ((v0 & 0xff) << 0))
            for(auto i_n = 0; i_n < N; i_n++)
            {
                for(auto i_c = 0; i_c < C; i_c++)
                {
                    for(auto i_hi = 0; i_hi < Hi; i_hi++)
                    {
                        for(auto i_wi = 0; i_wi < Wi; i_wi++)
                        {
                            uint32_t v = PACK_32(i_n, i_c, i_hi, i_wi);
                            in_n_c_hi_wi(i_n, i_c, i_hi, i_wi) = *reinterpret_cast<float*>(&v);
                        }
                    }
                }
            }
            for(auto i_k = 0; i_k < K; i_k++)
            {
                for(auto i_c = 0; i_c < C; i_c++)
                {
                    for(auto i_y = 0; i_y < Y; i_y++)
                    {
                        for(auto i_x = 0; i_x < X; i_x++)
                        {
                            uint32_t v = PACK_32(i_k, i_c, i_y, i_x);
                            wei_k_c_y_x(i_k, i_c, i_y, i_x) = *reinterpret_cast<float*>(&v);
                        }
                    }
                }
            }
            break;
        default:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1});
        }

        DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace(),
                                          AVX2_DATA_ALIGNMENT);
        DeviceAlignedMemCPU wei_device_buf(
            sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace(), AVX2_DATA_ALIGNMENT);
        DeviceAlignedMemCPU out_device_buf(
            sizeof(OutDataType) * out_n_k_ho_wo_host_result.mDesc.GetElementSpace(),
            AVX2_DATA_ALIGNMENT);

        in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXC_NHWK
        wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
#endif
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXCK8_NHWK
        transpose_kyxc_2_kyxc8k(wei_k_c_y_x_k8, wei_k_c_y_x, K, Y, X, C);
        wei_device_buf.ToDevice(wei_k_c_y_x_k8.mData.data());
#endif

        // get host result
        {
            auto ref_conv     = ReferenceConvFwdInstance{};
            auto ref_invoker  = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
                                                      wei_k_c_y_x,
                                                      out_n_k_ho_wo_host_result,
                                                      conv_filter_strides,
                                                      conv_filter_dilations,
                                                      input_left_pads,
                                                      input_right_pads,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});
            ref_invoker.Run(ref_argument);
        }

        using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
        using Relu        = ck::tensor_operation::cpu::element_wise::Relu;

#if TEST_FUSION == TEST_FUSION_PASSTHROUGH
        using DeviceConvFwdNoOpPtr = ck::tensor_operation::cpu::device::
            DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
#endif
#if TEST_FUSION == TEST_FUSION_RELU
        using DeviceConvFwdNoOpPtr =
            ck::tensor_operation::cpu::device::DeviceConvFwdPtr<PassThrough, PassThrough, Relu>;
#endif

        // add device Conv instances
        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;

        if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
                     ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
                     ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
        {
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXC_NHWK
#if TEST_FUSION == TEST_FUSION_PASSTHROUGH
            if(omp_get_max_threads() > 1)
            {
                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                    add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(conv_ptrs);
                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                    add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
            }
            else
            {
                if(K % 8 == 0)
                    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                        add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
                else
                    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                        add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(conv_ptrs);
            }
#endif
#if TEST_FUSION == TEST_FUSION_RELU
            if(omp_get_max_threads() > 1)
            {
                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                    add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt_relu(conv_ptrs);
                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                    add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_relu(conv_ptrs);
            }
            else
            {
                if(K % 8 == 0)
                    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                        add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_relu(conv_ptrs);
                else
                    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                        add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c_relu(conv_ptrs);
            }
#endif
#endif
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXCK8_NHWK
#if TEST_FUSION == TEST_FUSION_PASSTHROUGH
            if(omp_get_max_threads() > 1)
            {
                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                    add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt(conv_ptrs);
                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                    add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk(conv_ptrs);
            }
            else
            {
                if(K % 8 == 0)
                    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                        add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk(conv_ptrs);
                else
                    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                        add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c(conv_ptrs);
            }
#endif
#if TEST_FUSION == TEST_FUSION_RELU
            if(omp_get_max_threads() > 1)
            {
                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                    add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt_relu(conv_ptrs);
                ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                    add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_relu(conv_ptrs);
            }
            else
            {
                if(K % 8 == 0)
                    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                        add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_relu(conv_ptrs);
                else
                    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
                        add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c_relu(conv_ptrs);
            }
#endif
#endif
        }

        if(conv_ptrs.size() <= 0)
        {
            throw std::runtime_error("wrong! no device Conv instance found");
        }

        // profile device Conv instances
        bool success = true;

        double fastest_kernel_time      = std::numeric_limits<double>::max();
        std::string fastest_kernel_name = "";
        double fastest_kernel_gflops    = 0;

        for(auto& conv_ptr : conv_ptrs)
        {
            auto argument_ptr = conv_ptr->MakeArgumentPointer(
                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                N,
                K,
                C,
                input_spatial_lengths,
                filter_spatial_lengths,
                output_spatial_lengths,
                conv_filter_strides,
                conv_filter_dilations,
                input_left_pads,
                input_right_pads,
                InElementOp{},
                WeiElementOp{},
                OutElementOp{});

            if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                auto invoker_ptr = conv_ptr->MakeInvokerPointer();
                double time      = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, 10);

                double total_flop = static_cast<double>(2) * N * C * Ho * Wo * K * Y * X;
                double gflops     = (total_flop * 1e-6) / time;
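
                // One MAC is 2 flops, so total_flop = 2*N*C*K*Y*X*Ho*Wo; `time` is
                // in milliseconds, hence flop * 1e-6 / ms equals flop / 1e9 per
                // second, i.e. GFLOPS.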

                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

                if(!check_out(out_n_k_ho_wo_host_result,
                              out_n_k_ho_wo_device_result,
                              1e-6,
                              per_pixel_check))
                {
                    std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
                    success = false;
                }
                else
                {
                    std::cout << "Pass Info: " << conv_ptr->GetTypeString() << ", Time:" << time
                              << "ms, Gflops:" << gflops << std::endl;

                    if(time < fastest_kernel_time)
                    {
                        fastest_kernel_time   = time;
                        fastest_kernel_name   = conv_ptr->GetTypeString();
                        fastest_kernel_gflops = gflops;
                    }
                }
            }
            else
            {
                std::cout << "Not support Info: " << conv_ptr->GetTypeString() << std::endl;
            }
        }

        if(fastest_kernel_time != std::numeric_limits<double>::max())
        {
            std::cout << " fastest:" << fastest_kernel_name << ", time:" << fastest_kernel_time
                      << "ms, Gflops:" << fastest_kernel_gflops << std::endl;
        }

        return 0;

        // if(success)
        // {
        //     std::cout << "test conv2d fwd cpu : Pass" << std::endl;
        //     return 0;
        // }
        // else
        // {
        //     std::cout << "test conv2d fwd cpu: Fail " << std::endl;
        //     return -1;
        // }
    };

    if(data_type == 0)
    {
        return Run(F32(), F32(), F32());
    }
    else
    {
        return 1;
    }
}
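
The weight repacking done by transpose_kyxc_2_kyxc8k above can be sanity-checked in isolation. The following is a minimal standalone sketch, not part of the commit: it uses plain std::vector in place of the repo's Tensor class and illustrative sizes, but applies exactly the same index mapping:

#include <cstdio>
#include <vector>

// Same mapping as transpose_kyxc_2_kyxc8k: view src as [K/8][8][C*Y*X] and
// store it as [K/8][C*Y*X][8].
int main()
{
    const int K = 16, C = 2, Y = 1, X = 1; // toy sizes; K must be a multiple of 8
    const int batch = K / 8, row = 8, col = C * Y * X;

    std::vector<int> src(K * col), dst(K * col);
    for(int i = 0; i < (int)src.size(); i++)
        src[i] = i; // value == flat KYXC index, so the shuffle is easy to read

    for(int i_b = 0; i_b < batch; i_b++)
        for(int i_r = 0; i_r < row; i_r++)
            for(int i_c = 0; i_c < col; i_c++)
                dst[i_b * col * row + i_c * row + i_r] = src[i_b * row * col + i_r * col + i_c];

    // The first 8 entries of dst are the first filter tap of output channels
    // 0..7, i.e. src indices 0, 2, 4, ..., 14 for these toy sizes.
    for(int i = 0; i < 8; i++)
        printf("dst[%d] = %d\n", i, dst[i]);
}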
test/CMakeLists.txt

@@ -69,6 +69,4 @@ add_subdirectory(reduce)
 add_subdirectory(conv2d_bwd_weight)
 add_subdirectory(convnd_bwd_data)
 add_subdirectory(cpu_ukernel)
-add_subdirectory(cpu_threadwise_transfer)
-add_subdirectory(convnd_fwd_cpu)
 # DONOT add client_app, that is tested via CI independently
test/convnd_fwd_cpu/CMakeLists.txt (deleted, mode 100644 → 0)

add_test_executable(test_conv2d_fwd_cpu conv2d_fwd_cpu.cpp)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE host_tensor)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE device_conv2d_fwd_cpu_instance)
# 3.13 introduce target_link_directories, which is better
set_target_properties(test_conv2d_fwd_cpu PROPERTIES LINK_FLAGS "${OMP_LINK_FLAG}")
target_link_libraries(test_conv2d_fwd_cpu PRIVATE "${OMP_LIBRARY}")
target_compile_options(test_conv2d_fwd_cpu PRIVATE "${OMP_CXX_FLAG}")
test/cpu_threadwise_transfer/CMakeLists.txt (deleted, mode 100644 → 0)

add_test_executable(test_cpu_threadwise_transfer cpu_threadwise_transfer.cpp)
target_link_libraries(test_cpu_threadwise_transfer PRIVATE host_tensor)
# 3.13 introduce target_link_directories, which is better
set_target_properties(test_cpu_threadwise_transfer PROPERTIES LINK_FLAGS -Wl,-rpath,/opt/rocm/llvm/lib)
target_link_libraries(test_cpu_threadwise_transfer PRIVATE /opt/rocm/llvm/lib/libomp.so)
target_compile_options(test_cpu_threadwise_transfer PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument)
test/cpu_threadwise_transfer/cpu_threadwise_transfer.cpp (deleted, mode 100644 → 0)

#include <iostream>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <string>
#include <sstream>
#include <tuple>
#include <memory>
#include <half.hpp>
#include <omp.h>

#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "device.hpp"
#include "config.hpp"
#include "print.hpp"
#include "cpuid.hpp"
#include "threadwise_tensor_slice_transfer_avx2.hpp"
#include "element_wise_operation_cpu.hpp"
#include "dynamic_buffer_cpu.hpp"

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

// using AType = half_float::half;
// using BType = half_float::half;
using AType = float;
using BType = float;
using CType = float;

#define NTStore false

using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;

static inline int conv_out_size(int in_size, int pad, int dilation, int ksize, int stride)
{
    return (in_size + 2 * pad - dilation * (ksize - 1) - 1) / stride + 1;
}
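
// Standard convolution output-size formula: with input size i, padding p,
// dilation d, kernel k and stride s, out = (i + 2p - d*(k-1) - 1)/s + 1.
// For the defaults below (hi=8, fy=3, py=0, dy=1, sy=1) this gives ho = 6.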

#define MC 16
#define NC 24
#define KC 32

#define IsInputPadded true
#define IsInputCBlockTranspose false
#define CBlockMVector 8

template <typename T>
static inline void dump_memory(T* ptr, ck::index_t elem)
{
    for(ck::index_t i = 0; i < elem; i++)
    {
        std::cout << i << ": 0x" << std::hex << ptr[i] << std::dec << std::endl;
    }
}

int main(int argc, char** argv)
{
    int n  = 2;
    int hi = 8;
    int wi = 6;
    int c  = 8;
    int fy = 3;
    int fx = 3;
    int dy = 1;
    int dx = 1;
    int sy = 1;
    int sx = 1;
    int py = 0;
    int px = 0;

    if(argc > 12)
    {
        n  = std::atoi(argv[1]);
        hi = std::atoi(argv[2]);
        wi = std::atoi(argv[3]);
        c  = std::atoi(argv[4]);
        fy = std::atoi(argv[5]);
        fx = std::atoi(argv[6]);
        dy = std::atoi(argv[7]);
        dx = std::atoi(argv[8]);
        sy = std::atoi(argv[9]);
        sx = std::atoi(argv[10]);
        py = std::atoi(argv[11]);
        px = std::atoi(argv[12]);
    }

    int ho = conv_out_size(hi, py, dy, fy, sy);
    int wo = conv_out_size(wi, px, dx, fx, sx);

    DeviceAlignedMemCPU input_mem(n * c * hi * wi * sizeof(AType), 32);
    DeviceAlignedMemCPU input_cblock_mem(MC * KC * sizeof(AType), 32);

    auto gen_input_buffer =
        [&](AType* ptr, ck::index_t N, ck::index_t Hi, ck::index_t Wi, ck::index_t C) {
            for(auto i_n = 0; i_n < N; i_n++)
            {
                for(auto i_hi = 0; i_hi < Hi; i_hi++)
                {
                    for(auto i_wi = 0; i_wi < Wi; i_wi++)
                    {
                        for(auto i_c = 0; i_c < C; i_c++)
                        {
                            auto index = i_n * Hi * Wi * C + i_hi * Wi * C + i_wi * C + i_c;
                            auto value = ((i_n & 0xff) << 24) | ((i_hi & 0xff) << 16) |
                                         ((i_wi & 0xff) << 8) | ((i_c & 0xff) << 0);
                            ptr[index] = *reinterpret_cast<AType*>(&value);
                        }
                    }
                }
            }
        };

    gen_input_buffer(reinterpret_cast<AType*>(input_mem.mpDeviceBuf), n, hi, wi, c);

    const auto input_desc = [&]() {
        const auto in_n_hi_wi_c_grid_desc =
            ck::make_naive_tensor_descriptor_packed(ck::make_tuple(n, hi, wi, c));

        const auto in_n_hip_wip_c_grid_desc = ck::transform_tensor_descriptor(
            in_n_hi_wi_c_grid_desc,
            ck::make_tuple(ck::make_pass_through_transform(n),
                           ck::make_pad_transform(hi, py, py),
                           ck::make_pad_transform(wi, px, px),
                           ck::make_pass_through_transform(c)),
            ck::make_tuple(
                ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
            ck::make_tuple(
                ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}));

        const auto in_n_y_ho_x_wo_c_grid_desc = ck::transform_tensor_descriptor(
            in_n_hip_wip_c_grid_desc,
            ck::make_tuple(ck::make_pass_through_transform(n),
                           ck::make_embed_transform(ck::make_tuple(fy, ho), ck::make_tuple(dy, sy)),
                           ck::make_embed_transform(ck::make_tuple(fx, wo), ck::make_tuple(dx, sx)),
                           ck::make_pass_through_transform(c)),
            ck::make_tuple(
                ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
            ck::make_tuple(
                ck::Sequence<0>{}, ck::Sequence<1, 2>{}, ck::Sequence<3, 4>{}, ck::Sequence<5>{}));

        const auto in_gemm_m_k_grid_desc = ck::transform_tensor_descriptor(
            in_n_y_ho_x_wo_c_grid_desc,
            ck::make_tuple(ck::make_merge_transform(ck::make_tuple(n, ho, wo)),
                           ck::make_merge_transform(ck::make_tuple(fy, fx, c))),
            ck::make_tuple(ck::Sequence<0, 2, 4>{}, ck::Sequence<1, 3, 5>{}),
            ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
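
        // The pad -> embed -> merge chain above is an implicit im2col view of the
        // input: GEMM_M = n*ho*wo (one row per output pixel) and GEMM_K = fy*fx*c
        // (one column per filter tap), built as coordinate transforms instead of
        // materializing the im2col matrix in memory.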

        if constexpr(IsInputPadded)
        {
            const auto gemm_m_raw = n * ho * wo;
            const auto gemm_m_pad = ck::math::integer_least_multiple(gemm_m_raw, MC) - gemm_m_raw;
            const auto gemm_k_raw = c * fy * fx;
            const auto gemm_k_pad = ck::math::integer_least_multiple(gemm_k_raw, KC) - gemm_k_raw;

            const auto in_gemm_pm_pk_grid_desc = ck::transform_tensor_descriptor(
                in_gemm_m_k_grid_desc,
                ck::make_tuple(ck::make_right_pad_transform(gemm_m_raw, gemm_m_pad),
                               ck::make_right_pad_transform(gemm_k_raw, gemm_k_pad)),
                ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}),
                ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));

            if constexpr(IsInputCBlockTranspose)
            {
                constexpr auto I0 = ck::Number<0>{};
                constexpr auto I1 = ck::Number<1>{};

                const auto in_gemm_pm0_pk_pm1 = ck::transform_tensor_descriptor(
                    in_gemm_pm_pk_grid_desc,
                    ck::make_tuple(
                        ck::make_unmerge_transform(ck::make_tuple(
                            in_gemm_pm_pk_grid_desc.GetLength(I0) / CBlockMVector, CBlockMVector)),
                        ck::make_pass_through_transform(in_gemm_pm_pk_grid_desc.GetLength(I1))),
                    ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}),
                    ck::make_tuple(ck::Sequence<0, 2>{}, ck::Sequence<1>{}));
                return in_gemm_pm0_pk_pm1;
            }
            else
                return in_gemm_pm_pk_grid_desc;
        }
        else
        {
            return in_gemm_m_k_grid_desc;
        }
    }();

    const auto input_cblock_desc = [&]() {
        if constexpr(IsInputCBlockTranspose)
        {
            const auto in_cblock_m_k_m8 = ck::make_naive_tensor_descriptor_packed(
                ck::make_tuple(MC / CBlockMVector, KC, CBlockMVector));
            return in_cblock_m_k_m8;
        }
        else
        {
            return ck::make_naive_tensor_descriptor_packed(ck::make_tuple(MC, KC));
        }
    }();

    constexpr auto get_dim_access_order = []() {
        if constexpr(IsInputCBlockTranspose)
            return ck::Sequence<1, 0, 2>{};
        else
            return ck::Sequence<0, 1>{};
    };

    constexpr auto get_slice_length = []() {
        if constexpr(IsInputCBlockTranspose)
            return ck::Sequence<MC / CBlockMVector, KC, CBlockMVector>{};
        else
            return ck::Sequence<MC, KC>{};
    };

    using threadwise_transfer_t = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
        AType,                              // SrcData
        AType,                              // DstData
        decltype(input_desc),               // SrcDesc
        decltype(input_cblock_desc),        // DstDesc
        PassThrough,                        // ElementwiseOperation
        decltype(get_slice_length()),       // SliceLengths
        decltype(get_dim_access_order()),   // DimAccessOrder
        1,                                  // VectorDim
        1,                                  // ScalarPerVector
        ck::InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum
        false,                              // SrcResetCoordinateAfterRun
        true                                // DstResetCoordinateAfterRun
        >;

    static constexpr ck::index_t nDim =
        ck::remove_reference_t<decltype(input_desc)>::GetNumOfDimension();

    input_desc.Print();

    auto threadwise_transfer = threadwise_transfer_t{input_desc,
                                                     ck::make_zero_multi_index<nDim>(),
                                                     input_cblock_desc,
                                                     ck::make_zero_multi_index<nDim>(),
                                                     PassThrough{}};

    auto input_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
        static_cast<AType*>(input_mem.mpDeviceBuf), input_mem.mMemSize / sizeof(AType));

    auto input_cblock = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
        static_cast<AType*>(input_cblock_mem.mpDeviceBuf),
        input_cblock_mem.mMemSize / sizeof(AType));

    constexpr auto fwd_move_step = []() {
        if constexpr(IsInputCBlockTranspose)
            return ck::make_multi_index(0, KC, 0); // m/8 * k * 8
        else
            return ck::make_multi_index(0, KC);
    };

    threadwise_transfer.RunGeneric(input_desc, input_buf, input_cblock_desc, input_cblock);
    printf("----------------------\n");
    threadwise_transfer.MoveSrcSliceWindow(input_desc, fwd_move_step());
    // threadwise_transfer.RunGeneric(input_desc, input_buf, input_cblock_desc, input_cblock);

    dump_memory(reinterpret_cast<uint32_t*>(input_mem.mpDeviceBuf),
                input_mem.mMemSize / sizeof(AType));
    std::cout << "======================" << std::endl;
    dump_memory(reinterpret_cast<uint32_t*>(input_cblock_mem.mpDeviceBuf),
                input_cblock_mem.mMemSize / sizeof(AType));
}