Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
07a673c6
Commit
07a673c6
authored
Apr 14, 2022
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into cpu_avx2
parents
c0f698d5
ac0d8066
Changes
307
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
477 additions
and
152 deletions
+477
-152
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+2
-2
profiler/CMakeLists.txt
profiler/CMakeLists.txt
+3
-0
profiler/README.md
profiler/README.md
+48
-0
profiler/include/profile_batched_gemm_impl.hpp
profiler/include/profile_batched_gemm_impl.hpp
+2
-2
profiler/include/profile_conv_bwd_data_impl.hpp
profiler/include/profile_conv_bwd_data_impl.hpp
+4
-1
profiler/include/profile_conv_bwd_weight_impl.hpp
profiler/include/profile_conv_bwd_weight_impl.hpp
+275
-0
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
+4
-1
profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
...er/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
+3
-1
profiler/include/profile_conv_fwd_bias_relu_impl.hpp
profiler/include/profile_conv_fwd_bias_relu_impl.hpp
+3
-1
profiler/include/profile_conv_fwd_impl.hpp
profiler/include/profile_conv_fwd_impl.hpp
+4
-1
profiler/include/profile_convnd_bwd_data_impl.hpp
profiler/include/profile_convnd_bwd_data_impl.hpp
+59
-93
profiler/include/profile_gemm_bias_2d_impl.hpp
profiler/include/profile_gemm_bias_2d_impl.hpp
+3
-1
profiler/include/profile_gemm_bias_relu_add_impl.hpp
profiler/include/profile_gemm_bias_relu_add_impl.hpp
+3
-1
profiler/include/profile_gemm_bias_relu_impl.hpp
profiler/include/profile_gemm_bias_relu_impl.hpp
+3
-1
profiler/include/profile_gemm_impl.hpp
profiler/include/profile_gemm_impl.hpp
+4
-2
profiler/include/profile_grouped_gemm_impl.hpp
profiler/include/profile_grouped_gemm_impl.hpp
+3
-1
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+28
-24
profiler/src/profile_batched_gemm.cpp
profiler/src/profile_batched_gemm.cpp
+14
-7
profiler/src/profile_batched_gemm_reduce.cpp
profiler/src/profile_batched_gemm_reduce.cpp
+11
-12
No files found.
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
View file @
07a673c6
...
...
@@ -20,7 +20,7 @@ using S = ck::Sequence<Is...>;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
_t
::
Default
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
using
device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances
=
std
::
tuple
<
...
...
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
View file @
07a673c6
...
...
@@ -20,8 +20,8 @@ using S = ck::Sequence<Is...>;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
_t
::
Default
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
_t
::
MNPadding
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
using
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
=
std
::
tuple
<
...
...
profiler/CMakeLists.txt
View file @
07a673c6
...
...
@@ -15,6 +15,7 @@ include_directories(BEFORE
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/tensor_operation_instance/gpu/reduce
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/reference_tensor_operation/cpu
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/reference_tensor_operation/gpu
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/utility
${
PROJECT_SOURCE_DIR
}
/profiler/include
${
PROJECT_SOURCE_DIR
}
/external/include/half
)
...
...
@@ -36,6 +37,7 @@ set(PROFILER_SOURCE
src/profile_convnd_bwd_data.cpp
src/profile_reduce.cpp
src/profile_grouped_gemm.cpp
src/profile_conv_bwd_weight.cpp
src/profile_batched_gemm_reduce.cpp
)
...
...
@@ -57,4 +59,5 @@ target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
target_link_libraries
(
ckProfiler PRIVATE device_reduce_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_reduce_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_grouped_gemm_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_bwd_weight_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_batched_gemm_reduce_instance
)
profiler/
src/
README.md
→
profiler/README.md
View file @
07a673c6
## Docker script
```
bash
docker run
\
-it
\
--rm
\
--privileged
\
--group-add
sudo
\
-w
/root/workspace
\
-v
${
PATH_TO_LOCAL_WORKSPACE
}
:/root/workspace
\
rocm/tensorflow:rocm4.3.1-tf2.6-dev
\
/bin/bash
```
## Build ```ckProfiler```
```
bash
mkdir
build
&&
cd
build
```
```
bash
# Need to Specify target ID, example below is gfx908
cmake
\
-D
BUILD_DEV
=
OFF
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_CXX_FLAGS
=
"-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
..
```
```
bash
make
-j
ckProfiler
```
## Profile GEMM kernels
```
bash
#arg1: tensor operation (gemm=GEMM)
...
...
@@ -42,8 +9,8 @@ cmake \
#arg7: run kernel # of times (>1)
#arg8 to 13: M, N, K, StrideA, StrideB, StrideC
################
#####
op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
./
profiler
/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096
################
op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
./
bin
/ckProfiler
gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
...
...
@@ -55,7 +22,7 @@ c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
```
## Profile forward convolution kernels
## Profile
2d
forward convolution kernels
```
bash
#arg1: tensor operation (conv=Convolution)
#arg2: data type (0=fp32, 1=fp16)
...
...
@@ -67,8 +34,8 @@ Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
#arg8: print matrix value (0=no, 1=yes)
#arg9: run kernel # of times (>1)
#arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
################
#####
op
datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
./
profiler
/ckProfiler conv_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
################
op
datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
./
bin
/ckProfiler
conv
2d
_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
...
...
profiler/include/profile_batched_gemm_impl.hpp
View file @
07a673c6
...
...
@@ -2,6 +2,7 @@
#include <memory>
#include "check_err.hpp"
#include "config.hpp"
#include "element_wise_operation.hpp"
#include "tensor_layout.hpp"
...
...
@@ -69,7 +70,7 @@ bool profile_batched_gemm_impl(int do_verification,
int
StrideA
,
int
StrideB
,
int
StrideC
,
int
BatchCount
=
1
)
int
BatchCount
)
{
bool
pass
=
true
;
...
...
@@ -393,7 +394,6 @@ bool profile_batched_gemm_impl(int do_verification,
}
else
{
float
err
=
check_error
(
c_g_m_n_host_result
,
c_g_m_n_device_result
);
pass
=
pass
&&
(
err
<
1E-6
);
}
...
...
profiler/include/profile_conv_bwd_data_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -253,7 +255,8 @@ void profile_conv_bwd_data_impl(int do_verification,
{
in_device_buf
.
FromDevice
(
in_n_c_hi_wi_device_result
.
mData
.
data
());
check_error
(
in_n_c_hi_wi_host_result
,
in_n_c_hi_wi_device_result
);
ck
::
utils
::
check_err
(
in_n_c_hi_wi_device_result
.
mData
,
in_n_c_hi_wi_host_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_conv_bwd_weight_impl.hpp
0 → 100644
View file @
07a673c6
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_backward_weight.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_backward_weight.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_conv2d_bwd_weight_instance
{
using
DeviceConvBwdWeightNoOpPtr
=
DeviceConvBwdWeightPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
void
add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances
(
std
::
vector
<
DeviceConvBwdWeightNoOpPtr
>&
);
void
add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances
(
std
::
vector
<
DeviceConvBwdWeightNoOpPtr
>&
);
}
// namespace device_conv2d_bwd_weight_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
ck
{
namespace
profiler
{
template
<
int
NDimSpatial
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
bool
profile_conv_bwd_weight_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
int
nrepeat
,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
,
ck
::
index_t
split_k
)
{
const
ck
::
index_t
Y
=
filter_spatial_lengths
[
0
];
const
ck
::
index_t
X
=
filter_spatial_lengths
[
1
];
const
ck
::
index_t
Hi
=
input_spatial_lengths
[
0
];
const
ck
::
index_t
Wi
=
input_spatial_lengths
[
1
];
const
ck
::
index_t
Ho
=
output_spatial_lengths
[
0
];
const
ck
::
index_t
Wo
=
output_spatial_lengths
[
1
];
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
H
,
std
::
size_t
W
,
auto
layout
)
{
if
constexpr
(
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCHW
>::
value
||
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
KCYX
>::
value
||
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NKHW
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
N_
,
C_
,
H
,
W
}),
std
::
vector
<
std
::
size_t
>
({
C_
*
H
*
W
,
H
*
W
,
W
,
1
}));
}
else
if
constexpr
(
is_same
<
decltype
(
layout
),
tensor_layout
::
convolution
::
NHWC
>::
value
||
is_same
<
decltype
(
layout
),
tensor_layout
::
convolution
::
KYXC
>::
value
||
is_same
<
decltype
(
layout
),
tensor_layout
::
convolution
::
NHWK
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
N_
,
C_
,
H
,
W
}),
std
::
vector
<
std
::
size_t
>
({
C_
*
H
*
W
,
1
,
W
*
C_
,
C_
}));
}
};
Tensor
<
InDataType
>
in_n_c_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
,
InLayout
{}));
Tensor
<
WeiDataType
>
wei_k_c_y_x_host_result
(
f_host_tensor_descriptor
(
K
,
C
,
Y
,
X
,
WeiLayout
{}));
Tensor
<
WeiDataType
>
wei_k_c_y_x_device_result
(
f_host_tensor_descriptor
(
K
,
C
,
Y
,
X
,
WeiLayout
{}));
Tensor
<
OutDataType
>
out_n_k_ho_wo
(
f_host_tensor_descriptor
(
N
,
K
,
Ho
,
Wo
,
OutLayout
{}));
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei_k_c_y_x: "
<<
wei_k_c_y_x_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_k_ho_wo: "
<<
out_n_k_ho_wo
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
out_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
});
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
break
;
default:
out_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_1
<
OutDataType
>
{
1
});
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{
1
});
}
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
const
auto
in_element_op
=
InElementOp
{};
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
out_element_op
=
OutElementOp
{};
if
(
do_verification
)
{
using
ReferenceConvBwdWeightInstance
=
ck
::
tensor_operation
::
host
::
ReferenceConvBwdWeight
<
InDataType
,
WeiDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
>
;
auto
ref_conv
=
ReferenceConvBwdWeightInstance
{};
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in_n_c_hi_wi
,
wei_k_c_y_x_host_result
,
out_n_k_ho_wo
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
);
ref_invoker
.
Run
(
ref_argument
);
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei_k_c_y_x_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
out_device_buf
.
ToDevice
(
out_n_k_ho_wo
.
mData
.
data
());
in_device_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceConvBwdWeightNoOpPtr
=
ck
::
tensor_operation
::
device
::
DeviceConvBwdWeightPtr
<
PassThrough
,
PassThrough
,
PassThrough
>
;
// add device Conv instances
std
::
vector
<
DeviceConvBwdWeightNoOpPtr
>
conv_ptrs
;
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
float
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
float
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
float
>
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_bwd_weight_instance
::
add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances
(
conv_ptrs
);
}
else
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
ck
::
half_t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
ck
::
half_t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
ck
::
half_t
>
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_bwd_weight_instance
::
add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances
(
conv_ptrs
);
}
if
(
conv_ptrs
.
size
()
<=
0
)
{
throw
std
::
runtime_error
(
"wrong! no device Conv instance found"
);
}
std
::
string
best_conv_name
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device Conv instances
bool
pass
=
true
;
for
(
auto
&
conv_ptr
:
conv_ptrs
)
{
// using atomic, so need to reset input
if
(
split_k
>
1
)
{
wei_device_buf
.
SetZero
();
}
auto
argument_ptr
=
conv_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
N
,
K
,
C
,
input_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
,
split_k
);
auto
invoker_ptr
=
conv_ptr
->
MakeInvokerPointer
();
if
(
conv_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
std
::
string
conv_name
=
conv_ptr
->
GetTypeString
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
;
std
::
size_t
num_btype
=
sizeof
(
InDataType
)
*
(
N
*
C
*
Hi
*
Wi
)
+
sizeof
(
WeiDataType
)
*
(
K
*
C
*
Y
*
X
)
+
sizeof
(
OutDataType
)
*
(
N
*
K
*
Ho
*
Wo
);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
best_conv_name
=
conv_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
wei_device_buf
.
FromDevice
(
wei_k_c_y_x_device_result
.
mData
.
data
());
float
max_error
=
check_error
(
wei_k_c_y_x_host_result
,
wei_k_c_y_x_device_result
);
if
(
max_error
>
8
)
{
pass
=
false
;
std
::
cout
<<
"Fail info:"
<<
conv_ptr
->
GetTypeString
()
<<
std
::
endl
;
}
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out: "
,
out_n_k_ho_wo
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"in : "
,
in_n_c_hi_wi
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"wei_host : "
,
wei_k_c_y_x_host_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"wei_device: "
,
wei_k_c_y_x_device_result
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_conv_name
<<
std
::
endl
;
return
pass
;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -245,7 +247,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
{
out_device_buf
.
FromDevice
(
out_n_k_ho_wo_device_result
.
mData
.
data
());
check_error
(
out_n_k_ho_wo_host_result
,
out_n_k_ho_wo_device_result
);
ck
::
utils
::
check_err
(
out_n_k_ho_wo_device_result
.
mData
,
out_n_k_ho_wo_host_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -301,7 +302,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
{
out_device_buf
.
FromDevice
(
out_n_k_ho_wo_device_result
.
mData
.
data
());
check_error
(
out_n_k_ho_wo_host_result
,
out_n_k_ho_wo_device_result
);
ck
::
utils
::
check_err
(
out_n_k_ho_wo_device_result
.
mData
,
out_n_k_ho_wo_host_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_conv_fwd_bias_relu_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -233,7 +234,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
{
out_device_buf
.
FromDevice
(
out_n_k_ho_wo_device_result
.
mData
.
data
());
check_error
(
out_n_k_ho_wo_host_result
,
out_n_k_ho_wo_device_result
);
ck
::
utils
::
check_err
(
out_n_k_ho_wo_device_result
.
mData
,
out_n_k_ho_wo_host_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_conv_fwd_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -253,7 +255,8 @@ void profile_conv_fwd_impl(int do_verification,
{
out_device_buf
.
FromDevice
(
out_n_k_ho_wo_device_result
.
mData
.
data
());
check_error
(
out_n_k_ho_wo_host_result
,
out_n_k_ho_wo_device_result
);
ck
::
utils
::
check_err
(
out_n_k_ho_wo_device_result
.
mData
,
out_n_k_ho_wo_host_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_convnd_bwd_data_impl.hpp
View file @
07a673c6
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "conv_util
s
.hpp"
#include "conv_
fwd_
util.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
...
...
@@ -12,7 +12,7 @@
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
BF16
=
ushor
t
;
using
BF16
=
ck
::
bhalf_
t
;
using
INT8
=
int8_t
;
namespace
ck
{
namespace
tensor_operation
{
...
...
@@ -68,13 +68,13 @@ HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::siz
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
InLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
InLayout
{});
}
case
2
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
InLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
InLayout
{});
}
case
1
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
InLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
InLayout
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
...
...
@@ -90,13 +90,13 @@ HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::s
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
WeiLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
WeiLayout
{});
}
case
2
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
WeiLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
WeiLayout
{});
}
case
1
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
WeiLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
WeiLayout
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
...
...
@@ -112,15 +112,14 @@ HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector<std::siz
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
OutLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
OutLayout
{});
}
case
2
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
OutLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
OutLayout
{});
}
case
1
:
{
return
ck
::
conv_util
::
G
et
H
ost
T
ensor
D
escriptor
(
dims
,
OutLayout
{});
return
ck
::
utils
::
conv
::
g
et
_h
ost
_t
ensor
_d
escriptor
(
dims
,
OutLayout
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
...
...
@@ -274,13 +273,13 @@ bool profile_convnd_bwd_data_impl(int do_verification,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
const
std
::
vector
<
ck
::
index_t
>
&
input_spatial_lengths
,
const
std
::
vector
<
ck
::
index_t
>
&
filter_spatial_lengths
,
const
std
::
vector
<
ck
::
index_t
>
&
output_spatial_lengths
,
const
std
::
vector
<
ck
::
index_t
>
&
conv_filter_strides
,
const
std
::
vector
<
ck
::
index_t
>
&
conv_filter_dilations
,
const
std
::
vector
<
ck
::
index_t
>
&
input_left_pads
,
const
std
::
vector
<
ck
::
index_t
>
&
input_right_pads
)
{
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
...
...
@@ -304,51 +303,49 @@ bool profile_convnd_bwd_data_impl(int do_verification,
std
::
begin
(
output_spatial_lengths
),
std
::
end
(
output_spatial_lengths
));
Tensor
<
InDataType
>
in
_n_c_hi_wi
_host_result
(
Tensor
<
InDataType
>
in
put
_host_result
(
get_input_host_tensor_descriptor
<
InLayout
>
(
input_dims
,
NDimSpatial
));
Tensor
<
InDataType
>
in
_n_c_hi_wi
_device_result
(
Tensor
<
InDataType
>
in
put
_device_result
(
get_input_host_tensor_descriptor
<
InLayout
>
(
input_dims
,
NDimSpatial
));
Tensor
<
WeiDataType
>
wei
_k_c_y_x
(
Tensor
<
WeiDataType
>
wei
ghts
(
get_filters_host_tensor_descriptor
<
WeiLayout
>
(
filter_dims
,
NDimSpatial
));
Tensor
<
OutDataType
>
out
_n_k_ho_wo
(
Tensor
<
OutDataType
>
out
put
(
get_output_host_ensor_descriptor
<
OutLayout
>
(
output_dims
,
NDimSpatial
));
std
::
cout
<<
"in
_n_c_hi_wi: "
<<
in_n_c_hi_wi
_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei
_k_c_y_x
: "
<<
wei
_k_c_y_x
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out
_n_k_ho_wo: "
<<
out_n_k_ho_wo
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"in
put: "
<<
input
_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei
ghts
: "
<<
wei
ghts
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out
put: "
<<
output
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
out
_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
});
wei
_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
5
,
5
});
out
put
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
});
wei
ghts
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
5
,
5
});
break
;
default:
out
_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_1
<
OutDataType
>
{
1
});
wei
_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_1
<
WeiDataType
>
{
1
});
out
put
.
GenerateTensorValue
(
GeneratorTensor_1
<
OutDataType
>
{
1
});
wei
ghts
.
GenerateTensorValue
(
GeneratorTensor_1
<
WeiDataType
>
{
1
});
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
input_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
weights
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
output
.
mDesc
.
GetElementSpace
());
out_device_buf
.
ToDevice
(
out
_n_k_ho_wo
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
_k_c_y_x
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
put
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
ghts
.
mData
.
data
());
// reset input to zero
in_n_c_hi_wi_device_result
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{
0
});
in_device_buf
.
ToDevice
(
in_n_c_hi_wi_device_result
.
mData
.
data
());
in_device_buf
.
SetZero
();
if
(
do_verification
)
{
auto
RunReference
=
[
&
](
auto
&
ref_conv
)
{
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
_n_c_hi_wi
_host_result
,
wei
_k_c_y_x
,
out
_n_k_ho_wo
,
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
put
_host_result
,
wei
ghts
,
out
put
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
...
...
@@ -358,48 +355,16 @@ bool profile_convnd_bwd_data_impl(int do_verification,
OutElementOp
{});
ref_invoker
.
Run
(
ref_argument
);
};
switch
(
NDimSpatial
)
{
case
3
:
{
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvBwdData
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
3
>
();
RunReference
(
ref_conv
);
break
;
}
case
2
:
{
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvBwdData
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
2
>
();
RunReference
(
ref_conv
);
break
;
}
case
1
:
{
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvBwdData
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
1
>
();
RunReference
(
ref_conv
);
break
;
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
}
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvBwdData
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
NDimSpatial
>
();
RunReference
(
ref_conv
);
}
// add device Conv instances
...
...
@@ -448,9 +413,10 @@ bool profile_convnd_bwd_data_impl(int do_verification,
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
std
::
size_t
flop
=
ck
::
conv_util
::
GetFlops
(
N
,
C
,
K
,
filter_spatial_lengths
,
output_spatial_lengths
);
std
::
size_t
num_btype
=
ck
::
conv_util
::
GetBtype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
N
,
C
,
K
,
input_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
);
ck
::
utils
::
conv
::
get_flops
(
N
,
C
,
K
,
filter_spatial_lengths
,
output_spatial_lengths
);
std
::
size_t
num_btype
=
ck
::
utils
::
conv
::
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
N
,
C
,
K
,
input_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
...
...
@@ -468,9 +434,9 @@ bool profile_convnd_bwd_data_impl(int do_verification,
if
(
do_verification
)
{
in_device_buf
.
FromDevice
(
in
_n_c_hi_wi
_device_result
.
mData
.
data
());
in_device_buf
.
FromDevice
(
in
put
_device_result
.
mData
.
data
());
if
(
!
check_out
(
in
_n_c_hi_wi
_host_result
,
in
_n_c_hi_wi
_device_result
))
if
(
!
check_out
(
in
put
_host_result
,
in
put
_device_result
))
{
std
::
cout
<<
"Fail Info: "
<<
conv_ptr
->
GetTypeString
()
<<
std
::
endl
;
...
...
@@ -481,24 +447,24 @@ bool profile_convnd_bwd_data_impl(int do_verification,
std
::
cout
<<
"Pass Info: "
<<
conv_ptr
->
GetTypeString
()
<<
std
::
endl
;
}
check_error
(
in
_n_c_hi_wi
_host_result
,
in
_n_c_hi_wi
_device_result
);
check_error
(
in
put
_host_result
,
in
put
_device_result
);
if
(
do_log
)
{
std
::
cout
<<
"in : "
;
show_data_nhwc_layout
(
out
_n_k_ho_wo
);
show_data_nhwc_layout
(
out
put
);
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"wei: "
;
show_data_nhwc_layout
(
wei
_k_c_y_x
);
show_data_nhwc_layout
(
wei
ghts
);
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"out_host : "
;
show_data_nhwc_layout
(
in
_n_c_hi_wi
_host_result
);
show_data_nhwc_layout
(
in
put
_host_result
);
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"out_device: "
;
show_data_nhwc_layout
(
in
_n_c_hi_wi
_device_result
);
show_data_nhwc_layout
(
in
put
_device_result
);
std
::
cout
<<
std
::
endl
;
}
}
...
...
profiler/include/profile_gemm_bias_2d_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -283,7 +285,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
{
c_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
check_err
or
(
c_m_n_
host
_result
,
c_m_n_
device
_result
);
ck
::
utils
::
check_err
(
c_m_n_
device
_result
.
mData
,
c_m_n_
host
_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_gemm_bias_relu_add_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -257,7 +259,7 @@ void profile_gemm_bias_relu_add_impl(int do_verification,
{
c_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
check_err
or
(
c_m_n_
host
_result
,
c_m_n_
device
_result
);
ck
::
utils
::
check_err
(
c_m_n_
device
_result
.
mData
,
c_m_n_
host
_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_gemm_bias_relu_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -236,7 +238,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
{
c_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
check_err
or
(
c_m_n_
host
_result
,
c_m_n_
device
_result
);
ck
::
utils
::
check_err
(
c_m_n_
device
_result
.
mData
,
c_m_n_
host
_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_gemm_impl.hpp
View file @
07a673c6
#pragma once
#include <iomanip>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -470,7 +472,7 @@ void profile_gemm_impl(int do_verification,
ref_invoker
.
Run
(
ref_argument
);
check_err
or
(
c_m_n_
host
_result
,
c_m_n_
device_f32
_result
);
ck
::
utils
::
check_err
(
c_m_n_
device_f32
_result
.
mData
,
c_m_n_
host
_result
.
mData
);
if
(
do_log
)
{
...
...
@@ -499,7 +501,7 @@ void profile_gemm_impl(int do_verification,
a_m_k
,
b_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
check_err
or
(
c_m_n_
host
_result
,
c_m_n_
device
_result
);
ck
::
utils
::
check_err
(
c_m_n_
device
_result
.
mData
,
c_m_n_
host
_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_grouped_gemm_impl.hpp
View file @
07a673c6
#pragma once
#include <iomanip>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -283,7 +285,7 @@ void profile_grouped_gemm_impl(int do_verification,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
check_err
or
(
c_m_n_
host
_result
,
c_m_n_
device
_result
s
[
i
]
);
ck
::
utils
::
check_err
(
c_m_n_
device
_result
s
[
i
].
mData
,
c_m_n_
host
_result
.
mData
);
if
(
do_log
)
{
...
...
profiler/include/profile_reduce_impl.hpp
View file @
07a673c6
#pragma once
#include "check_err.hpp"
#include "device_reduce.hpp"
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
...
...
@@ -64,9 +66,9 @@ template <typename DescriptionType>
bool
description_match
(
const
DescriptionType
&
description
,
int
Rank
,
const
std
::
vector
<
int
>&
reduceDims
,
ReduceTensorOp
_t
ReduceOpId
,
NanPropagation
_t
NanOpt
,
ReduceTensorIndices
_t
IndicesOpt
)
ReduceTensorOp
ReduceOpId
,
NanPropagation
NanOpt
,
ReduceTensorIndices
IndicesOpt
)
{
if
(
description
.
Rank_
!=
Rank
||
description
.
ReduceOpId_
!=
static_cast
<
int
>
(
ReduceOpId
)
||
description
.
NanOpt_
!=
static_cast
<
int
>
(
NanOpt
)
||
...
...
@@ -148,9 +150,9 @@ template <typename InDataType,
typename
OutDataType
,
int
Rank
,
int
NumReduceDim
,
ReduceTensorOp
_t
ReduceOpId
,
NanPropagation
_t
NanOpt
,
ReduceTensorIndices
_t
IndicesOpt
>
ReduceTensorOp
ReduceOpId
,
NanPropagation
NanOpt
,
ReduceTensorIndices
IndicesOpt
>
void
profile_reduce_impl_impl
(
bool
do_verification
,
int
init_method
,
bool
do_log
,
...
...
@@ -166,17 +168,17 @@ void profile_reduce_impl_impl(bool do_verification,
using
namespace
ck
::
host_reduce
;
constexpr
bool
op_support_indices
=
(
ReduceOpId
==
ReduceTensorOp
_t
::
MIN
||
ReduceOpId
==
ReduceTensorOp
_t
::
MAX
||
ReduceOpId
==
ReduceTensorOp
_t
::
AMAX
);
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
(
op_support_indices
&&
(
IndicesOpt
!=
ReduceTensorIndices
_t
::
NO_INDICES
));
(
op_support_indices
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
));
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
_t
::
PROPAGATE_NAN
);
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
PROPAGATE_NAN
);
constexpr
bool
out_support_atomic_add
=
std
::
is_same
<
OutDataType
,
float
>::
value
;
constexpr
bool
op_support_atomic_add
=
!
op_support_indices
&&
ReduceOpId
!=
ReduceTensorOp
_t
::
NORM2
;
!
op_support_indices
&&
ReduceOpId
!=
ReduceTensorOp
::
NORM2
;
constexpr
bool
use_atomic_add
=
(
out_support_atomic_add
&&
op_support_atomic_add
);
// 1) If InDataType is half_t, must use half_t as AccDataType for indexable reduction operations
...
...
@@ -194,7 +196,7 @@ void profile_reduce_impl_impl(bool do_verification,
// 1) The indices can only be used when the reduction operation is indexable
constexpr
bool
invalid_reduce_3
=
(
!
op_support_indices
&&
IndicesOpt
!=
ReduceTensorIndices
_t
::
NO_INDICES
);
(
!
op_support_indices
&&
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
// 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations
// 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction
...
...
@@ -207,8 +209,8 @@ void profile_reduce_impl_impl(bool do_verification,
// 1) If InDataType is int8_t, the supported operation must be either indexable operations or
// ADD/AVG
constexpr
bool
invalid_reduce_5
=
std
::
is_same
<
InDataType
,
int8_t
>::
value
&&
(
!
op_support_indices
&&
ReduceOpId
!=
ReduceTensorOp
_t
::
ADD
&&
ReduceOpId
!=
ReduceTensorOp
_t
::
AVG
);
(
!
op_support_indices
&&
ReduceOpId
!=
ReduceTensorOp
::
ADD
&&
ReduceOpId
!=
ReduceTensorOp
::
AVG
);
// 1) If InDataType is bhalf_t, must use float as AccDataType for all reduction operations
constexpr
bool
invalid_reduce_6
=
...
...
@@ -455,12 +457,13 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
do_verification
)
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
c
heck_error
(
out_ref
,
out
);
c
k
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
check_indices
(
out_indices_ref
,
out_indices
);
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
};
if
(
do_log
)
...
...
@@ -577,12 +580,13 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
do_verification
)
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
c
heck_error
(
out_ref
,
out
);
c
k
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
check_indices
(
out_indices_ref
,
out_indices
);
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
};
if
(
do_log
)
...
...
@@ -631,9 +635,9 @@ void profile_reduce_impl(bool do_verification,
int
nrepeat
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
ReduceTensorOp
_t
ReduceOpId
,
NanPropagation
_t
NanOpt
,
ReduceTensorIndices
_t
IndicesOpt
,
ReduceTensorOp
ReduceOpId
,
NanPropagation
NanOpt
,
ReduceTensorIndices
IndicesOpt
,
float
alpha
,
float
beta
)
{
...
...
@@ -659,9 +663,9 @@ void profile_reduce_impl(bool do_verification,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
_t
>
(
descType
::
ReduceOpId_
),
static_cast
<
NanPropagation
_t
>
(
descType
::
NanOpt_
),
static_cast
<
ReduceTensorIndices
_t
>
(
descType
::
IndicesOpt_
)
>
(
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
static_cast
<
NanPropagation
>
(
descType
::
NanOpt_
),
static_cast
<
ReduceTensorIndices
>
(
descType
::
IndicesOpt_
)
>
(
do_verification
,
init_method
,
do_log
,
...
...
profiler/src/profile_batched_gemm.cpp
View file @
07a673c6
...
...
@@ -128,7 +128,8 @@ int profile_batched_gemm(int argc, char* argv[])
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
...
...
@@ -147,7 +148,8 @@ int profile_batched_gemm(int argc, char* argv[])
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
...
...
@@ -206,7 +208,8 @@ int profile_batched_gemm(int argc, char* argv[])
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
...
...
@@ -225,7 +228,8 @@ int profile_batched_gemm(int argc, char* argv[])
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
...
...
@@ -284,7 +288,8 @@ int profile_batched_gemm(int argc, char* argv[])
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
...
...
@@ -303,7 +308,8 @@ int profile_batched_gemm(int argc, char* argv[])
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
...
...
@@ -362,7 +368,8 @@ int profile_batched_gemm(int argc, char* argv[])
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
...
...
profiler/src/profile_batched_gemm_reduce.cpp
View file @
07a673c6
...
...
@@ -9,7 +9,7 @@
int
profile_batched_gemm_reduce
(
int
argc
,
char
*
argv
[])
{
enum
struct
GemmMatrixLayout
_t
enum
struct
GemmMatrixLayout
{
MK_KN_MN
,
// 0
MK_NK_MN
,
// 1
...
...
@@ -17,7 +17,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
KM_NK_MN
,
// 3
};
enum
struct
GemmReduceDataType
_t
enum
struct
GemmReduceDataType
{
F32_F32_F32_F32_F32
,
// 0
F16_F16_F16_F32_F32
,
// 1
...
...
@@ -40,8 +40,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
exit
(
1
);
}
const
auto
data_type
=
static_cast
<
GemmReduceDataType
_t
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
GemmMatrixLayout
_t
>
(
std
::
stoi
(
argv
[
3
]));
const
auto
data_type
=
static_cast
<
GemmReduceDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
GemmMatrixLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
...
...
@@ -57,8 +57,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
const
int
BatchCount
=
std
::
stoi
(
argv
[
14
]);
if
(
data_type
==
GemmReduceDataType_t
::
F16_F16_F16_F32_F32
&&
layout
==
GemmMatrixLayout_t
::
MK_KN_MN
)
if
(
data_type
==
GemmReduceDataType
::
F16_F16_F16_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_batched_gemm_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
...
...
@@ -79,8 +78,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmReduceDataType
_t
::
F16_F16_F16_F32_F32
&&
layout
==
GemmMatrixLayout
_t
::
MK_NK_MN
)
else
if
(
data_type
==
GemmReduceDataType
::
F16_F16_F16_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_batched_gemm_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
...
...
@@ -101,8 +100,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmReduceDataType
_t
::
F16_F16_F16_F32_F32
&&
layout
==
GemmMatrixLayout
_t
::
KM_KN_MN
)
else
if
(
data_type
==
GemmReduceDataType
::
F16_F16_F16_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_batched_gemm_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
...
...
@@ -123,8 +122,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
(
StrideC
<
0
)
?
N
:
StrideC
,
BatchCount
);
}
else
if
(
data_type
==
GemmReduceDataType
_t
::
F16_F16_F16_F32_F32
&&
layout
==
GemmMatrixLayout
_t
::
KM_NK_MN
)
else
if
(
data_type
==
GemmReduceDataType
::
F16_F16_F16_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_batched_gemm_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
...
...
Prev
1
…
10
11
12
13
14
15
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment