Commit 31b40352 (unverified)
Authored Aug 18, 2021 by Chao Liu; committed via GitHub, Aug 18, 2021

Merge pull request #16 from ROCmSoftwarePlatform/develop

Merge develop into master
Parents: 5781adf5, b62bf8c3
Changes: 145

Showing 20 changed files with 1252 additions and 2281 deletions (+1252 −2281)
host/driver_offline/include/driver_gemm_dlops_v1r2.hpp    +413  −0
host/driver_offline/include/driver_gemm_dlops_v1r3.hpp    +418  −0
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp   +191  −0
host/driver_offline/src/conv_bwd_driver_offline.cpp       +73   −109
host/driver_offline/src/conv_fwd_driver_offline.cpp       +108  −114
host/driver_online/CMakeLists.txt                         +0    −22
host/driver_online/conv_fwd_driver_online.cpp             +0    −453
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp    +0  −395
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp   +0  −386
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp   +0  −389
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp    +0  −182
host/host_tensor/CMakeLists.txt                           +2    −0
host/host_tensor/include/conv_common.hpp                  +5    −5
host/host_tensor/include/device.hpp                       +7    −13
host/host_tensor/include/host_conv.hpp                    +15   −17
host/host_tensor/include/host_conv_bwd_data.hpp           +9    −17
host/host_tensor/include/host_tensor.hpp                  +1    −1
host/host_tensor/include/host_tensor_generator.hpp        +1    −1
host/host_tensor/src/device.cpp                           +9    −9
host/online_compile/CMakeLists.txt                        +0    −168
host/driver_offline/include/driver_dynamic_gemm_dlops_v1r2.hpp → host/driver_offline/include/driver_gemm_dlops_v1r2.hpp

-#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R2
-#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R2
+#ifndef DRIVER_GEMM_DLOPS_V1R2
+#define DRIVER_GEMM_DLOPS_V1R2

 #include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_dlops_v1r2.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_dlops_v1r2.hpp"
 template <ck::index_t BlockSize,
           typename FloatAB,
 ...
@@ -43,23 +43,23 @@ template <ck::index_t BlockSize,
           typename CThreadTransferSrcDstAccessOrder,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector,
-          typename AGridIteratorHacks,
-          typename BGridIteratorHacks,
-          typename CGridIteratorHacks,
-          typename AGridMoveSliceWindowIteratorHacks,
-          typename BGridMoveSliceWindowIteratorHacks>
-__host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
-                                              const FloatAB* p_b_grid,
-                                              FloatC* p_c_grid,
-                                              const AKMGridDesc& a_k_m_grid_desc,
-                                              const BKNGridDesc& b_k_n_grid_desc,
-                                              const CMNGridDesc& c_m_n_grid_desc,
-                                              AGridIteratorHacks,
-                                              BGridIteratorHacks,
-                                              CGridIteratorHacks,
-                                              AGridMoveSliceWindowIteratorHacks,
-                                              BGridMoveSliceWindowIteratorHacks,
-                                              ck::index_t nrepeat)
+          typename AGridStepHacks,
+          typename BGridStepHacks,
+          typename CGridStepHacks,
+          typename AGridMoveSliceWindowStepHacks,
+          typename BGridMoveSliceWindowStepHacks>
+__host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
+                                      const FloatAB* p_b_grid,
+                                      FloatC* p_c_grid,
+                                      const AKMGridDesc& a_k_m_grid_desc,
+                                      const BKNGridDesc& b_k_n_grid_desc,
+                                      const CMNGridDesc& c_m_n_grid_desc,
+                                      AGridStepHacks,
+                                      BGridStepHacks,
+                                      CGridStepHacks,
+                                      AGridMoveSliceWindowStepHacks,
+                                      BGridMoveSliceWindowStepHacks,
+                                      ck::index_t nrepeat)
 {
     using namespace ck;
 ...
@@ -72,49 +72,48 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
     constexpr auto I5 = Number<5>{};

     // GEMM
-    using GridwiseGemm = GridwiseDynamicGemmDlops_km_kn_mn_v1r2<
+    using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v1r2<
         BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation,
         AKMGridDesc, BKNGridDesc, CMNGridDesc,
         MPerBlock, NPerBlock, KPerBlock,
         M1PerThread, N1PerThread, KPerThread,
         M1N1ThreadClusterM10, M1N1ThreadClusterN10,
         M1N1ThreadClusterM11, M1N1ThreadClusterN11,
         ABlockTransferThreadSliceLengths_K_M0_M1,
         ABlockTransferThreadClusterLengths_K_M0_M1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
         ABlockTransferSrcVectorDim,
         ABlockTransferSrcScalarPerVector,
         ABlockTransferDstScalarPerVector_M1,
         AThreadTransferSrcResetCoordinateAfterRun,
         BBlockTransferThreadSliceLengths_K_N0_N1,
         BBlockTransferThreadClusterLengths_K_N0_N1,
         BBlockTransferThreadClusterArrangeOrder,
         BBlockTransferSrcAccessOrder,
         BBlockTransferSrcVectorDim,
         BBlockTransferSrcScalarPerVector,
         BBlockTransferDstScalarPerVector_N1,
         BThreadTransferSrcResetCoordinateAfterRun,
         CThreadTransferSrcDstAccessOrder,
         CThreadTransferSrcDstVectorDim,
         CThreadTransferDstScalarPerVector,
-        AGridIteratorHacks, BGridIteratorHacks, CGridIteratorHacks,
-        AGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowIteratorHacks>;
+        AGridStepHacks, BGridStepHacks, CGridStepHacks,
+        AGridMoveSliceWindowStepHacks, BGridMoveSliceWindowStepHacks>;

     const auto M = a_k_m_grid_desc.GetLength(I1);
     const auto N = b_k_n_grid_desc.GetLength(I1);
 ...
@@ -122,8 +121,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
     if(!GridwiseGemm::CheckValidity(a_k_m_grid_desc, b_k_n_grid_desc, c_m_n_grid_desc))
     {
-        throw std::runtime_error(
-            "wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r2 has invalid setting");
+        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r2 has invalid setting");
     }

     const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
 ...
@@ -174,22 +172,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
     if(has_main_k_block_loop && has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r2<
+        const auto kernel = kernel_gemm_dlops_v1r2<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AKM0M1GridDesc>,
             remove_reference_t<BKN0N1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             true, true>;

         ave_time = launch_and_time_kernel(
             kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
 ...

@@ -201,22 +198,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
     else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
    {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r2<
+        const auto kernel = kernel_gemm_dlops_v1r2<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AKM0M1GridDesc>,
             remove_reference_t<BKN0N1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             true, false>;

         ave_time = launch_and_time_kernel(
             kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
 ...

@@ -228,22 +224,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
     else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r2<
+        const auto kernel = kernel_gemm_dlops_v1r2<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AKM0M1GridDesc>,
             remove_reference_t<BKN0N1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             false, true>;

         ave_time = launch_and_time_kernel(
             kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
 ...

@@ -255,22 +250,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
     else
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r2<
+        const auto kernel = kernel_gemm_dlops_v1r2<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AKM0M1GridDesc>,
             remove_reference_t<BKN0N1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             false, false>;

         ave_time = launch_and_time_kernel(
             kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
 ...

@@ -299,15 +293,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
     if(has_main_k_block_loop && has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r2<
+        const auto kernel = kernel_gemm_dlops_v1r2<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AKM0M1GridDesc>,
             remove_reference_t<BKN0N1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             true, true>;

         ave_time = launch_and_time_kernel(kernel,
 ...

@@ -315,27 +309,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
             dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
     }
     else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r2<
+        const auto kernel = kernel_gemm_dlops_v1r2<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AKM0M1GridDesc>,
             remove_reference_t<BKN0N1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             true, false>;

         ave_time = launch_and_time_kernel(kernel,
 ...

@@ -343,27 +338,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
             dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
     }
     else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r2<
+        const auto kernel = kernel_gemm_dlops_v1r2<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AKM0M1GridDesc>,
             remove_reference_t<BKN0N1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             false, true>;

         ave_time = launch_and_time_kernel(kernel,
 ...

@@ -371,27 +367,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
             dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
     }
     else
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r2<
+        const auto kernel = kernel_gemm_dlops_v1r2<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AKM0M1GridDesc>,
             remove_reference_t<BKN0N1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             false, false>;

         ave_time = launch_and_time_kernel(kernel,
 ...

@@ -399,14 +396,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
             dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
     }

     return ave_time;
 ...
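The driver above folds two runtime flags, has_main_k_block_loop and has_double_tail_k_block_loop, into the last two template arguments of the kernel, so each branch of the if/else-if cascade launches a distinct compile-time instantiation whose K-loop structure is resolved statically. A minimal, self-contained sketch of that dispatch pattern (illustrative names, not CK's actual kernel):

#include <cstdio>

// Illustrative stand-in for a gridwise GEMM kernel: the two bools are
// baked in at compile time, so each (main, tail) combination is a
// separate instantiation with its loop structure fixed at compile time.
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
void run_gemm_kernel()
{
    std::printf("instantiation: main=%d double_tail=%d\n",
                HasMainKBlockLoop, HasDoubleTailKBlockLoop);
}

// Runtime flags pick among the four pre-compiled instantiations,
// mirroring the cascade in driver_gemm_dlops_v1r2.
void dispatch(bool has_main_k_block_loop, bool has_double_tail_k_block_loop)
{
    if(has_main_k_block_loop && has_double_tail_k_block_loop)
        run_gemm_kernel<true, true>();
    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
        run_gemm_kernel<true, false>();
    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
        run_gemm_kernel<false, true>();
    else
        run_gemm_kernel<false, false>();
}

int main()
{
    // E.g. K = 96 with KPerBlock = 32 gives a main loop plus a double
    // tail, i.e. the <true, true> instantiation.
    dispatch(true, true);
    return 0;
}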
host/driver_offline/include/driver_dynamic_gemm_dlops_v1r3.hpp → host/driver_offline/include/driver_gemm_dlops_v1r3.hpp

-#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R3
-#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R3
+#ifndef DRIVER_GEMM_DLOPS_V1R3
+#define DRIVER_GEMM_DLOPS_V1R3

 #include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_dlops_v1r3.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_dlops_v1r3.hpp"
 template <ck::index_t BlockSize,
           typename FloatAB,
 ...
@@ -39,23 +39,23 @@ template <ck::index_t BlockSize,
           typename CThreadTransferSrcDstAccessOrder,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector,
-          typename AGridIteratorHacks,
-          typename BGridIteratorHacks,
-          typename CGridIteratorHacks,
-          typename AGridMoveSliceWindowIteratorHacks,
-          typename BGridMoveSliceWindowIteratorHacks>
-__host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
-                                              const FloatAB* p_b_grid,
-                                              FloatC* p_c_grid,
-                                              const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
-                                              const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
-                                              const CMNGridDesc& c_m_n_grid_desc,
-                                              AGridIteratorHacks,
-                                              BGridIteratorHacks,
-                                              CGridIteratorHacks,
-                                              AGridMoveSliceWindowIteratorHacks,
-                                              BGridMoveSliceWindowIteratorHacks,
-                                              ck::index_t nrepeat)
+          typename AGridStepHacks,
+          typename BGridStepHacks,
+          typename CGridStepHacks,
+          typename AGridMoveSliceWindowStepHacks,
+          typename BGridMoveSliceWindowStepHacks>
+__host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
+                                      const FloatAB* p_b_grid,
+                                      FloatC* p_c_grid,
+                                      const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
+                                      const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
+                                      const CMNGridDesc& c_m_n_grid_desc,
+                                      AGridStepHacks,
+                                      BGridStepHacks,
+                                      CGridStepHacks,
+                                      AGridMoveSliceWindowStepHacks,
+                                      BGridMoveSliceWindowStepHacks,
+                                      ck::index_t nrepeat)
 {
     using namespace ck;
 ...
@@ -69,44 +69,44 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
     // GEMM
-    using GridwiseGemm = GridwiseDynamicGemmDlops_km_kn_mn_v1r3<
+    using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v1r3<
         BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation,
         AK0MK1GridDesc, BK0NK1GridDesc, CMNGridDesc,
         MPerBlock, NPerBlock, KPerBlock,
         M1PerThread, N1PerThread, KPerThread,
         M1N1ThreadClusterM1Xs, M1N1ThreadClusterN1Xs,
         ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
         ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
         ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
         ABlockTransferSrcVectorTensorContiguousDimOrder,
         ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
         BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
         BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
         BBlockTransferThreadClusterArrangeOrder,
         BBlockTransferSrcAccessOrder,
         BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
         BBlockTransferSrcVectorTensorContiguousDimOrder,
         BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
         CThreadTransferSrcDstAccessOrder,
         CThreadTransferSrcDstVectorDim,
         CThreadTransferDstScalarPerVector,
-        AGridIteratorHacks, BGridIteratorHacks, CGridIteratorHacks,
-        AGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowIteratorHacks>;
+        AGridStepHacks, BGridStepHacks, CGridStepHacks,
+        AGridMoveSliceWindowStepHacks, BGridMoveSliceWindowStepHacks>;

     const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
     const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
 ...
@@ -114,8 +114,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
     if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
     {
-        throw std::runtime_error(
-            "wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r3 has invalid setting");
+        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r3 has invalid setting");
     }

     const auto a_k0_m0_m1_k1_grid_desc =
 ...
@@ -170,22 +169,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
     if(has_main_k_block_loop && has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r3<
+        const auto kernel = kernel_gemm_dlops_v1r3<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AK0M0M1K1GridDesc>,
             remove_reference_t<BK0N0N1K1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             true, true>;

         ave_time = launch_and_time_kernel(
             kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
 ...

@@ -197,22 +195,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
     else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r3<
+        const auto kernel = kernel_gemm_dlops_v1r3<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AK0M0M1K1GridDesc>,
             remove_reference_t<BK0N0N1K1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             true, false>;

         ave_time = launch_and_time_kernel(
             kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
 ...

@@ -224,22 +221,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
     else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r3<
+        const auto kernel = kernel_gemm_dlops_v1r3<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AK0M0M1K1GridDesc>,
             remove_reference_t<BK0N0N1K1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             false, true>;

         ave_time = launch_and_time_kernel(
             kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
 ...

@@ -251,22 +247,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
     else
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r3<
+        const auto kernel = kernel_gemm_dlops_v1r3<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AK0M0M1K1GridDesc>,
             remove_reference_t<BK0N0N1K1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             false, false>;

         ave_time = launch_and_time_kernel(
             kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
 ...

@@ -295,15 +290,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
     if(has_main_k_block_loop && has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r3<
+        const auto kernel = kernel_gemm_dlops_v1r3<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AK0M0M1K1GridDesc>,
             remove_reference_t<BK0N0N1K1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             true, true>;

         ave_time = launch_and_time_kernel(kernel,
 ...

@@ -311,27 +306,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
             dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
     }
     else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r3<
+        const auto kernel = kernel_gemm_dlops_v1r3<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AK0M0M1K1GridDesc>,
             remove_reference_t<BK0N0N1K1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             true, false>;

         ave_time = launch_and_time_kernel(kernel,
 ...

@@ -339,27 +337,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
             dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
     }
     else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r3<
+        const auto kernel = kernel_gemm_dlops_v1r3<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AK0M0M1K1GridDesc>,
             remove_reference_t<BK0N0N1K1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             false, true>;

         ave_time = launch_and_time_kernel(kernel,
 ...

@@ -367,27 +368,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
             dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
     }
     else
     {
-        const auto kernel = kernel_dynamic_gemm_dlops_v1r3<
+        const auto kernel = kernel_gemm_dlops_v1r3<
             GridwiseGemm, FloatAB, FloatC,
             remove_reference_t<AK0M0M1K1GridDesc>,
             remove_reference_t<BK0N0N1K1GridDesc>,
             remove_reference_t<CM0M10M11N0N10N11GridDesc>,
             remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
             false, false>;

         ave_time = launch_and_time_kernel(kernel,
 ...

@@ -395,14 +399,17 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
             dim3(grid_size), dim3(BlockSize), 0, 0,
             p_a_grid, p_b_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
     }

     return ave_time;
 ...
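Both headers above replace the raw (void CONSTANT*) casts at each launch argument with a named helper, cast_pointer_to_constant_address_space. CK's actual definition is not part of this diff; the sketch below is a plausible minimal shape, assuming CONSTANT reduces to an address-space qualifier (defined empty here so the sketch compiles as ordinary C++):

#include <cstdio>

// Assumption: in CK's HIP build, CONSTANT would expand to an address-space
// qualifier (e.g. __attribute__((address_space(4))) on AMDGPU). Defining it
// empty keeps this sketch portable plain C++.
#define CONSTANT

// A named, greppable cast replacing the C-style "(void CONSTANT*)"
// repeated at every kernel-launch argument. Hypothetical signature;
// CK's real helper lives outside this diff.
template <typename T>
const void CONSTANT* cast_pointer_to_constant_address_space(const T* p)
{
    return reinterpret_cast<const void CONSTANT*>(p);
}

int main()
{
    int desc[4] = {1, 2, 3, 4}; // stand-in for a device-side descriptor buffer
    const void* p = cast_pointer_to_constant_address_space(desc);
    std::printf("%p\n", p);
    return 0;
}

The payoff is readability rather than behavior: the intent of the cast is stated once, at a single definition, instead of being re-spelled as a C-style cast at every call site.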
host/driver_offline/include/driver_dynamic_gemm_xdlops_v2r3.hpp → host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp

-#ifndef DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3
-#define DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3
+#ifndef DRIVER_GEMM_XDLOPS_V2R3
+#define DRIVER_GEMM_XDLOPS_V2R3

 #include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_xdlops_v2r3.hpp"
 template <ck::index_t BlockSize,
           typename FloatAB,
 ...
@@ -41,24 +41,24 @@ template <ck::index_t BlockSize,
           typename CThreadTransferSrcDstAccessOrder,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector,
-          typename AGridIteratorHacks,
-          typename BGridIteratorHacks,
-          typename CGridIteratorHacks,
-          typename AGridMoveSliceWindowIteratorHacks,
-          typename BGridMoveSliceWindowIteratorHacks,
+          typename AGridStepHacks,
+          typename BGridStepHacks,
+          typename CGridStepHacks,
+          typename AGridMoveSliceWindowStepHacks,
+          typename BGridMoveSliceWindowStepHacks,
           bool CAccessOrderMRepeatNRepeat>
-__host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
-                                               const FloatAB* p_b_grid,
-                                               FloatC* p_c_grid,
-                                               const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
-                                               const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
-                                               const CMNGridDesc& c_m_n_grid_desc,
-                                               AGridIteratorHacks,
-                                               BGridIteratorHacks,
-                                               CGridIteratorHacks,
-                                               AGridMoveSliceWindowIteratorHacks,
-                                               BGridMoveSliceWindowIteratorHacks,
-                                               ck::index_t nrepeat)
+__host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
+                                       const FloatAB* p_b_grid,
+                                       FloatC* p_c_grid,
+                                       const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
+                                       const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
+                                       const CMNGridDesc& c_m_n_grid_desc,
+                                       AGridStepHacks,
+                                       BGridStepHacks,
+                                       CGridStepHacks,
+                                       AGridMoveSliceWindowStepHacks,
+                                       BGridMoveSliceWindowStepHacks,
+                                       ck::index_t nrepeat)
 {
     using namespace ck;
 ...
@@ -66,52 +66,49 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
     constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};

-    using GridwiseGemm = GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
+    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
         BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation,
         AK0MK1GridDesc, BK0NK1GridDesc, CMNGridDesc,
         MPerBlock, NPerBlock, KPerBlock,
         MPerWave, NPerWave, K1, MRepeat, NRepeat,
         ABlockTransferThreadSliceLengths_K0_M_K1,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
         ABlockTransferSrcVectorDim,
         ABlockTransferSrcScalarPerVector,
         ABlockTransferDstScalarPerVector_K1,
         AThreadTransferSrcResetCoordinateAfterRun,
         BBlockTransferThreadSliceLengths_K0_N_K1,
         BBlockTransferThreadClusterLengths_K0_N_K1,
         BBlockTransferThreadClusterArrangeOrder,
         BBlockTransferSrcAccessOrder,
         BBlockTransferSrcVectorDim,
         BBlockTransferSrcScalarPerVector,
         BBlockTransferDstScalarPerVector_K1,
         BThreadTransferSrcResetCoordinateAfterRun,
         CThreadTransferSrcDstAccessOrder,
         CThreadTransferSrcDstVectorDim,
         CThreadTransferDstScalarPerVector,
-        AGridIteratorHacks, BGridIteratorHacks, CGridIteratorHacks,
-        AGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowIteratorHacks,
+        AGridStepHacks, BGridStepHacks, CGridStepHacks,
+        AGridMoveSliceWindowStepHacks, BGridMoveSliceWindowStepHacks,
         CAccessOrderMRepeatNRepeat>;

     {
         std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", "
 ...
@@ -129,7 +126,7 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
     if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
     {
         throw std::runtime_error(
-            "wrong! GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
+            "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
     }

     const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
 ...
@@ -142,13 +139,13 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
     const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc);

-    const auto kernel = kernel_dynamic_gemm_xdlops_v2r3<
+    const auto kernel = kernel_gemm_xdlops_v2r3<
         GridwiseGemm, FloatAB, FloatC,
         remove_reference_t<AK0MK1GridDesc>,
         remove_reference_t<BK0NK1GridDesc>,
         remove_reference_t<CM0M1M2NGridDesc>,
         remove_reference_t<CBlockClusterAdaptor>>;

 #if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
     float ave_time = launch_and_time_kernel(kernel,
 ...

@@ -156,7 +153,6 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
                                             dim3(grid_size),
                                             dim3(BlockSize),
                                             0,
-                                            0,
                                             p_a_grid,
                                             p_b_grid,
                                             p_c_grid,
 ...

@@ -176,20 +172,19 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
     c_m0_m1_m2_n_grid_desc_dev_buf.ToDevice(&c_m0_m1_m2_n_grid_desc);
     c_block_cluster_adaptor_dev_buf.ToDevice(&c_block_cluster_adaptor);

-    float ave_time = launch_and_time_kernel(
-        kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0, 0,
-        p_a_grid, p_b_grid, p_c_grid,
-        (void CONSTANT*)a_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-        (void CONSTANT*)b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-        (void CONSTANT*)c_m0_m1_m2_n_grid_desc_dev_buf.GetDeviceBuffer(),
-        (void CONSTANT*)c_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+    float ave_time = launch_and_time_kernel(
+        kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0,
+        p_a_grid, p_b_grid, p_c_grid,
+        cast_pointer_to_constant_address_space(a_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(c_m0_m1_m2_n_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(c_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
 #endif

     return ave_time;
 }
 ...
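Every driver above returns the average reported by launch_and_time_kernel, whose definition lies outside this diff. On HIP such a helper is conventionally built from hipEvent timestamps around nrepeat launches; the sketch below shows that conventional shape under stated assumptions, not CK's implementation (error checking elided):

#include <cstddef>
#include <cstdio>
#include <hip/hip_runtime.h>

// Trivial placeholder kernel so the sketch is complete and compilable.
__global__ void noop_kernel(int) {}

// Assumed shape of an event-timed launcher: run `kernel` nrepeat times
// and return the average elapsed milliseconds.
template <typename Kernel, typename... Args>
float launch_and_time_kernel(Kernel kernel, int nrepeat,
                             dim3 grid, dim3 block,
                             std::size_t lds_bytes, Args... args)
{
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    hipEventRecord(start, nullptr);
    for(int i = 0; i < nrepeat; ++i)
    {
        // Default (null) stream; lds_bytes is the dynamic LDS allocation.
        hipLaunchKernelGGL(kernel, grid, block, lds_bytes, nullptr, args...);
    }
    hipEventRecord(stop, nullptr);
    hipEventSynchronize(stop);

    float total_ms = 0.f;
    hipEventElapsedTime(&total_ms, start, stop);

    hipEventDestroy(start);
    hipEventDestroy(stop);
    return total_ms / nrepeat;
}

int main()
{
    float ave = launch_and_time_kernel(noop_kernel, 10, dim3(1), dim3(64), 0, 0);
    std::printf("average time: %f ms\n", ave);
    return 0;
}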
host/driver_offline/conv_bwd_driver_offline.cpp → host/driver_offline/src/conv_bwd_driver_offline.cpp

@@ -12,10 +12,10 @@
 #include "conv_common.hpp"
 #include "host_conv_bwd_data.hpp"
 #include "device_tensor.hpp"
-#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
-#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
+#include "device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
+#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"

-#define USE_DYNAMIC_MODE 1
+#define USE_MODE 1
 #define USE_CONV_BWD_V4R1_XDL_NHWC 1
 #define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
 ...
@@ -37,7 +37,7 @@ int main(int argc, char* argv[])
     constexpr auto I5 = Number<5>{};
     constexpr auto I6 = Number<6>{};

-#if USE_DYNAMIC_MODE
+#if USE_MODE
     // dynamic mode
     if(argc != 22)
     {
 ...
@@ -46,29 +46,29 @@ int main(int argc, char* argv[])
         exit(1);
     }

-    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(atoi(argv[1]));
-    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(atoi(argv[2]));
-    const bool do_verification = atoi(argv[3]);
-    const int init_method       = atoi(argv[4]);
-    const bool do_log           = atoi(argv[5]);
-    const int nrepeat           = atoi(argv[6]);
-    const index_t N  = atoi(argv[7]);
-    const index_t K  = atoi(argv[8]);
-    const index_t C  = atoi(argv[9]);
-    const index_t Y  = atoi(argv[10]);
-    const index_t X  = atoi(argv[11]);
-    const index_t Hi = atoi(argv[12]);
-    const index_t Wi = atoi(argv[13]);
-    const index_t conv_stride_h   = atoi(argv[14]);
-    const index_t conv_stride_w   = atoi(argv[15]);
-    const index_t conv_dilation_h = atoi(argv[16]);
-    const index_t conv_dilation_w = atoi(argv[17]);
-    const index_t in_left_pad_h   = atoi(argv[18]);
-    const index_t in_left_pad_w   = atoi(argv[19]);
-    const index_t in_right_pad_h  = atoi(argv[20]);
-    const index_t in_right_pad_w  = atoi(argv[21]);
+    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
+    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
+    const bool do_verification = std::stoi(argv[3]);
+    const int init_method       = std::stoi(argv[4]);
+    const bool do_log           = std::stoi(argv[5]);
+    const int nrepeat           = std::stoi(argv[6]);
+    const index_t N  = std::stoi(argv[7]);
+    const index_t K  = std::stoi(argv[8]);
+    const index_t C  = std::stoi(argv[9]);
+    const index_t Y  = std::stoi(argv[10]);
+    const index_t X  = std::stoi(argv[11]);
+    const index_t Hi = std::stoi(argv[12]);
+    const index_t Wi = std::stoi(argv[13]);
+    const index_t conv_stride_h   = std::stoi(argv[14]);
+    const index_t conv_stride_w   = std::stoi(argv[15]);
+    const index_t conv_dilation_h = std::stoi(argv[16]);
+    const index_t conv_dilation_w = std::stoi(argv[17]);
+    const index_t in_left_pad_h   = std::stoi(argv[18]);
+    const index_t in_left_pad_w   = std::stoi(argv[19]);
+    const index_t in_right_pad_h  = std::stoi(argv[20]);
+    const index_t in_right_pad_w  = std::stoi(argv[21]);

     const index_t YEff = (Y - 1) * conv_dilation_h + 1;
     const index_t XEff = (X - 1) * conv_dilation_w + 1;
 ...
@@ -83,12 +83,12 @@ int main(int argc, char* argv[])
         exit(1);
     }

-    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(atoi(argv[1]));
-    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(atoi(argv[2]));
-    const bool do_verification = atoi(argv[3]);
-    const int init_method       = atoi(argv[4]);
-    const bool do_log           = atoi(argv[5]);
-    const int nrepeat           = atoi(argv[6]);
+    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
+    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
+    const bool do_verification = std::stoi(argv[3]);
+    const int init_method       = std::stoi(argv[4]);
+    const bool do_log           = std::stoi(argv[5]);
+    const int nrepeat           = std::stoi(argv[6]);

     constexpr index_t N = 128;
     constexpr index_t C = 192;
 ...
@@ -115,23 +115,19 @@ int main(int argc, char* argv[])
 #endif

 #if 0
     constexpr index_t in_vector_size = 1;
     using in_data_t = float;
     using acc_data_t = float;
     using out_data_t = float;
 #elif 1
-    constexpr index_t in_vector_size = 1;
-    using in_data_t   = half_t;
-    using acc_data_t  = float;
-    using out_data_t  = half_t;
+    using in_data_t  = half_t;
+    using acc_data_t = float;
+    using out_data_t = half_t;
 #endif

     std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);

-    switch(layout)
+    if(layout == ConvTensorLayout::NCHW)
     {
-    case ConvTensorLayout::NCHW:
-        // NCHW
         in_lengths_host[0] = static_cast<std::size_t>(N);
         in_lengths_host[1] = static_cast<std::size_t>(C);
         in_lengths_host[2] = static_cast<std::size_t>(Hi);
 ...

@@ -144,9 +140,9 @@ int main(int argc, char* argv[])
         out_lengths_host[1] = static_cast<std::size_t>(K);
         out_lengths_host[2] = static_cast<std::size_t>(Ho);
         out_lengths_host[3] = static_cast<std::size_t>(Wo);
-        break;
-    case ConvTensorLayout::NHWC:
-        // NHWC
+    }
+    else if(layout == ConvTensorLayout::NHWC)
+    {
         in_lengths_host[0] = static_cast<std::size_t>(N);
         in_lengths_host[1] = static_cast<std::size_t>(Hi);
         in_lengths_host[2] = static_cast<std::size_t>(Wi);
 ...

@@ -159,8 +155,10 @@ int main(int argc, char* argv[])
         out_lengths_host[1] = static_cast<std::size_t>(Ho);
         out_lengths_host[2] = static_cast<std::size_t>(Wo);
         out_lengths_host[3] = static_cast<std::size_t>(K);
-        break;
-    default: throw std::runtime_error("wrong! not implemented");
+    }
+    else
+    {
+        throw std::runtime_error("wrong! not implemented");
     }

     Tensor<in_data_t> in_host(in_lengths_host);
 ...
@@ -213,40 +211,8 @@ int main(int argc, char* argv[])
         wei.GenerateTensorValue(gen_wei, num_thread);
     }

-    auto f_make_for_device_nchw = [&]() {
-#if USE_DYNAMIC_MODE
-        const auto in_lengths_dev     = make_tuple(N, C, Hi, Wi);
-        const auto wei_lengths_dev    = make_tuple(K, C, Y, X);
-        const auto out_lengths_dev    = make_tuple(N, K, Ho, Wo);
-        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
-        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
-        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
-        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-#else
-        const auto in_lengths_dev     = make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
-        const auto wei_lengths_dev    = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
-        const auto out_lengths_dev    = make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
-        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
-        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
-        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
-        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
-#endif
-
-        return make_tuple(in_lengths_dev,
-                          wei_lengths_dev,
-                          out_lengths_dev,
-                          conv_strides_dev,
-                          conv_dilations_dev,
-                          in_left_pads_dev,
-                          in_right_pads_dev);
-    };
-
     auto f_make_for_device_nhwc = [&]() {
-#if USE_DYNAMIC_MODE
+#if USE_MODE
         const auto in_lengths_dev  = make_tuple(N, Hi, Wi, C);
         const auto wei_lengths_dev = make_tuple(K, Y, X, C);
         const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
 ...

@@ -277,8 +243,6 @@ int main(int argc, char* argv[])
                           in_right_pads_dev);
     };

-    const auto nhwc_desc = f_make_for_device_nhwc();
-
 #if USE_CONV_BWD_V4R1_XDL_NHWC
     if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC)
     {
 ...

@@ -289,20 +253,20 @@ int main(int argc, char* argv[])
         const auto tmp = f_make_for_device_nhwc();

-        device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<
-            in_data_t, acc_data_t, out_data_t>(
-            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
-            in_device, wei, out, nrepeat);
+        device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<
+            in_data_t, acc_data_t, out_data_t>(
+            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
+            in_device, wei, out, nrepeat);
     }
 #endif
 ...

@@ -316,20 +280,20 @@ int main(int argc, char* argv[])
         const auto tmp = f_make_for_device_nhwc();

-        device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<
-            in_data_t, acc_data_t, out_data_t>(
-            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
-            in_device, wei, out, nrepeat);
+        device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<
+            in_data_t, acc_data_t, out_data_t>(
+            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
+            in_device, wei, out, nrepeat);
     }
 #endif
 ...
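Both offline drivers switch argument parsing from atoi to std::stoi. The textual change is mechanical, but the behavior differs on malformed input: atoi silently returns 0, while std::stoi throws, so a bad command-line flag now fails loudly instead of quietly zeroing a dimension. A standalone illustration (not from the diff):

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

int main()
{
    const char* bad = "not-a-number";

    // atoi: silently yields 0, indistinguishable from a genuine "0" input.
    std::cout << "atoi -> " << std::atoi(bad) << '\n';

    // std::stoi: throws std::invalid_argument, surfacing the bad argument
    // instead of running the driver with a zeroed dimension.
    try
    {
        std::cout << "stoi -> " << std::stoi(bad) << '\n';
    }
    catch(const std::invalid_argument& e)
    {
        std::cout << "stoi threw: " << e.what() << '\n';
    }
    return 0;
}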
host/driver_offline/conv_fwd_driver_offline.cpp → host/driver_offline/src/conv_fwd_driver_offline.cpp

@@ -12,17 +12,17 @@
 #include "conv_common.hpp"
 #include "host_conv.hpp"
 #include "device_tensor.hpp"
-#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
-#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
-#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
-#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
-#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
-#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
+#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
+#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"

-#define USE_DYNAMIC_MODE 1
+#define USE_MODE 1
 #define USE_CONV_FWD_V4R4_NCHW 1
 #define USE_CONV_FWD_V4R4R2_NHWC 1
-#define USE_CONV_FWD_V6R1_NCHW 1
+#define USE_CONV_FWD_V6R1_NCHW 0
 #define USE_CONV_FWD_V5R1_NCHW 0
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
 ...
@@ -49,7 +49,7 @@ int main(int argc, char* argv[])
     constexpr auto I5 = Number<5>{};
     constexpr auto I6 = Number<6>{};

-#if USE_DYNAMIC_MODE
+#if USE_MODE
     // dynamic mode
     if(argc != 22)
     {
 ...
@@ -58,29 +58,29 @@ int main(int argc, char* argv[])
         exit(1);
     }

-    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
-    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
-    const bool do_verification = atoi(argv[3]);
-    const int init_method       = atoi(argv[4]);
-    const bool do_log           = atoi(argv[5]);
-    const int nrepeat           = atoi(argv[6]);
-    const index_t N  = atoi(argv[7]);
-    const index_t K  = atoi(argv[8]);
-    const index_t C  = atoi(argv[9]);
-    const index_t Y  = atoi(argv[10]);
-    const index_t X  = atoi(argv[11]);
-    const index_t Hi = atoi(argv[12]);
-    const index_t Wi = atoi(argv[13]);
-    const index_t conv_stride_h   = atoi(argv[14]);
-    const index_t conv_stride_w   = atoi(argv[15]);
-    const index_t conv_dilation_h = atoi(argv[16]);
-    const index_t conv_dilation_w = atoi(argv[17]);
-    const index_t in_left_pad_h   = atoi(argv[18]);
-    const index_t in_left_pad_w   = atoi(argv[19]);
-    const index_t in_right_pad_h  = atoi(argv[20]);
-    const index_t in_right_pad_w  = atoi(argv[21]);
+    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
+    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
+    const bool do_verification = std::stoi(argv[3]);
+    const int init_method       = std::stoi(argv[4]);
+    const bool do_log           = std::stoi(argv[5]);
+    const int nrepeat           = std::stoi(argv[6]);
+    const index_t N  = std::stoi(argv[7]);
+    const index_t K  = std::stoi(argv[8]);
+    const index_t C  = std::stoi(argv[9]);
+    const index_t Y  = std::stoi(argv[10]);
+    const index_t X  = std::stoi(argv[11]);
+    const index_t Hi = std::stoi(argv[12]);
+    const index_t Wi = std::stoi(argv[13]);
+    const index_t conv_stride_h   = std::stoi(argv[14]);
+    const index_t conv_stride_w   = std::stoi(argv[15]);
+    const index_t conv_dilation_h = std::stoi(argv[16]);
+    const index_t conv_dilation_w = std::stoi(argv[17]);
+    const index_t in_left_pad_h   = std::stoi(argv[18]);
+    const index_t in_left_pad_w   = std::stoi(argv[19]);
+    const index_t in_right_pad_h  = std::stoi(argv[20]);
+    const index_t in_right_pad_w  = std::stoi(argv[21]);

     const index_t YEff = (Y - 1) * conv_dilation_h + 1;
     const index_t XEff = (X - 1) * conv_dilation_w + 1;
 ...

@@ -95,12 +95,12 @@ int main(int argc, char* argv[])
         exit(1);
     }

-    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
-    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
-    const bool do_verification = atoi(argv[3]);
-    const int init_method       = atoi(argv[4]);
-    const bool do_log           = atoi(argv[5]);
-    const int nrepeat           = atoi(argv[6]);
+    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
+    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
+    const bool do_verification = std::stoi(argv[3]);
+    const int init_method       = std::stoi(argv[4]);
+    const bool do_log           = std::stoi(argv[5]);
+    const int nrepeat           = std::stoi(argv[6]);

     constexpr index_t N = 128;
     constexpr index_t C = 192;
 ...
@@ -142,10 +142,8 @@ int main(int argc, char* argv[])
     std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);

-    switch(layout)
+    if(layout == ConvTensorLayout::NCHW)
     {
-    case ConvTensorLayout::NCHW:
-        // NCHW
         in_lengths_host[0] = static_cast<std::size_t>(N);
         in_lengths_host[1] = static_cast<std::size_t>(C);
         in_lengths_host[2] = static_cast<std::size_t>(Hi);
 ...

@@ -158,9 +156,9 @@ int main(int argc, char* argv[])
         out_lengths_host[1] = static_cast<std::size_t>(K);
         out_lengths_host[2] = static_cast<std::size_t>(Ho);
         out_lengths_host[3] = static_cast<std::size_t>(Wo);
-        break;
-    case ConvTensorLayout::NHWC:
-        // NHWC
+    }
+    else if(layout == ConvTensorLayout::NHWC)
+    {
         in_lengths_host[0] = static_cast<std::size_t>(N);
         in_lengths_host[1] = static_cast<std::size_t>(Hi);
         in_lengths_host[2] = static_cast<std::size_t>(Wi);
 ...

@@ -173,8 +171,10 @@ int main(int argc, char* argv[])
         out_lengths_host[1] = static_cast<std::size_t>(Ho);
         out_lengths_host[2] = static_cast<std::size_t>(Wo);
         out_lengths_host[3] = static_cast<std::size_t>(K);
-        break;
-    default: throw std::runtime_error("wrong! not implemented");
+    }
+    else
+    {
+        std::runtime_error("wrong! not implemented");
     }

     Tensor<in_data_t> in(in_lengths_host);
 ...
@@ -228,7 +228,7 @@ int main(int argc, char* argv[])
     }

     auto f_make_for_device_nchw = [&]() {
-#if USE_DYNAMIC_MODE
+#if USE_MODE
         const auto in_lengths_dev  = make_tuple(N, C, Hi, Wi);
         const auto wei_lengths_dev = make_tuple(K, C, Y, X);
         const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
 ...

@@ -260,7 +260,7 @@ int main(int argc, char* argv[])
     };

     auto f_make_for_device_nhwc = [&]() {
-#if USE_DYNAMIC_MODE
+#if USE_MODE
         const auto in_lengths_dev  = make_tuple(N, Hi, Wi, C);
         const auto wei_lengths_dev = make_tuple(K, Y, X, C);
         const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
 ...
@@ -301,20 +301,19 @@ int main(int argc, char* argv[])
        const auto tmp = f_make_for_device_nchw();

-       device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t,
-                                                                                  acc_data_t,
-                                                                                  out_data_t>(
-           tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
+       device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t,
+                                                                          acc_data_t,
+                                                                          out_data_t>(
+           tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
}
#endif
...
...
@@ -328,20 +327,19 @@ int main(int argc, char* argv[])
        const auto tmp = f_make_for_device_nhwc();

-       device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t,
-                                                                                    acc_data_t,
-                                                                                    out_data_t>(
-           tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
+       device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                            acc_data_t,
+                                                                            out_data_t>(
+           tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
}
#endif
...
...
@@ -355,20 +353,19 @@ int main(int argc, char* argv[])
        const auto tmp = f_make_for_device_nchw();

-       device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t,
-                                                                                  acc_data_t,
-                                                                                  out_data_t>(
-           tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
+       device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t,
+                                                                          acc_data_t,
+                                                                          out_data_t>(
+           tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
}
#endif
...
...
@@ -382,21 +379,20 @@ int main(int argc, char* argv[])
        const auto tmp = f_make_for_device_nchw();

-       device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t,
-                                                                                  16,
-                                                                                  acc_data_t,
-                                                                                  out_data_t>(
-           tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
+       device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t,
+                                                                          16,
+                                                                          acc_data_t,
+                                                                          out_data_t>(
+           tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
}
#endif
...
...
@@ -410,9 +406,9 @@ int main(int argc, char* argv[])
        const auto tmp = f_make_for_device_nchw();

-       device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
-                                                                                     acc_data_t,
-                                                                                     out_data_t>(
+       device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
+                                                                             acc_data_t,
+                                                                             out_data_t>(
            tmp[I0], tmp[I1], tmp[I2],
...
...
@@ -437,9 +433,9 @@ int main(int argc, char* argv[])
        const auto tmp = f_make_for_device_nhwc();

-       device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
-                                                                                     acc_data_t,
-                                                                                     out_data_t>(
+       device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                             acc_data_t,
+                                                                             out_data_t>(
            tmp[I0], tmp[I1], tmp[I2],
...
...
@@ -467,7 +463,6 @@ int main(int argc, char* argv[])
        check_error(out_host, out_device);

#if 0
        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
...
...
@@ -475,6 +470,5 @@ int main(int argc, char* argv[])
            LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
        }
#endif
    }
}
host/driver_online/CMakeLists.txt deleted 100644 → 0
include_directories(BEFORE
    include
    ${PROJECT_BINARY_DIR}/host/online_compile/include
    ${PROJECT_SOURCE_DIR}/host/online_compile/include
    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
    ${PROJECT_SOURCE_DIR}/host/solver/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
    ${PROJECT_SOURCE_DIR}/external/rocm/include
    ${PROJECT_SOURCE_DIR}/external/half/include
)

set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp)

add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE})

target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_online PRIVATE online_compile)
host/driver_online/conv_fwd_driver_online.cpp deleted 100644 → 0
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1
enum ConvForwardAlgo
{
    V4R4NCHW,    // 0
    V6R1NCHW,    // 1
    V4R4XDLNCHW, // 2
    V4R4XDLNHWC  // 3
};
int main(int argc, char* argv[])
{
    using namespace ck;
    using namespace ck_driver;
    using size_t = std::size_t;

    hipStream_t stream;
    online_compile::Handle* handle;

    MY_HIP_CHECK(hipStreamCreate(&stream));

    handle = new online_compile::Handle(stream);

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};

    if(argc != 22)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
        exit(1);
    }
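For orientation, a hypothetical invocation matching this argument order (all values illustrative, assuming the NCHW layout and the V4R4NCHW algorithm both map to enumerator 0) could look like: ./conv_fwd_driver_online 0 0 1 4 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1, i.e. layout, algo, do_verification, init_method, do_log, nrepeat, then N, K, C, Y, X, Hi, Wi, followed by the strides, dilations, and the left and right paddings.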
    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
    const bool do_verification    = atoi(argv[3]);
    const int init_method         = atoi(argv[4]);
    const bool do_log             = atoi(argv[5]);
    const int nrepeat             = atoi(argv[6]);

    const index_t N  = atoi(argv[7]);
    const index_t K  = atoi(argv[8]);
    const index_t C  = atoi(argv[9]);
    const index_t Y  = atoi(argv[10]);
    const index_t X  = atoi(argv[11]);
    const index_t Hi = atoi(argv[12]);
    const index_t Wi = atoi(argv[13]);

    const index_t conv_stride_h   = atoi(argv[14]);
    const index_t conv_stride_w   = atoi(argv[15]);
    const index_t conv_dilation_h = atoi(argv[16]);
    const index_t conv_dilation_w = atoi(argv[17]);
    const index_t in_left_pad_h   = atoi(argv[18]);
    const index_t in_left_pad_w   = atoi(argv[19]);
    const index_t in_right_pad_h  = atoi(argv[20]);
    const index_t in_right_pad_w  = atoi(argv[21]);

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
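These are the standard convolution shape relations: a filter of Y taps with dilation Dy spans YEff = (Y - 1) * Dy + 1 input rows. As a quick worked check with illustrative numbers, Hi = 71, Y = 3, Dy = 1, Sy = 2, and one pixel of padding on each side give YEff = (3 - 1) * 1 + 1 = 3 and Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36.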
#if 1
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 0
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif
    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        // NCHW
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(C);
        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(C);
        wei_lengths_host[2] = static_cast<std::size_t>(Y);
        wei_lengths_host[3] = static_cast<std::size_t>(X);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(K);
        out_lengths_host[2] = static_cast<std::size_t>(Ho);
        out_lengths_host[3] = static_cast<std::size_t>(Wo);
        break;
    case ConvTensorLayout::NHWC:
        // NHWC
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
        in_lengths_host[3]  = static_cast<std::size_t>(C);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(Y);
        wei_lengths_host[2] = static_cast<std::size_t>(X);
        wei_lengths_host[3] = static_cast<std::size_t>(C);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(Ho);
        out_lengths_host[2] = static_cast<std::size_t>(Wo);
        out_lengths_host[3] = static_cast<std::size_t>(K);
        break;
    default: throw std::runtime_error("wrong! not implemented");
    }
    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<out_data_t> out_host(out_lengths_host);
    Tensor<out_data_t> out_device(out_lengths_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
    std::size_t num_thread = std::thread::hardware_concurrency();

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }
    auto f_make_for_device_nchw = [&]() {
        const auto in_lengths_dev  = make_tuple(N, C, Hi, Wi);
        const auto wei_lengths_dev = make_tuple(K, C, Y, X);
        const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);

        return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
    };

    auto f_make_for_device_nhwc = [&]() {
        const auto in_lengths_dev  = make_tuple(N, Hi, Wi, C);
        const auto wei_lengths_dev = make_tuple(K, Y, X, C);
        const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);

        return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
    };
    const auto conv_strides   = make_tuple(conv_stride_h, conv_stride_w);
    const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w);
    const auto in_left_pads   = make_tuple(in_left_pad_h, in_left_pad_w);
    const auto in_right_pads  = make_tuple(in_right_pad_h, in_right_pad_w);
#if USE_CONV_FWD_V4R4_NCHW
    if(algo == ConvForwardAlgo::V4R4NCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable =
            &default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw;

        online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<
            in_data_t,
            acc_data_t,
            out_data_t>(handle, tmp[I0], tmp[I1], tmp[I2], conv_strides, conv_dilations,
                        in_left_pads, in_right_pads, in, wei, out_device, tunable, nrepeat);
    }
#endif
#if USE_CONV_FWD_V6R1_NCHW
    if(algo == ConvForwardAlgo::V6R1NCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

#if 1
        const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
            get_datatype_enum_from_type<in_data_t>::value,
            get_datatype_enum_from_type<acc_data_t>::value,
            get_datatype_enum_from_type<out_data_t>::value,
            256, 4, 1, 128, 32, 8, 4, 4, 1,
            {8, 2},
            {8, 2},
            {4, 1, 1, 1, 1},
            {2, 1, 1, 128, 1},
            {4, 1, 1, 1, 1},
            {1, 1, 1, 1, 1},
            {1, 4, 1, 1, 1},
            {8, 1, 1, 32, 1},
            {1, 1, 1, 1, 1},
            {1, 1, 1, 1, 1},
            4, true, true};
#elif 0
        const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
            get_datatype_enum_from_type<in_data_t>::value,
            get_datatype_enum_from_type<acc_data_t>::value,
            get_datatype_enum_from_type<out_data_t>::value,
            256, 4, 2, 128, 32, 8, 4, 4, 1,
            {8, 2},
            {8, 2},
            {4, 1, 1, 1, 2},
            {2, 1, 1, 128, 1},
            {4, 1, 1, 1, 1},
            {1, 1, 1, 1, 1},
            {1, 4, 1, 1, 2},
            {8, 1, 1, 32, 1},
            {1, 1, 1, 1, 1},
            {1, 1, 1, 1, 1},
            4, true, true};
#elif 1
        const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
            get_datatype_enum_from_type<in_data_t>::value,
            get_datatype_enum_from_type<acc_data_t>::value,
            get_datatype_enum_from_type<out_data_t>::value,
            256, 4, 4, 128, 32, 8, 4, 4, 1,
            {8, 2},
            {8, 2},
            {4, 1, 1, 1, 4},
            {2, 1, 1, 128, 1},
            {4, 1, 1, 1, 1},
            {1, 1, 1, 1, 1},
            {1, 4, 1, 1, 4},
            {8, 1, 1, 32, 1},
            {1, 1, 1, 1, 1},
            {1, 1, 1, 1, 1},
            4, true, true};
#endif

        online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<
            in_data_t,
            acc_data_t,
            out_data_t>(handle, tmp[I0], tmp[I1], tmp[I2], conv_strides, conv_dilations,
                        in_left_pads, in_right_pads, in, wei, out_device, compile_param, nrepeat);
    }
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NCHW
    if(algo == ConvForwardAlgo::V4R4XDLNCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable =
            &default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;

        online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw<
            in_data_t,
            acc_data_t,
            out_data_t>(handle, tmp[I0], tmp[I1], tmp[I2], conv_strides, conv_dilations,
                        in_left_pads, in_right_pads, in, wei, out_device, tunable, nrepeat);
    }
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NHWC
    if(algo == ConvForwardAlgo::V4R4XDLNHWC)
    {
        if(layout != ConvTensorLayout::NHWC)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nhwc();

        tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable =
            &default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;

        online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk<
            in_data_t,
            acc_data_t,
            out_data_t>(handle, tmp[I0], tmp[I1], tmp[I2], conv_strides, conv_dilations,
                        in_left_pads, in_right_pads, in, wei, out_device, tunable, nrepeat);
    }
#endif
    if(do_verification)
    {
        host_direct_convolution(in,
                                wei,
                                out_host,
                                make_tuple(conv_stride_h, conv_stride_w),
                                make_tuple(conv_dilation_h, conv_dilation_w),
                                make_tuple(in_left_pad_h, in_left_pad_w),
                                make_tuple(in_right_pad_h, in_right_pad_w),
                                layout);

        check_error(out_host, out_device);
#if 0
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
#endif
}
    delete handle;
    MY_HIP_CHECK(hipStreamDestroy(stream));
}
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp deleted 100644 → 0
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
    using namespace ck;

    std::string out;

    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
           std::to_string(get_datatype_enum_from_type<TOut>::value);

    return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
{
    std::string out("TUN_");

    out += std::to_string(pt->BlockSize) + "_";

    out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
           std::to_string(pt->KPerBlock) + "_";

    out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
           std::to_string(pt->KPerThread) + "_";

    out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
           std::to_string(pt->M1N1ThreadClusterN10) + "x" +
           std::to_string(pt->M1N1ThreadClusterM11) + "x" +
           std::to_string(pt->M1N1ThreadClusterN11) + "_";

    out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_";

    out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_";

    out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";

    out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";

    out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
    out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
    out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_";
    out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";

    out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_";

    out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_";

    out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";

    out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";

    out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
    out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
    out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_";
    out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";

    out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";

    out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
    out += std::to_string(pt->CThreadTransferDstScalarPerVector);

    return (out);
};
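The returned identifier thus has the fixed shape TUN_<BlockSize>_<MPerBlock>x<NPerBlock>x<KPerBlock>_..., one underscore-separated field per tunable; it is used below only as part of the network_config lookup key handed to Handle::AddKernel, never parsed back.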
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
    using namespace ck;

    std::string out;

    out += " -DCK_PARAM_ABDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TInWei>::value) +
           " -DCK_PARAM_AccDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TAcc>::value) +
           " -DCK_PARAM_CDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TOut>::value);

    return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
{
    std::string out;

    out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);

    out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
           " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
           " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);

    out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
           " -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
           " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);

    out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
           " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
           " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
           " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);

    out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]);

    out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]);

    out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);

    out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[2]);

    out += " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
    out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
           std::to_string(pt->ABlockTransferSrcScalarPerVector);
    out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" +
           std::to_string(pt->ABlockTransferDstScalarPerVector_M1);
    out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
           std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);

    out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]);

    out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]);

    out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);

    out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[2]);

    out += " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
    out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
           std::to_string(pt->BBlockTransferSrcScalarPerVector);
    out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" +
           std::to_string(pt->BBlockTransferDstScalarPerVector_N1);
    out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
           std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);

    out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);

    out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
           std::to_string(pt->CThreadTransferSrcDstVectorDim);
    out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
           std::to_string(pt->CThreadTransferDstScalarPerVector);

    return (out);
};
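Taken together these fragments form an ordinary compiler-definition suffix; with hypothetical tunable values the front of the string would read something like -DCK_PARAM_BlockSize=256 -DCK_PARAM_MPerBlock=128 -DCK_PARAM_NPerBlock=128 -DCK_PARAM_KPerBlock=8 ... (digits illustrative only). Appended to param below, it turns every tunable into a compile-time constant of the generated kernel.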
} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
    online_compile::Handle* handle,
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable,
    ck::index_t nrepeat)
{
    using namespace ck;
    using namespace ck_driver;
    using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;

    using size_t = std::size_t;
    /////////////////////////////////////////////////////////////////////////////////////////////
    // The following code is only used for computing grid_size, hasMainKBlockLoop and
    // hasDoubleTailKBlockLoop
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const auto in_n_c_hi_wi_desc  = make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
    const auto wei_k_c_y_x_desc   = make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
    const auto out_n_k_ho_wo_desc = make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);

    const auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(
        wei_k_c_y_x_desc,
        in_n_c_hi_wi_desc,
        out_n_k_ho_wo_desc,
        conv_strides,
        conv_dilations,
        in_left_pads,
        in_right_pads);

    const auto a_k_m_grid_desc = descs[I0];
    const auto c_m_n_grid_desc = descs[I2];

    const auto M = c_m_n_grid_desc.GetLength(I0);
    const auto N = c_m_n_grid_desc.GetLength(I1);
    const auto K = a_k_m_grid_desc.GetLength(I0);

    const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);

    const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);

    const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0);
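A quick sanity check of these three quantities with illustrative numbers (not taken from the source): for MPerBlock = NPerBlock = 128, M = 256 and N = 512, grid_size = (256 / 128) * (512 / 128) = 8 work-groups; for K = 64 and KPerBlock = 8, (64 + 8) / 16 = 4 > 1, so hasMainKBlockLoop is true, and 64 / 8 = 8 is even, so hasDoubleTailKBlockLoop is also true.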
    /////////////////////////////////////////////////////////////////////////////////////////////

    // these buffers are usually provided by the user application
    DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

    in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
    out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
    // these are workspace buffers that should be exposed to the user by the corresponding
    // workspace API
    DeviceMem workspace_buf(4096);

    void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
    void* b_k_n0_n1_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
    void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
    void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
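The single 4096-byte allocation is carved into four fixed 1024-byte slots, one per device-side descriptor. A minimal sketch of that carving as a reusable helper (hypothetical, not part of the original header; it assumes only DeviceMem::GetDeviceBuffer() as used above):

// Hypothetical helper: return slot i of a workspace carved into fixed-stride slots.
// The caller must guarantee that (i + 1) * stride does not exceed the allocation.
inline void* workspace_slot(DeviceMem& ws, std::size_t i, std::size_t stride = 1024)
{
    return static_cast<void*>(static_cast<unsigned char*>(ws.GetDeviceBuffer()) + i * stride);
}

With it, the four pointers above would read workspace_slot(workspace_buf, 0) through workspace_slot(workspace_buf, 3).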
    const std::vector<size_t> vld  = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
    std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp";
    std::string algo_name    = "implicit_gemm_conv_fwd_v4r4_dlops_nchw";
    std::string param        = " -std=c++17 ";
    std::string network_config;

    param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " +
             get_definition_string_from_tunable(tunable) +
             " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
             " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop);

    network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
                     get_network_config_string_from_tunable(tunable) + "_" +
                     std::to_string(hasMainKBlockLoop) + "_" +
                     std::to_string(hasDoubleTailKBlockLoop);
    std::vector<float> kernel1_times;
    std::vector<float> kernel2_times;
    for(index_t i = 0; i < nrepeat; ++i)
    {
        KernelTimer timer1, timer2;
        std::string kernel_name;

        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare";
        auto network_config_1 = network_config + "_1";

        timer1.Start();
        handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
            static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
            conv_strides[I0],
            conv_strides[I1],
            conv_dilations[I0],
            conv_dilations[I1],
            in_left_pads[I0],
            in_left_pads[I1],
            in_right_pads[I0],
            in_right_pads[I1],
            a_k_m0_m1_grid_desc_dev_buf,
            b_k_n0_n1_grid_desc_dev_buf,
            c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
            c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
        timer1.End();

        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw";
        auto network_config_2 = network_config + "_2";

        timer2.Start();
        handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
            reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
            (const void*)(a_k_m0_m1_grid_desc_dev_buf),
            (const void*)(b_k_n0_n1_grid_desc_dev_buf),
            (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
            (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
        timer2.End();

        kernel1_times.push_back(timer1.GetElapsedTime());
        kernel2_times.push_back(timer2.GetElapsedTime());
    }
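Note the two-kernel pattern here: the *_prepare kernel is launched with a single work-group (vgd1 is one block of BlockSize threads) and writes the transformed grid descriptors into the four workspace slots; the main kernel then launches with the full grid (vgd2) and reads those device-resident descriptors, so the host never has to marshal them by value.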
    {
        auto ave_time1 = std::accumulate(std::next(kernel1_times.begin()),
                                         kernel1_times.end(),
                                         0.,
                                         std::plus<float>{}) /
                         (nrepeat - 1);
        auto ave_time2 = std::accumulate(std::next(kernel2_times.begin()),
                                         kernel2_times.end(),
                                         0.,
                                         std::plus<float>{}) /
                         (nrepeat - 1);

        const auto N  = in_n_c_hi_wi_lengths[I0];
        const auto C  = in_n_c_hi_wi_lengths[I1];
        const auto K  = out_n_k_ho_wo_lengths[I1];
        const auto Ho = out_n_k_ho_wo_lengths[I2];
        const auto Wo = out_n_k_ho_wo_lengths[I3];
        const auto Y  = wei_k_c_y_x_lengths[I2];
        const auto X  = wei_k_c_y_x_lengths[I3];

        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                     (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);

        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    };
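The flop count 2 * N * K * Ho * Wo * C * Y * X is the standard two-flops-per-MAC tally for a direct convolution. Dividing by 10^9 and then by the averaged time in milliseconds yields TFlop/s, since 10^9 flops per millisecond is one TFlop/s. Note also that std::next(...begin()) deliberately drops the first, cold-start iteration from each average, which is why nrepeat - 1 is the divisor.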
    // copy result back to host
    out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp deleted 100644 → 0
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
    using namespace ck;

    std::string out;

    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
           std::to_string(get_datatype_enum_from_type<TOut>::value);

    return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
{
    std::string out("TUN_");

    out += std::to_string(pt->BlockSize) + "_";

    out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
           std::to_string(pt->KPerBlock) + "_";

    out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
           std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
           std::to_string(pt->K1) + "_";

    out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";

    out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";

    out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";

    out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";

    out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
    out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
    out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
    out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";

    out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";

    out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";

    out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";

    out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";

    out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
    out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
    out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
    out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";

    out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";

    out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
    out += std::to_string(pt->CThreadTransferDstScalarPerVector);

    return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
    using namespace ck;

    std::string out;

    out += " -DCK_PARAM_ABDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TInWei>::value) +
           " -DCK_PARAM_AccDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TAcc>::value) +
           " -DCK_PARAM_CDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TOut>::value);

    return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
{
    std::string out;

    out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);

    out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
           " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
           " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);

    out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
           " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
           " -DCK_PARAM_K1=" + std::to_string(pt->K1) +
           " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
           " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);

    out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);

    out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);

    out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);

    out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[2]);

    out += " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
    out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
           std::to_string(pt->ABlockTransferSrcScalarPerVector);
    out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
           std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
    out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
           std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);

    out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);

    out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);

    out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);

    out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[2]);

    out += " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
    out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
           std::to_string(pt->BBlockTransferSrcScalarPerVector);
    out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
           std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
    out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
           std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);

    out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);

    out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
           std::to_string(pt->CThreadTransferSrcDstVectorDim);
    out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
           std::to_string(pt->CThreadTransferDstScalarPerVector);

    return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
    online_compile::Handle* handle,
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable,
    ck::index_t nrepeat)
{
    using namespace ck;
    using namespace ck_driver;
    using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;

    using size_t = std::size_t;
    /////////////////////////////////////////////////////////////////////////////////////////////
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const auto in_n_c_hi_wi_desc  = make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
    const auto wei_k_c_y_x_desc   = make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
    const auto out_n_k_ho_wo_desc = make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);

    const auto n  = in_n_c_hi_wi_desc.GetLength(I0);
    const auto c  = in_n_c_hi_wi_desc.GetLength(I1);
    const auto hi = in_n_c_hi_wi_desc.GetLength(I2);
    const auto wi = in_n_c_hi_wi_desc.GetLength(I3);
    const auto k  = wei_k_c_y_x_desc.GetLength(I0);
    const auto y  = wei_k_c_y_x_desc.GetLength(I2);
    const auto x  = wei_k_c_y_x_desc.GetLength(I3);
    const auto ho = out_n_k_ho_wo_desc.GetLength(I2);
    const auto wo = out_n_k_ho_wo_desc.GetLength(I3);

    const auto M  = k;
    const auto N  = n * ho * wo;
    const auto K  = c * y * x;
    const auto K0 = K / tunable->K1;

    const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
    /////////////////////////////////////////////////////////////////////////////////////////////

    // these buffers are usually provided by the user application
    DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

    in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
    out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
    // these are workspace buffers that should be exposed to the user by the corresponding
    // workspace API
    DeviceMem workspace_buf(4096);

    void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
    void* b_k_n0_n1_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
    void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
    void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
    const std::vector<size_t> vld  = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
    std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp";
    std::string algo_name    = "implicit_gemm_conv_fwd_v4r4_xdlops_nchw";
    std::string param        = " -std=c++17 ";
    std::string network_config;

    param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " +
             " -DCK_USE_AMD_XDLOPS" + get_definition_string_from_tunable(tunable);

    network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
                     get_network_config_string_from_tunable(tunable);

    std::vector<float> kernel1_times;
    std::vector<float> kernel2_times;
    for(index_t i = 0; i < nrepeat; ++i)
    {
        KernelTimer timer1, timer2;
        std::string kernel_name;

        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare";
        auto network_config_1 = network_config + "_1";

        timer1.Start();
        handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
            static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
            conv_strides[I0],
            conv_strides[I1],
            conv_dilations[I0],
            conv_dilations[I1],
            in_left_pads[I0],
            in_left_pads[I1],
            in_right_pads[I0],
            in_right_pads[I1],
            a_k_m0_m1_grid_desc_dev_buf,
            b_k_n0_n1_grid_desc_dev_buf,
            c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
            c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
        timer1.End();

        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw";
        auto network_config_2 = network_config + "_2";

        timer2.Start();
        handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
            reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
            (const void*)(a_k_m0_m1_grid_desc_dev_buf),
            (const void*)(b_k_n0_n1_grid_desc_dev_buf),
            (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
            (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
        timer2.End();

        kernel1_times.push_back(timer1.GetElapsedTime());
        kernel2_times.push_back(timer2.GetElapsedTime());
    }
    {
        auto ave_time1 = std::accumulate(std::next(kernel1_times.begin()),
                                         kernel1_times.end(),
                                         0.,
                                         std::plus<float>{}) /
                         (nrepeat - 1);
        auto ave_time2 = std::accumulate(std::next(kernel2_times.begin()),
                                         kernel2_times.end(),
                                         0.,
                                         std::plus<float>{}) /
                         (nrepeat - 1);

        const auto N  = in_n_c_hi_wi_lengths[I0];
        const auto C  = in_n_c_hi_wi_lengths[I1];
        const auto K  = out_n_k_ho_wo_lengths[I1];
        const auto Ho = out_n_k_ho_wo_lengths[I2];
        const auto Wo = out_n_k_ho_wo_lengths[I3];
        const auto Y  = wei_k_c_y_x_lengths[I2];
        const auto X  = wei_k_c_y_x_lengths[I3];

        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                     (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);

        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    };
    // copy result back to host
    out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp deleted 100644 → 0
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
    using namespace ck;

    std::string out;

    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
           std::to_string(get_datatype_enum_from_type<TOut>::value);

    return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
{
    std::string out("TUN_");

    out += std::to_string(pt->BlockSize) + "_";

    out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
           std::to_string(pt->KPerBlock) + "_";

    out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
           std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
           std::to_string(pt->K1) + "_";

    out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";

    out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";

    out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";

    out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";

    out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
    out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
    out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
    out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";

    out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";

    out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";

    out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";

    out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";

    out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
    out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
    out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
    out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";

    out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";

    out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
    out += std::to_string(pt->CThreadTransferDstScalarPerVector);

    return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
    using namespace ck;

    std::string out;

    out += " -DCK_PARAM_ABDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TInWei>::value) +
           " -DCK_PARAM_AccDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TAcc>::value) +
           " -DCK_PARAM_CDataTypeEnum=" +
           std::to_string(get_datatype_enum_from_type<TOut>::value);

    return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
{
    std::string out;

    out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);

    out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
           " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
           " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);

    out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
           " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
           " -DCK_PARAM_K1=" + std::to_string(pt->K1) +
           " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
           " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);

    out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);

    out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);

    out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);

    out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[2]);

    out += " -DCK_PARAM_ABlockTransferSrcVectorDim=" +
           std::to_string(pt->ABlockTransferSrcVectorDim);
    out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
           std::to_string(pt->ABlockTransferSrcScalarPerVector);
    out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
           std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
    out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
           std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);

    out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);

    out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);

    out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);

    out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[2]);

    out += " -DCK_PARAM_BBlockTransferSrcVectorDim=" +
           std::to_string(pt->BBlockTransferSrcVectorDim);
    out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
           std::to_string(pt->BBlockTransferSrcScalarPerVector);
    out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
           std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
    out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
           std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);

    out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);

    out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
           std::to_string(pt->CThreadTransferSrcDstVectorDim);
    out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
           std::to_string(pt->CThreadTransferDstScalarPerVector);

    return (out);
};

} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
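For orientation, here is a minimal standalone sketch of the flag-building pattern these helpers implement; the TunableSketch struct and the values in it are hypothetical stand-ins for the real tunable, not a tuned configuration:

#include <iostream>
#include <string>

// Hypothetical stand-in for the real tunable struct (illustration only).
struct TunableSketch
{
    int BlockSize = 256;
    int MPerBlock = 128;
    int NPerBlock = 128;
    int KPerBlock = 4;
};

int main()
{
    TunableSketch t;

    std::string param = " -std=c++17 ";
    param += " -DCK_PARAM_BlockSize=" + std::to_string(t.BlockSize);
    param += " -DCK_PARAM_MPerBlock=" + std::to_string(t.MPerBlock) +
             " -DCK_PARAM_NPerBlock=" + std::to_string(t.NPerBlock) +
             " -DCK_PARAM_KPerBlock=" + std::to_string(t.KPerBlock);

    // The driver below hands a string like this to handle->AddKernel(), so the
    // kernel source is online-compiled with the tile shape baked in as macros.
    std::cout << param << std::endl;
    return 0;
}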
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
    online_compile::Handle* handle,
    const InLengths& in_n_hi_wi_c_lengths,
    const WeiLengths& wei_k_y_x_c_lengths,
    const OutLengths& out_n_ho_wo_k_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_hi_wi_c,
    const Tensor<TInWei>& wei_k_y_x_c,
    Tensor<TOut>& out_n_ho_wo_k,
    const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable,
    ck::index_t nrepeat)
{
    using namespace ck;
    using namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;

    using size_t = std::size_t;
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // The following code is only used for computing grid_size, hasMainKBlockLoop and
    // hasDoubleTailKBlockLoop
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const auto in_n_hi_wi_c_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
    const auto wei_k_y_x_c_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
    const auto out_n_ho_wo_k_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);

    const auto n  = in_n_hi_wi_c_desc.GetLength(I0);
    const auto hi = in_n_hi_wi_c_desc.GetLength(I1);
    const auto wi = in_n_hi_wi_c_desc.GetLength(I2);
    const auto c  = in_n_hi_wi_c_desc.GetLength(I3);

    const auto k = wei_k_y_x_c_desc.GetLength(I0);
    const auto y = wei_k_y_x_c_desc.GetLength(I1);
    const auto x = wei_k_y_x_c_desc.GetLength(I2);

    const auto ho = out_n_ho_wo_k_desc.GetLength(I1);
    const auto wo = out_n_ho_wo_k_desc.GetLength(I2);

    const auto M  = k;
    const auto N  = n * ho * wo;
    const auto K  = c * y * x;
    const auto K0 = K / tunable->K1;

    const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
// these buffers are usually provided by the user application
    DeviceMem in_n_hi_wi_c_dev_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
    DeviceMem wei_k_y_x_c_dev_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
    DeviceMem out_n_ho_wo_k_dev_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());

    in_n_hi_wi_c_dev_buf.ToDevice(in_n_hi_wi_c.mData.data());
    wei_k_y_x_c_dev_buf.ToDevice(wei_k_y_x_c.mData.data());
    out_n_ho_wo_k_dev_buf.ToDevice(out_n_ho_wo_k.mData.data());
    // these are workspace buffers that should be exposed to the user through the corresponding
    // workspace API
    DeviceMem workspace_buf(4096);

    void* a_k0_m_k1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
    void* b_k0_n_k1_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
    void* c_m0_m1_m2_n_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
    void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);

    const std::vector<size_t> vld  = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
    std::string program_name =
        "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp";
    std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nhwc";

    std::string param = " -std=c++17 ";
    std::string network_config;

    param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " -DCK_USE_AMD_XDLOPS ";
    param += get_definition_string_from_tunable(tunable);

    network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
                     get_network_config_string_from_tunable(tunable);

    std::vector<float> kernel1_times;
    std::vector<float> kernel2_times;
    for(index_t i = 0; i < nrepeat; ++i)
    {
        KernelTimer timer1, timer2;
        std::string kernel_name;

        kernel_name =
            "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
        auto network_config_1 = network_config + "_1";

        timer1.Start();
        handle->AddKernel(
            algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
            static_cast<index_t>(in_n_hi_wi_c_lengths[I0]),
            static_cast<index_t>(in_n_hi_wi_c_lengths[I1]),
            static_cast<index_t>(in_n_hi_wi_c_lengths[I2]),
            static_cast<index_t>(in_n_hi_wi_c_lengths[I3]),
            static_cast<index_t>(wei_k_y_x_c_lengths[I0]),
            static_cast<index_t>(wei_k_y_x_c_lengths[I1]),
            static_cast<index_t>(wei_k_y_x_c_lengths[I2]),
            conv_strides[I0],
            conv_strides[I1],
            conv_dilations[I0],
            conv_dilations[I1],
            in_left_pads[I0],
            in_left_pads[I1],
            in_right_pads[I0],
            in_right_pads[I1],
            a_k0_m_k1_grid_desc_dev_buf,
            b_k0_n_k1_grid_desc_dev_buf,
            c_m0_m1_m2_n_grid_desc_dev_buf,
            c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
        timer1.End();

        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk";
        auto network_config_2 = network_config + "_2";

        timer2.Start();
        handle->AddKernel(
            algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
            reinterpret_cast<const TInWei*>(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<const TInWei*>(wei_k_y_x_c_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<TOut*>(out_n_ho_wo_k_dev_buf.GetDeviceBuffer()),
            (const void*)(a_k0_m_k1_grid_desc_dev_buf),
            (const void*)(b_k0_n_k1_grid_desc_dev_buf),
            (const void*)(c_m0_m1_m2_n_grid_desc_dev_buf),
            (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
        timer2.End();

        kernel1_times.push_back(timer1.GetElapsedTime());
        kernel2_times.push_back(timer2.GetElapsedTime());
    }
    {
        auto ave_time1 = std::accumulate(std::next(kernel1_times.begin()),
                                         kernel1_times.end(),
                                         0.,
                                         std::plus<float>{}) /
                         (nrepeat - 1);
        auto ave_time2 = std::accumulate(std::next(kernel2_times.begin()),
                                         kernel2_times.end(),
                                         0.,
                                         std::plus<float>{}) /
                         (nrepeat - 1);

        const auto N  = in_n_hi_wi_c_lengths[I0];
        const auto C  = in_n_hi_wi_c_lengths[I3];
        const auto Ho = out_n_ho_wo_k_lengths[I1];
        const auto Wo = out_n_ho_wo_k_lengths[I2];
        const auto K  = out_n_ho_wo_k_lengths[I3];
        const auto Y  = wei_k_y_x_c_lengths[I1];
        const auto X  = wei_k_y_x_c_lengths[I2];

        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time2;

        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    };
// copy result back to host
    out_n_ho_wo_k_dev_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
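As a worked example of the implicit-GEMM mapping computed above (M = k, N = n * ho * wo, K = c * y * x), the small standalone program below runs the same arithmetic for a hypothetical NHWC problem; all shapes and tile sizes are illustrative only:

#include <cstddef>
#include <iostream>

int main()
{
    // hypothetical convolution problem (stride 1, padding 1)
    const std::size_t n = 128, c = 256;      // input  N, C (NHWC)
    const std::size_t k = 256, y = 3, x = 3; // weight K, Y, X (KYXC)
    const std::size_t ho = 28, wo = 28;      // output H, W

    const std::size_t M = k;           // GEMM M: output channels
    const std::size_t N = n * ho * wo; // GEMM N: one column per output pixel
    const std::size_t K = c * y * x;   // GEMM K: reduction over C*Y*X

    const std::size_t MPerBlock = 128, NPerBlock = 128; // hypothetical tile sizes
    const std::size_t grid_size = (M / MPerBlock) * (N / NPerBlock);

    std::cout << "GEMM " << M << "x" << N << "x" << K
              << ", grid_size = " << grid_size << std::endl;
    return 0;
}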
host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
deleted
100644 → 0
View file @
5781adf5
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "convolution_problem_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
    online_compile::Handle* handle,
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    const ck_driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param,
    ck::index_t nrepeat)
{
    using namespace ck;
    using namespace ck_driver;

    using size_t = std::size_t;

    std::cout << __func__ << std::endl;
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0],
                                                   out_n_k_ho_wo_lengths[I1],
                                                   in_n_c_hi_wi_lengths[I1],
                                                   wei_k_c_y_x_lengths[I2],
                                                   wei_k_c_y_x_lengths[I3],
                                                   in_n_c_hi_wi_lengths[I2],
                                                   in_n_c_hi_wi_lengths[I3],
                                                   out_n_k_ho_wo_lengths[I2],
                                                   out_n_k_ho_wo_lengths[I3],
                                                   conv_strides[I0],
                                                   conv_strides[I1],
                                                   conv_dilations[I0],
                                                   conv_dilations[I1],
                                                   in_left_pads[I0],
                                                   in_left_pads[I1],
                                                   in_right_pads[I0],
                                                   in_right_pads[I1],
                                                   get_datatype_enum_from_type<TInWei>::value,
                                                   get_datatype_enum_from_type<TInWei>::value,
                                                   get_datatype_enum_from_type<TOut>::value};
    if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc,
                                                                   compile_param))
    {
        throw std::runtime_error("wrong! IsValidCompileParameter fail");
    }
    DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

    in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
    out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
    // workspace is used to save the transformed tensor descriptors created by the prepare kernel
    DeviceMem workspace_dev_buf(
        ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param));

    const auto block_size = std::size_t(
        ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param));
    const auto grid_size = std::size_t(
        ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param));

    const std::vector<size_t> vld1 = {1, 1, 1};
    const std::vector<size_t> vgd1 = {1, 1, 1};
    const std::vector<size_t> vld2 = {static_cast<size_t>(block_size), 1, 1};
    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * block_size), 1, 1};
    std::string program_name =
        "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp";
    std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw";

    std::string compile_param_string =
        get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString();

    std::string network_config = compile_param_string;

    std::vector<float> kernel1_times;
    std::vector<float> kernel2_times;
    for(index_t i = 0; i < nrepeat + 1; ++i)
    {
        KernelTimer timer1, timer2;
        std::string kernel_name;

        kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare";
        auto network_config_1 = network_config + "_1";

        timer1.Start();
        handle->AddKernel(algo_name,
                          network_config_1,
                          program_name,
                          kernel_name,
                          vld1,
                          vgd1,
                          compile_param_string)(
            static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
            conv_strides[I0],
            conv_strides[I1],
            conv_dilations[I0],
            conv_dilations[I1],
            in_left_pads[I0],
            in_left_pads[I1],
            in_right_pads[I0],
            in_right_pads[I1],
            (void*)(workspace_dev_buf.GetDeviceBuffer()));
        timer1.End();

        kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw";
        auto network_config_2 = network_config + "_2";

        timer2.Start();
        handle->AddKernel(algo_name,
                          network_config_2,
                          program_name,
                          kernel_name,
                          vld2,
                          vgd2,
                          compile_param_string)(
            reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
            (const void*)(workspace_dev_buf.GetDeviceBuffer()));
        timer2.End();

        kernel1_times.push_back(timer1.GetElapsedTime());
        kernel2_times.push_back(timer2.GetElapsedTime());
    }
    {
        auto ave_time1 = std::accumulate(std::next(kernel1_times.begin()),
                                         kernel1_times.end(),
                                         0.,
                                         std::plus<float>{}) /
                         nrepeat;
        auto ave_time2 = std::accumulate(std::next(kernel2_times.begin()),
                                         kernel2_times.end(),
                                         0.,
                                         std::plus<float>{}) /
                         nrepeat;

        float perf = (float)(conv_problem_desc.CalculateFlop()) /
                     (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);

        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    };
// copy result back to host
    out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
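Both drivers report performance with the same formula: a forward convolution performs 2 * N * K * Ho * Wo * C * Y * X flops (one multiply and one add per multiply-accumulate), and dividing GFlop by milliseconds gives TFlop/s. A standalone check with hypothetical numbers:

#include <cstddef>
#include <iostream>

int main()
{
    // hypothetical problem size and measured kernel time
    const std::size_t N = 128, K = 256, Ho = 28, Wo = 28, C = 256, Y = 3, X = 3;
    const float ave_time_ms = 10.0f;

    const float gflop = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                        (std::size_t(1000) * 1000 * 1000);

    // GFlop divided by milliseconds is numerically TFlop/s
    std::cout << gflop / ave_time_ms << " TFlop/s" << std::endl;
    return 0;
}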
host/host_tensor/CMakeLists.txt
View file @
31b40352
...
...
@@ -10,6 +10,8 @@ set(HOST_TENSOR_SOURCE
## the library target
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})

target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)

target_link_libraries(host_tensor PRIVATE hip::device)
target_link_libraries(host_tensor INTERFACE hip::host)
...
...
host/host_tensor/include/conv_common.hpp
View file @
31b40352
#ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP

#include "dynamic_tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"

enum ConvTensorLayout
{
...
...
@@ -19,8 +19,8 @@ template <typename... InDesc,
          typename LeftPads,
          typename RightPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(
    const ck::DynamicTensorDescriptor<InDesc...>& in_desc,
    const ck::DynamicTensorDescriptor<WeiDesc...>& wei_desc,
    const ck::TensorDescriptor<InDesc...>& in_desc,
    const ck::TensorDescriptor<WeiDesc...>& wei_desc,
    const ConvStrides& conv_strides,
    const ConvDilations conv_dilations,
    const LeftPads& left_pads,
...
...
@@ -57,12 +57,12 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(
    const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1;
    const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1;

    return make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo));
    return make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
}
template <class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t
calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc)
calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc)
{
    using namespace ck;
...
...
host/host_tensor/include/device.hpp
View file @
31b40352
...
...
@@ -34,24 +34,16 @@ struct KernelTimer
using device_stream_t = hipStream_t;

template <typename... Args, typename F>
void launch_kernel(F kernel,
                   dim3 grid_dim,
                   dim3 block_dim,
                   std::size_t lds_byte,
                   hipStream_t stream_id,
                   Args... args)
void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
    hipStream_t stream_id = nullptr;

    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
}
template <typename... Args, typename F>
float launch_and_time_kernel(F kernel,
                             int nrepeat,
                             dim3 grid_dim,
                             dim3 block_dim,
                             std::size_t lds_byte,
                             hipStream_t stream_id,
                             Args... args)
float launch_and_time_kernel(
    F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
    KernelTimer timer;
...
...
@@ -66,6 +58,8 @@ float launch_and_time_kernel(F kernel,
    printf("Warm up\n");

    hipStream_t stream_id = nullptr;

    // warm up
    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
...
...
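A minimal usage sketch of the new launch_kernel() signature above, assuming device.hpp is included; the kernel and the device buffer here are hypothetical. The stream parameter is gone from the interface, and the default (null) stream is supplied internally:

#include <hip/hip_runtime.h>
// #include "device.hpp"  // provides launch_kernel (assumed available)

__global__ void fill_kernel(float* p, float v) { p[threadIdx.x] = v; }

void example(float* p_dev)
{
    // before: launch_kernel(fill_kernel, grid, block, 0, stream_id, p_dev, 1.f);
    // after:  no stream argument; the null/default stream is used inside
    launch_kernel(fill_kernel, dim3(1), dim3(64), 0, p_dev, 1.f);
}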
host/host_tensor/include/host_conv.hpp
View file @
31b40352
...
...
@@ -14,15 +14,13 @@ void host_direct_convolution(const Tensor<TIn>& in,
                             const ConvStrides& conv_strides,
                             const ConvDilations& conv_dilations,
                             const InLeftPads& in_left_pads,
                             const InRightPads& in_right_pads,
                             const InRightPads&,
                             const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
...
...
@@ -68,23 +66,25 @@ void host_direct_convolution(const Tensor<TIn>& in,
        out(n, ho, wo, k) = v;
    };

    switch(layout)
    if(layout == ConvTensorLayout::NCHW)
    {
    case ConvTensorLayout::NCHW:
        make_ParallelTensorFunctor(f_nchw,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2],
                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
        break;
    case ConvTensorLayout::NHWC:
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        make_ParallelTensorFunctor(f_nhwc,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2],
                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
        break;
    default: throw std::runtime_error("wrong! not supported layout");
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}
...
...
@@ -100,17 +100,15 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
    constexpr std::size_t HoPerTile = 2;
    constexpr std::size_t WoPerTile = 2;

    std::size_t N  = in_nchw.mDesc.GetLengths()[0];
    std::size_t C  = in_nchw.mDesc.GetLengths()[1];
    std::size_t HI = in_nchw.mDesc.GetLengths()[2];
    std::size_t WI = in_nchw.mDesc.GetLengths()[3];
    std::size_t N = in_nchw.mDesc.GetLengths()[0];
    std::size_t C = in_nchw.mDesc.GetLengths()[1];

    std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
    std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
    std::size_t X = wei_kcyx.mDesc.GetLengths()[3];

    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
    std::size_t Ho = out_nkhw.mDesc.GetLengths()[2];
    std::size_t Wo = out_nkhw.mDesc.GetLengths()[3];

    index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
    index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
...
...
@@ -118,8 +116,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
    std::size_t HiPerTile = HoPerTile + Y - 1;
    std::size_t WiPerTile = WoPerTile + X - 1;

    std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
    std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;
    std::size_t HTile = (Ho + HoPerTile - 1) / HoPerTile;
    std::size_t WTile = (Wo + WoPerTile - 1) / WoPerTile;

    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
...
...
host/host_tensor/include/host_conv_bwd_data.hpp
View file @
31b40352
...
...
@@ -14,7 +14,7 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
                                            const ConvStrides& conv_strides,
                                            const ConvDilations& conv_dilations,
                                            const InLeftPads& in_left_pads,
                                            const InRightPads& in_right_pads,
                                            const InRightPads& /* in_right_pads */,
                                            const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;
...
...
@@ -25,11 +25,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
    constexpr auto I3 = Number<3>{};

    auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
        std::size_t N  = in.mDesc.GetLengths()[I0];
        std::size_t C  = in.mDesc.GetLengths()[I1];
        std::size_t Hi = in.mDesc.GetLengths()[I2];
        std::size_t Wi = in.mDesc.GetLengths()[I3];

        std::size_t K = wei.mDesc.GetLengths()[I0];
        std::size_t Y = wei.mDesc.GetLengths()[I2];
        std::size_t X = wei.mDesc.GetLengths()[I3];
...
...
@@ -74,11 +69,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
    };

    auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
        std::size_t N  = in.mDesc.GetLengths()[I0];
        std::size_t Hi = in.mDesc.GetLengths()[I1];
        std::size_t Wi = in.mDesc.GetLengths()[I2];
        std::size_t C  = in.mDesc.GetLengths()[I3];

        std::size_t K = wei.mDesc.GetLengths()[I0];
        std::size_t Y = wei.mDesc.GetLengths()[I1];
        std::size_t X = wei.mDesc.GetLengths()[I2];
...
...
@@ -122,22 +112,24 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
        in(n, hi, wi, c) = v;
    };

    switch(layout)
    if(layout == ConvTensorLayout::NCHW)
    {
    case ConvTensorLayout::NCHW:
        make_ParallelTensorFunctor(f_nchw,
                                   in.mDesc.GetLengths()[0],
                                   in.mDesc.GetLengths()[1],
                                   in.mDesc.GetLengths()[2],
                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
        break;
    case ConvTensorLayout::NHWC:
    }
    else if(layout == ConvTensorLayout::NHWC)
    {
        make_ParallelTensorFunctor(f_nhwc,
                                   in.mDesc.GetLengths()[0],
                                   in.mDesc.GetLengths()[1],
                                   in.mDesc.GetLengths()[2],
                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
        break;
    default: throw std::runtime_error("wrong! not supported layout");
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}
host/host_tensor/include/host_tensor.hpp
View file @
31b40352
...
...
@@ -34,7 +34,7 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
            first = false;
        else
            os << delim;

        os << T{v};
        os << static_cast<T>(v);
    }
    return os;
}
...
...
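The change from os << T{v} to os << static_cast<T>(v) above matters because brace-initialization rejects narrowing conversions at compile time, while static_cast performs them explicitly, which is what a logging helper that prints a range as another type needs. A standalone illustration:

#include <iostream>

int main()
{
    double v = 3.7;
    // int i{v};                 // ill-formed: narrowing conversion double -> int
    int j = static_cast<int>(v); // fine: explicit conversion, j == 3
    std::cout << j << std::endl;
    return 0;
}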
host/host_tensor/include/host_tensor_generator.hpp
View file @
31b40352
...
...
@@ -9,7 +9,7 @@ struct GeneratorTensor_1
    int value = 1;

    template <typename... Is>
    float operator()(Is... is)
    float operator()(Is...)
    {
        return value;
    }
...
...
host/host_tensor/src/device.cpp
View file @
31b40352
...
...
@@ -24,32 +24,32 @@ struct KernelTimerImpl
{
    KernelTimerImpl()
    {
        hipEventCreate(&mStart);
        hipEventCreate(&mEnd);
        hipGetErrorString(hipEventCreate(&mStart));
        hipGetErrorString(hipEventCreate(&mEnd));
    }

    ~KernelTimerImpl()
    {
        hipEventDestroy(mStart);
        hipEventDestroy(mEnd);
        hipGetErrorString(hipEventDestroy(mStart));
        hipGetErrorString(hipEventDestroy(mEnd));
    }

    void Start()
    {
        hipDeviceSynchronize();
        hipEventRecord(mStart, 0);
        hipGetErrorString(hipDeviceSynchronize());
        hipGetErrorString(hipEventRecord(mStart, nullptr));
    }

    void End()
    {
        hipEventRecord(mEnd, 0);
        hipEventSynchronize(mEnd);
        hipGetErrorString(hipEventRecord(mEnd, nullptr));
        hipGetErrorString(hipEventSynchronize(mEnd));
    }

    float GetElapsedTime() const
    {
        float time;
        hipEventElapsedTime(&time, mStart, mEnd);
        hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
        return time;
    }
...
...
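The diff above wraps every HIP runtime call in hipGetErrorString(...), which consumes the returned hipError_t (for example to satisfy unused-result warnings) but still discards failures. A stricter alternative, sketched here under the assumption that aborting on error is acceptable for a test harness:

#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>

// Check a HIP call and abort with a readable message on failure.
#define HIP_CHECK(call)                                               \
    do                                                                \
    {                                                                 \
        hipError_t err_ = (call);                                     \
        if(err_ != hipSuccess)                                        \
        {                                                             \
            std::fprintf(stderr,                                      \
                         "HIP error %s at %s:%d\n",                   \
                         hipGetErrorString(err_),                     \
                         __FILE__,                                    \
                         __LINE__);                                   \
            std::abort();                                             \
        }                                                             \
    } while(0)

void example()
{
    hipEvent_t ev;
    HIP_CHECK(hipEventCreate(&ev));
    HIP_CHECK(hipEventDestroy(ev));
}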
host/online_compile/CMakeLists.txt
deleted
100644 → 0
View file @
5781adf5
set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++)

## for online-compiling of HIP kernels
set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "")

## reset to avoid the C++ options from the parent project
set(CMAKE_CXX_FLAGS "")
message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}")

# look for and register clang-offload-bundler
if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$")
    find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler
        PATH_SUFFIXES bin
        PATHS
            /opt/rocm/llvm
            ${CMAKE_INSTALL_PREFIX}/llvm
    )
endif()
if(OLC_OFFLOADBUNDLER_BIN)
    message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}")
    set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}")
else()
    # look for and register extractkernel
    message(STATUS "clang-offload-bundler not found")

    find_program(EXTRACTKERNEL_BIN extractkernel
        PATH_SUFFIXES bin
        PATHS
            /opt/rocm/hip
            /opt/rocm/hcc
            /opt/rocm
            ${CMAKE_INSTALL_PREFIX}/hip
            ${CMAKE_INSTALL_PREFIX}/hcc
            ${CMAKE_INSTALL_PREFIX}
    )
    if(EXTRACTKERNEL_BIN)
        message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}")
        set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}")
    else()
        message(FATAL_ERROR "extractkernel not found")
    endif()
endif()

option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF)
set(BOOST_COMPONENTS filesystem)
add_definitions(-DBOOST_ALL_NO_LIB=1)
find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS})

# HIP is always required
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")

target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")

set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}")
set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}")
set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}")

option(ENABLE_DEBUG "Build to enable debugging" ON)
if(ENABLE_DEBUG)
    set(OLC_DEBUG 1)
else()
    set(OLC_DEBUG 0)
endif()

configure_file("${PROJECT_SOURCE_DIR}/host/online_compile/include/config.h.in"
               "${PROJECT_BINARY_DIR}/host/online_compile/include/config.h")
include_directories(BEFORE ${PROJECT_BINARY_DIR}/host/online_compile/include)

message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}")
add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}")

file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp")
set(MCONV_KERNEL_INCLUDES
    ${COMPOSABLE_KERNEL_INCLUDE_1}
    ${COMPOSABLE_KERNEL_INCLUDE_2}
)
file(GLOB_RECURSE MCONV_KERNELS "${PROJECT_SOURCE_DIR}/composable_kernel/src/kernel_wrapper/*.cpp")

add_kernels(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNELS}")
add_kernel_includes(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNEL_INCLUDES}")

set(ONLINE_COMPILATION_SOURCE
    ${PROJECT_BINARY_DIR}/kernel.cpp
    ${PROJECT_BINARY_DIR}/kernel_includes.cpp
)
include_directories(BEFORE
    ${PROJECT_BINARY_DIR}/host/online_compile/include
    include
)

set(OLC_HIP_UTILITY_CPPS
    hip_utility/logger.cpp
    hip_utility/tmp_dir.cpp
    hip_utility/md5.cpp
    hip_utility/exec_utils.cpp
    hip_utility/target_properties.cpp
    hip_utility/handlehip.cpp
    hip_utility/kernel_build_params.cpp
    hip_utility/hip_build_utils.cpp
    hip_utility/hipoc_program.cpp
    hip_utility/hipoc_kernel.cpp
    hip_utility/kernel_cache.cpp
    hip_utility/binary_cache.cpp
)

list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS})
## addkernels provides the tool to create inlined kernels in one header
add_subdirectory(addkernels)

function(inline_kernels_src KERNELS KERNEL_INCLUDES)
    set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp)
    set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME})
    set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp)

    add_custom_command(OUTPUT ${KERNEL_SRC_HPP_PATH}
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES}
        COMMAND $<TARGET_FILE:addkernels> -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS}
        COMMENT "Inlining All kernels"
    )
    configure_file(kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH})
    list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH})
    set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE)
endfunction()

inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}")

list(APPEND ONLINE_COMPILATION_SOURCE ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h)

add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    DEPENDS addkernels ${MCONV_KERNEL_INCLUDES}
    COMMAND $<TARGET_FILE:addkernels> -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES}
    COMMENT "Inlining HIP kernel includes"
)

## the library target
add_library(online_compile SHARED ${ONLINE_COMPILATION_SOURCE})

target_include_directories(online_compile PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compile/include/)
target_include_directories(online_compile PRIVATE ${PROJECT_BINARY_DIR})
target_include_directories(online_compile PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/)

target_link_libraries(online_compile PRIVATE hip::device)
target_link_libraries(online_compile INTERFACE hip::host)
target_link_libraries(online_compile PRIVATE Boost::filesystem)
target_compile_features(online_compile PUBLIC)
set_target_properties(online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON)

install(TARGETS online_compile LIBRARY DESTINATION lib)