gaoqiong / composable_kernel_ROCM / Commits / c03045ce

Commit c03045ce, authored Aug 10, 2021 by Chao Liu

    rename

Parent: b2589957
Changes: 54

Showing 20 changed files with 813 additions and 870 deletions.
composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp        +38 −43
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp               +79 −81
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp               +47 −49
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp                 +69 −73
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp              +93 −96
composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp            +6 −6
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp       +76 −81
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp    +55 −58
composable_kernel/include/utility/config.hpp                                          +2 −2
composable_kernel/include/utility/dynamic_buffer.hpp                                  +2 −2
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp     +95 −99
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp    +93 −97
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp    +93 −97
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp     +28 −31
host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp    +6 −9
host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp  +6 −9
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp           +6 −9
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp          +7 −10
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp         +6 −9
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp        +6 −9
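The commit is exactly the rename its message promises: the "dynamic_" / "Dynamic" / "DYNAMIC_" marker is dropped from file names, include guards, #includes, kernel entry points, and class templates throughout (the packed-descriptor maker additionally loses its "_v2" suffix, becoming make_naive_tensor_descriptor_packed). A minimal standalone C++ sketch of that textual rule, illustrative only and not the tool actually used:

    // Strip the "dynamic" marker from CK identifiers, as this commit does tree-wide.
    #include <iostream>
    #include <regex>
    #include <string>

    int main()
    {
        const std::string names[] = {
            "make_dynamic_naive_tensor_descriptor_aligned_v2",
            "BlockwiseDynamicTensorSliceTransfer_v4r1",
            "kernel_dynamic_gemm_dlops_v1r3",
            "CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R3_HPP",
        };

        const std::regex dyn("dynamic_|Dynamic|DYNAMIC_"); // the dropped marker
        for (const auto& n : names)
            std::cout << std::regex_replace(n, dyn, "") << '\n';
    }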
composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_dlops_v1r2.hpp
→ composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp

-#ifndef CK_GRIDWISE_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP
-#define CK_GRIDWISE_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP
+#ifndef CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP
+#define CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP

 #include "common_header.hpp"
-#include "dynamic_multi_index_transform_helper.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
+#include "multi_index_transform_helper.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "blockwise_gemm_dlops_v2r3.hpp"
-#include "blockwise_dynamic_tensor_slice_transfer_v2.hpp"
-#include "threadwise_dynamic_tensor_slice_transfer.hpp"
-#include "threadwise_dynamic_tensor_slice_set.hpp"
+#include "blockwise_tensor_slice_transfer_v2.hpp"
+#include "threadwise_tensor_slice_transfer.hpp"
+#include "threadwise_tensor_slice_set.hpp"

 namespace ck {

@@ -25,7 +25,7 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_dynamic_contraction_dlops_v1r2(
+    kernel_contraction_dlops_v1r2(
         const FloatAB* __restrict__ p_a_grid,
         const FloatAB* __restrict__ p_b_grid,
         FloatC* __restrict__ p_c_grid,

@@ -89,7 +89,7 @@ template <index_t BlockSize,
           typename CGridIteratorHacks,
           typename AGridMoveSliceWindowIteratorHacks,
           typename BGridMoveSliceWindowIteratorHacks>
-struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1
+struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

@@ -110,15 +110,13 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 =
-            make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_naive_tensor_descriptor_aligned_v2(
                 make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
                 max_lds_align);

         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 =
-            make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_naive_tensor_descriptor_aligned_v2(
                 make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
                 max_lds_align);

@@ -201,7 +199,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
         const auto GM11 = Number<GM1PerBlockGM11>{};
         const auto GM10 = GM1 / GM11;

-        const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = transform_dynamic_tensor_descriptor(
+        const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = transform_tensor_descriptor(
             a_grid_desc_gk0_gm0_gm1_gk1,
             make_tuple(make_pass_through_transform(GK0),
                        make_pass_through_transform(GM0),

@@ -222,7 +220,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
         const auto GN11 = Number<GN1PerBlockGN11>{};
         const auto GN10 = GN1 / GN11;

-        const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = transform_dynamic_tensor_descriptor(
+        const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = transform_tensor_descriptor(
             b_grid_desc_gk0_gn0_gn1_gk1,
             make_tuple(make_pass_through_transform(GK0),
                        make_pass_through_transform(GN0),

@@ -259,7 +257,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
         constexpr auto BM0 = BM / BM1;
         constexpr auto BN0 = BN / BN1;

-        const auto c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc = transform_dynamic_tensor_descriptor(
+        const auto c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc = transform_tensor_descriptor(
             c_grid_desc_gm0_gm1_gn0_gn1,
             make_tuple(make_pass_through_transform(GM0),
                        make_unmerge_transform(make_tuple(GM10, GM11)),

@@ -268,7 +266,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
             make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}));

-        const auto c_gm10_bm_gn10_bn_grid_desc = transform_dynamic_tensor_descriptor(
+        const auto c_gm10_bm_gn10_bn_grid_desc = transform_tensor_descriptor(
             c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc,
             make_tuple(make_pass_through_transform(GM10),
                        make_merge_transform(make_tuple(GM0, GM11)),

@@ -277,7 +275,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
             make_tuple(Sequence<1>{}, Sequence<0, 2>{}, Sequence<4>{}, Sequence<3, 5>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

-        const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = transform_dynamic_tensor_descriptor(
+        const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = transform_tensor_descriptor(
             c_gm10_bm_gn10_bn_grid_desc,
             make_tuple(make_pass_through_transform(GM10),
                        make_unmerge_transform(make_tuple(BM0, BM1)),

@@ -356,26 +354,24 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 =
-            make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_naive_tensor_descriptor_aligned_v2(
                 make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
                 max_lds_align);

         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 =
-            make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_naive_tensor_descriptor_aligned_v2(
                 make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
                 max_lds_align);

         // A matrix in LDS memory for blockwise GEMM
         // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_bm_gk1 = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<GK0PerBlock>{}, GM0 * Number<GM1PerBlockGM11>{}, GK1),
             max_lds_align);

         // B matrix in LDS memory for blockwise GEMM
         // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_bn_gk1 = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<GK0PerBlock>{}, GN0 * Number<GN1PerBlockGN11>{}, GK1),
             max_lds_align);

         static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() ==

@@ -385,7 +381,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
                       "wrong!");

         // A matrix blockwise copy
-        auto a_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1<
+        auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1<
             BlockSize,
             InMemoryDataOperationEnum_t::Set,
             Sequence<GK0PerBlock, GM0, 1, GM1PerBlockGM11, GK1.value>,

@@ -409,7 +405,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
             make_multi_index(0, 0, 0, 0, 0));

         // B matrix blockwise copy
-        auto b_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1<
+        auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1<
             BlockSize,
             InMemoryDataOperationEnum_t::Set,
             Sequence<GK0PerBlock, GN0, 1, GN1PerBlockGN11, GK1.value>,

@@ -457,8 +453,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
         constexpr auto c_thread_tensor_lengths_bm0_bm1_bn0_bn1 =
             decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1();

-        constexpr auto c_thread_desc_bm0_bm1_bn0_bn1 =
-            make_dynamic_naive_tensor_descriptor_packed_v2(
+        constexpr auto c_thread_desc_bm0_bm1_bn0_bn1 = make_naive_tensor_descriptor_packed(
             sequence_to_tuple_of_number(c_thread_tensor_lengths_bm0_bm1_bn0_bn1));

         // LDS allocation for A and B: be careful of alignment

@@ -475,7 +470,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
         auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
             c_thread_desc_bm0_bm1_bn0_bn1.GetElementSpaceSize());

-        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
+        ThreadwiseTensorSliceSet_v1<FloatAcc,
                                     decltype(c_thread_desc_bm0_bm1_bn0_bn1),
                                     decltype(c_thread_tensor_lengths_bm0_bm1_bn0_bn1)>{}
             .Run(c_thread_desc_bm0_bm1_bn0_bn1,

@@ -615,7 +610,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
         // output: register to global memory
         {
             constexpr auto c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1 =
-                make_dynamic_naive_tensor_descriptor_packed_v2(
+                make_naive_tensor_descriptor_packed(
                     make_tuple(I1,
                                Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I0]>{},
                                Number<c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I1]>{},

@@ -627,7 +622,7 @@ struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0
             blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
                 get_thread_local_1d_id());

-            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+            ThreadwiseTensorSliceTransfer_v1r3<
                 FloatAcc,
                 FloatC,
                 decltype(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1),
...
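Expressions like "const auto GM10 = GM1 / GM11;" above divide Number<> wrappers so the result stays a compile-time constant. A minimal standalone sketch of that idiom (not CK's actual Number implementation):

    // Compile-time integer wrapper: arithmetic on values carried in the type.
    #include <cstdio>

    template <int N>
    struct Number
    {
        static constexpr int value = N;
    };

    template <int A, int B>
    constexpr Number<A / B> operator/(Number<A>, Number<B>) { return {}; }

    int main()
    {
        constexpr auto GM1  = Number<128>{};
        constexpr auto GM11 = Number<32>{};
        constexpr auto GM10 = GM1 / GM11; // Number<4>, known at compile time
        static_assert(GM10.value == 4, "division happens in the type system");
        std::printf("GM10 = %d\n", GM10.value);
    }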
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp
→ composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp

-#ifndef CK_GRIDWISE_DYNAMIC_GEMM_DLOPS_V1R2_HPP
-#define CK_GRIDWISE_DYNAMIC_GEMM_DLOPS_V1R2_HPP
+#ifndef CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP
+#define CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP

 #include "common_header.hpp"
-#include "dynamic_multi_index_transform_helper.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
+#include "multi_index_transform_helper.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "blockwise_gemm_dlops_v2r2.hpp"
-#include "blockwise_dynamic_tensor_slice_transfer.hpp"
-#include "threadwise_dynamic_tensor_slice_transfer.hpp"
-#include "threadwise_dynamic_tensor_slice_set.hpp"
+#include "blockwise_tensor_slice_transfer.hpp"
+#include "threadwise_tensor_slice_transfer.hpp"
+#include "threadwise_tensor_slice_set.hpp"

 namespace ck {

@@ -26,7 +26,7 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_dynamic_gemm_dlops_v1r2(
+    kernel_gemm_dlops_v1r2(
         const FloatAB* __restrict__ p_a_grid,
         const FloatAB* __restrict__ p_b_grid,
         FloatC* __restrict__ p_c_grid,

@@ -68,8 +68,7 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_dynamic_gemm_dlops_v1r2(
+    kernel_gemm_dlops_v1r2(
         const FloatAB* __restrict__ p_a_grid,
         const FloatAB* __restrict__ p_b_grid,
         FloatC* __restrict__ p_c_grid,
         const void CONSTANT* p_a_k_m0_m1_grid_desc,

@@ -151,7 +150,7 @@ template <index_t BlockSize,
           typename CGridIteratorHacks,
           typename AGridMoveSliceWindowIteratorHacks,
           typename BGridMoveSliceWindowIteratorHacks>
-struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
+struct GridwiseGemmDlops_km_kn_mn_v1r2
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

@@ -167,12 +166,12 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);

         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);

         // LDS allocation for A and B: be careful of alignment

@@ -230,7 +229,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         const auto M1 = Number<MPerBlockM1>{};
         const auto M0 = M / M1;

-        const auto a_k_m0_m1_grid_desc = transform_dynamic_tensor_descriptor(
+        const auto a_k_m0_m1_grid_desc = transform_tensor_descriptor(
             a_k_m_grid_desc,
             make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(M0, M1))),
             make_tuple(Sequence<0>{}, Sequence<1>{}),

@@ -248,7 +247,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         const auto N1 = Number<NPerBlockN1>{};
         const auto N0 = N / N1;

-        const auto b_k_n0_n1_grid_desc = transform_dynamic_tensor_descriptor(
+        const auto b_k_n0_n1_grid_desc = transform_tensor_descriptor(
             b_k_n_grid_desc,
             make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(N0, N1))),
             make_tuple(Sequence<0>{}, Sequence<1>{}),

@@ -277,7 +276,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         constexpr auto M10 = M1 / M11;
         constexpr auto N10 = N1 / N11;

-        const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_dynamic_tensor_descriptor(
+        const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor(
             c_m_n_grid_desc,
             make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)),
                        make_unmerge_transform(make_tuple(N0, N10, N11))),

@@ -352,27 +351,27 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);

         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);

         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m0_m1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}), max_lds_align);

         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n0_n1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}), max_lds_align);

         // A matrix blockwise copy
         auto a_blockwise_copy =
-            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                             InMemoryDataOperationEnum_t::Set,
                                             Sequence<KPerBlock, 1, MPerBlockM1>,
                                             ABlockTransferThreadSliceLengths_K_M0_M1,

@@ -398,7 +397,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         // B matrix blockwise copy
         auto b_blockwise_copy =
-            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                             InMemoryDataOperationEnum_t::Set,
                                             Sequence<KPerBlock, 1, NPerBlockN1>,
                                             BBlockTransferThreadSliceLengths_K_N0_N1,

@@ -447,8 +446,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths =
             decltype(blockwise_gemm)::GetCM0M1N0N1ThreadTensorLengths();

-        constexpr auto c_m10_m11_n10_n11_thread_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(
+        constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed(
             sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths));

         // LDS allocation for A and B: be careful of alignment

@@ -465,7 +463,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
             c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize());

-        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
+        ThreadwiseTensorSliceSet_v1<FloatAcc,
                                     decltype(c_m10_m11_n10_n11_thread_desc),
                                     decltype(c_m10_m11_n10_n11_thread_tensor_lengths)>{}
             .Run(c_m10_m11_n10_n11_thread_desc,

@@ -620,7 +618,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
         // output: register to global memory
         {
             constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
-                make_dynamic_naive_tensor_descriptor_packed_v2(
+                make_naive_tensor_descriptor_packed(
                     make_tuple(I1,
                                Number<c_m10_m11_n10_n11_thread_tensor_lengths[I0]>{},
                                Number<c_m10_m11_n10_n11_thread_tensor_lengths[I1]>{},

@@ -631,7 +629,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
             const auto c_m10_m11_n10_n11_thread_origin_idx_on_block =
                 blockwise_gemm.CalculateCM0M1N0N1ThreadOriginOnBlock(get_thread_local_1d_id());

-            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+            ThreadwiseTensorSliceTransfer_v1r3<
                 FloatAcc,
                 FloatC,
                 decltype(c_m0_m10_m11_n0_n10_n11_thread_desc),
...
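Both GEMM descriptor rewrites above split a flat dimension with make_unmerge_transform, e.g. M into (M0, M1) with M1 = MPerBlockM1. Assuming the usual contiguous-split semantics, the per-index math is a divmod; a standalone sketch (illustrative, not CK's implementation):

    // View flat index m of a length-M dimension as (m0, m1) with M = M0 * M1.
    #include <cassert>
    #include <utility>

    constexpr std::pair<int, int> unmerge(int m, int M1)
    {
        return {m / M1, m % M1}; // block index, offset within block
    }

    int main()
    {
        constexpr int MPerBlockM1 = 128;
        const auto [m0, m1] = unmerge(1000, MPerBlockM1);
        assert(m0 == 7 && m1 == 104); // 1000 == 7 * 128 + 104
    }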
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp
→ composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp

-#ifndef CK_GRIDWISE_DYNAMIC_GEMM_V1R3_HPP
-#define CK_GRIDWISE_DYNAMIC_GEMM_V1R3_HPP
+#ifndef CK_GRIDWISE_GEMM_V1R3_HPP
+#define CK_GRIDWISE_GEMM_V1R3_HPP

 #include "common_header.hpp"
-#include "dynamic_multi_index_transform_helper.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
+#include "multi_index_transform_helper.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "blockwise_gemm_dlops_v2r3.hpp"
-#include "blockwise_dynamic_tensor_slice_transfer_v2.hpp"
-#include "threadwise_dynamic_tensor_slice_transfer_v2.hpp"
-#include "threadwise_dynamic_tensor_slice_set.hpp"
+#include "blockwise_tensor_slice_transfer_v2.hpp"
+#include "threadwise_tensor_slice_transfer_v2.hpp"
+#include "threadwise_tensor_slice_set.hpp"

 namespace ck {

@@ -26,7 +26,7 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_dynamic_gemm_dlops_v1r3(
+    kernel_gemm_dlops_v1r3(
         const FloatAB* __restrict__ p_a_grid,
         const FloatAB* __restrict__ p_b_grid,
         FloatC* __restrict__ p_c_grid,

@@ -68,8 +68,7 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-    kernel_dynamic_gemm_dlops_v1r3(
+    kernel_gemm_dlops_v1r3(
         const FloatAB* __restrict__ p_a_grid,
         const FloatAB* __restrict__ p_b_grid,
         FloatC* __restrict__ p_c_grid,
         const void CONSTANT* p_a_k0_m0_m1_k1_grid_desc,

@@ -147,7 +146,7 @@ template <index_t BlockSize,
           typename CGridIteratorHacks,
           typename AGridMoveSliceWindowIteratorHacks,
           typename BGridMoveSliceWindowIteratorHacks>
-struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
+struct GridwiseGemmDlops_km_kn_mn_v1r3
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

@@ -164,12 +163,12 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
         // TODO: check alignment
         // A matrix in LDS memory, dst of blockwise copy
-        constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);

         // TODO: check alignment
         // B matrix in LDS memory, dst of blockwise copy
-        constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);

         // TODO: check alignment

@@ -231,8 +230,8 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
         const auto M1 = Number<MPerBlockM1>{};
         const auto M0 = M / M1;

-        const auto a_k0_m0_m1_k1_grid_desc = transform_dynamic_tensor_descriptor(
-            a_k0_m_k1_grid_desc,
+        const auto a_k0_m0_m1_k1_grid_desc =
+            transform_tensor_descriptor(a_k0_m_k1_grid_desc,
             make_tuple(make_pass_through_transform(K0),
                        make_unmerge_transform(make_tuple(M0, M1)),
                        make_pass_through_transform(K1)),

@@ -251,8 +250,8 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
         const auto N1 = Number<NPerBlockN1>{};
         const auto N0 = N / N1;

-        const auto b_k0_n0_n1_k1_grid_desc = transform_dynamic_tensor_descriptor(
-            b_k0_n_k1_grid_desc,
+        const auto b_k0_n0_n1_k1_grid_desc =
+            transform_tensor_descriptor(b_k0_n_k1_grid_desc,
             make_tuple(make_pass_through_transform(K0),
                        make_unmerge_transform(make_tuple(N0, N1)),
                        make_pass_through_transform(K1)),

@@ -284,7 +283,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
         constexpr auto M10 = M1 / M11;
         constexpr auto N10 = N1 / N11;

-        const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_dynamic_tensor_descriptor(
+        const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor(
             c_m_n_grid_desc,
             make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)),
                        make_unmerge_transform(make_tuple(N0, N10, N11))),

@@ -355,23 +354,23 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
         // TODO: check alignment
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k0_m0_m1_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}, K1), max_lds_align);

         // TODO: check alignment
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k0_n0_n1_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}, K1), max_lds_align);

         // TODO: check alignment
         // A matrix in LDS memory, for blockwise GEMM
-        constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);

         // TODO: check alignment
         // B matrix in LDS memory, for blockwise GEMM
-        constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);

         static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() ==

@@ -381,7 +380,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
                       "wrong!");

         // A matrix blockwise copy
-        auto a_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1<
+        auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1<
             BlockSize,
             InMemoryDataOperationEnum_t::Set,
             Sequence<KPerBlock, 1, MPerBlockM1, K1.value>,

@@ -405,7 +404,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
             make_multi_index(0, 0, 0, 0));

         // B matrix blockwise copy
-        auto b_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1<
+        auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1<
             BlockSize,
             InMemoryDataOperationEnum_t::Set,
             Sequence<KPerBlock, 1, NPerBlockN1, K1.value>,

@@ -453,8 +452,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
         constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths =
             decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1();

-        constexpr auto c_m10_m11_n10_n11_thread_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(
+        constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed(
             sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths));

         // LDS allocation for A and B: be careful of alignment

@@ -471,7 +469,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
         auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
             c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize());

-        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
+        ThreadwiseTensorSliceSet_v1<FloatAcc,
                                     decltype(c_m10_m11_n10_n11_thread_desc),
                                     decltype(c_m10_m11_n10_n11_thread_tensor_lengths)>{}
             .Run(c_m10_m11_n10_n11_thread_desc,

@@ -609,7 +607,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
         // output: register to global memory
         {
             constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
-                make_dynamic_naive_tensor_descriptor_packed_v2(
+                make_naive_tensor_descriptor_packed(
                     make_tuple(I1,
                                Number<c_m10_m11_n10_n11_thread_tensor_lengths[I0]>{},
                                Number<c_m10_m11_n10_n11_thread_tensor_lengths[I1]>{},

@@ -621,7 +619,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
             blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
                 get_thread_local_1d_id());

-            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+            ThreadwiseTensorSliceTransfer_v1r3<
                 FloatAcc,
                 FloatC,
                 decltype(c_m0_m10_m11_n0_n10_n11_thread_desc),
...
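The "be careful of LDS alignment" comments and the max_lds_align argument suggest the aligned descriptor pads its layout so rows start on an alignment boundary, which is why GetElementSpaceSize() is asserted against the flat descriptor. A standalone sketch of that arithmetic (assumed semantics, not CK's code):

    // Element-space size of an LDS tile whose row length is padded to an
    // alignment boundary (assumed behaviour of the aligned descriptor).
    #include <cstdio>

    constexpr int round_up(int x, int align) { return (x + align - 1) / align * align; }

    int main()
    {
        constexpr int KPerBlock = 8, MPerBlockM1 = 130, max_lds_align = 4;
        constexpr int padded_m  = round_up(MPerBlockM1, max_lds_align); // 132
        constexpr int elements  = KPerBlock * padded_m;
        static_assert(elements == 1056, "8 rows of 132 padded elements");
        std::printf("LDS elements per A tile: %d\n", elements);
    }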
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp
→ composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp

-#ifndef CK_GRIDWISE_DYNAMIC_GEMM_V2_HPP
-#define CK_GRIDWISE_DYNAMIC_GEMM_V2_HPP
+#ifndef CK_GRIDWISE_GEMM_V2_HPP
+#define CK_GRIDWISE_GEMM_V2_HPP

 #include "common_header.hpp"
-#include "dynamic_multi_index_transform_helper.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "blockwise_dynamic_tensor_slice_transfer.hpp"
-#include "threadwise_dynamic_tensor_slice_transfer.hpp"
+#include "multi_index_transform_helper.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "blockwise_tensor_slice_transfer.hpp"
+#include "threadwise_tensor_slice_transfer.hpp"
 #include "blockwise_gemm_dlops_v3.hpp"

 namespace ck {

@@ -47,7 +47,7 @@ template <index_t BlockSize,
           typename CGlobalIteratorHacks,
           typename AGlobalMoveSliceWindowIteratorHacks,
           typename BGlobalMoveSliceWindowIteratorHacks>
-struct GridwiseDynamicGemmDlops_km_kn_mn_v3
+struct GridwiseGemmDlops_km_kn_mn_v3
 {
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {

@@ -58,7 +58,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);

         // LDS allocation for A and B: be careful of alignment

@@ -132,22 +132,20 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_e_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);

-        constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);

         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto b_e_n_ho_wo_block_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+            make_naive_tensor_descriptor_packed(make_tuple(
                 Number<EPerBlock>{}, Number<1>{}, Number<HoPerBlock>{}, Number<WoPerBlock>{}));

         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
         constexpr auto c_k_n_ho_wo_thread_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+            make_naive_tensor_descriptor_packed(make_tuple(
                 Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));

         auto blockwise_gemm =

@@ -182,7 +180,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
         // A matrix blockwise copy
         auto a_blockwise_copy =
-            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                             InMemoryDataOperationEnum_t::Set,
                                             Sequence<E, KPerBlock>,
                                             ABlockTransferThreadSliceLengths_E_K,

@@ -201,18 +199,16 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
                                             1,
                                             AThreadTransferSrcResetCoordinateAfterRun,
                                             true>(
                 a_e_k_global_desc,
                 make_multi_index(0, k_block_data_on_global),
                 a_e_k_desc,
                 make_multi_index(0, 0));

         constexpr auto b_e_n_ho_wo_thread_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+            make_naive_tensor_descriptor_packed(make_tuple(
                 Number<EPerBlock>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));

-        auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2<
-            FloatAB,
+        auto b_threadwise_transfer =
+            ThreadwiseTensorSliceTransfer_v2<FloatAB,
             FloatAB,
             decltype(b_e_n_ho_wo_global_desc),
             decltype(b_e_n_ho_wo_thread_desc),

@@ -221,7 +217,8 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
             BBlockTransferSrcVectorDim,
             BBlockTransferSrcScalarPerVector,
             1,
             true>(
                 b_e_n_ho_wo_global_desc,
                 make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global));

         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(

@@ -234,7 +231,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
             c_thread_buf;

         // initialize output thread tensor
-        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
+        ThreadwiseTensorSliceSet_v1<FloatAcc,
                                     decltype(c_k_n_ho_wo_thread_desc),
                                     Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
             .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});

@@ -354,8 +351,7 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
         const index_t k_thread_data_on_global =
             k_block_data_on_global + k_thread_id * KPerThread;

-        ThreadwiseDynamicTensorSliceTransfer_v1r3<
+        ThreadwiseTensorSliceTransfer_v1r3<
             FloatAcc,
             FloatC,
             decltype(c_k_n_ho_wo_thread_desc),
             decltype(c_k_n_ho_wo_global_desc),
...
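The ThreadwiseTensorSliceSet_v1{}.Run(..., FloatAcc{0}) call above zero-fills the per-thread accumulator tile before the main loop. What it accomplishes, sketched in plain C++ (shapes and names illustrative, not CK's API):

    // Zero a per-thread register tile, as the .Run(..., FloatAcc{0}) above does.
    #include <array>

    template <typename T, int KPerThread, int HoPerThread, int WoPerThread>
    void set_thread_tile(std::array<T, KPerThread * 1 * HoPerThread * WoPerThread>& buf, T value)
    {
        for (auto& x : buf)
            x = value;
    }

    int main()
    {
        std::array<float, 4 * 1 * 2 * 2> c_thread_buf{};
        set_thread_tile<float, 4, 2, 2>(c_thread_buf, 0.0f); // FloatAcc{0}
    }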
composable_kernel/include/tensor_operation/gridwise_
dynamic_
gemm_xdlops_v2r3.hpp
→
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
View file @
c03045ce
#ifndef CK_GRIDWISE_
DYNAMIC_
GEMM_XDLOPS_V2R3_HPP
#ifndef CK_GRIDWISE_GEMM_XDLOPS_V2R3_HPP
#define CK_GRIDWISE_
DYNAMIC_
GEMM_XDLOPS_V2R3_HPP
#define CK_GRIDWISE_GEMM_XDLOPS_V2R3_HPP
#include "common_header.hpp"
#include "common_header.hpp"
#include "
dynamic_
multi_index_transform_helper.hpp"
#include "multi_index_transform_helper.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "blockwise_
dynamic_
tensor_slice_transfer.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
#include "threadwise_
dynamic_
tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "threadwise_
dynamic_
tensor_slice_set.hpp"
#include "threadwise_tensor_slice_set.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -24,7 +24,7 @@ __global__ void
...
@@ -24,7 +24,7 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
#endif
kernel_
dynamic_
gemm_xdlops_v2r3
(
const
FloatAB
*
__restrict__
p_a_grid
,
kernel_gemm_xdlops_v2r3
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_c_grid
,
const
AK0MK1GridDesc
a_k0_m_k1_grid_desc
,
const
AK0MK1GridDesc
a_k0_m_k1_grid_desc
,
...
@@ -58,7 +58,7 @@ __global__ void
...
@@ -58,7 +58,7 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
#endif
kernel_
dynamic_
gemm_xdlops_v2r3
(
const
FloatAB
*
__restrict__
p_a_grid
,
kernel_gemm_xdlops_v2r3
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_c_grid
,
const
void
CONSTANT
*
p_a_k0_m_k1_grid_desc
,
const
void
CONSTANT
*
p_a_k0_m_k1_grid_desc
,
...
@@ -132,7 +132,7 @@ template <index_t BlockSize,
...
@@ -132,7 +132,7 @@ template <index_t BlockSize,
typename
AGridMoveSliceWindowIteratorHacks
,
typename
AGridMoveSliceWindowIteratorHacks
,
typename
BGridMoveSliceWindowIteratorHacks
,
typename
BGridMoveSliceWindowIteratorHacks
,
bool
CAccessOrderMRepeatNRepeat
>
bool
CAccessOrderMRepeatNRepeat
>
struct
Gridwise
Dynamic
Gemm_k0mk1_k0nk1_mn_xdlops_v2r3
struct
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
{
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -148,12 +148,12 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
...
@@ -148,12 +148,12 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
// A matrix in LDS memory, dst of blockwise copy
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
a_k0_m_k1_block_desc
=
make_
dynamic_
naive_tensor_descriptor_aligned_v2
(
constexpr
auto
a_k0_m_k1_block_desc
=
make_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
MPerBlock
>
{},
K1
),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_k0_n_k1_block_desc
=
make_
dynamic_
naive_tensor_descriptor_aligned_v2
(
constexpr
auto
b_k0_n_k1_block_desc
=
make_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
make_tuple
(
Number
<
KPerBlock
>
{},
Number
<
NPerBlock
>
{},
K1
),
max_lds_align
);
// LDS allocation for A and B: be careful of alignment
// LDS allocation for A and B: be careful of alignment
...
@@ -216,7 +216,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        constexpr auto N1 = Number<CLayout.N0()>{};

-        const auto c_m0_m1_m2_n_grid_desc = transform_dynamic_tensor_descriptor(
+        const auto c_m0_m1_m2_n_grid_desc = transform_tensor_descriptor(
            c_m_n_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, M0, M1, M2)),
                       make_unmerge_transform(make_tuple(NRepeat, NWaves, N1))),
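The unmerge transforms above split the flat M and N indices of the C grid into nested (repeat, wave, intra-wave) coordinates. The index math is a mixed-radix decomposition; a small sketch (not CK code; the lengths are made-up examples):

#include <array>
#include <cstdio>

std::array<int, 5> unmerge(int m, const std::array<int, 5>& lengths)
{
    std::array<int, 5> idx{};
    for(int i = 4; i >= 0; --i) // last dimension varies fastest
    {
        idx[i] = m % lengths[i];
        m /= lengths[i];
    }
    return idx;
}

int main()
{
    const auto idx = unmerge(77, {2, 2, 4, 2, 4});
    std::printf("77 -> [%d %d %d %d %d]\n", idx[0], idx[1], idx[2], idx[3], idx[4]);
}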
...
@@ -290,17 +290,17 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        // A matrix in LDS memory, dst of blockwise copy
        //   be careful of LDS alignment
-        constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);

        // B matrix in LDS memory, dst of blockwise copy
        //   be careful of LDS alignment
-        constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);

        // A matrix blockwise copy
        auto a_blockwise_copy =
-            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<KPerBlock, MPerBlock, K1>,
                                            ABlockTransferThreadSliceLengths_K0_M_K1,
...
@@ -319,15 +319,14 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                            1,
                                            1,
                                            AThreadTransferSrcResetCoordinateAfterRun,
                                            true>(
                a_k0_m_k1_grid_desc,
                make_multi_index(0, m_block_data_idx_on_grid, 0),
                a_k0_m_k1_block_desc,
                make_multi_index(0, 0, 0));

        // B matrix blockwise copy
        auto b_blockwise_copy =
-            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<KPerBlock, NPerBlock, K1>,
                                            BBlockTransferThreadSliceLengths_K0_N_K1,
...
@@ -346,8 +345,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                            1,
                                            1,
                                            BThreadTransferSrcResetCoordinateAfterRun,
                                            true>(
                b_k0_n_k1_grid_desc,
                make_multi_index(0, n_block_data_idx_on_grid, 0),
                b_k0_n_k1_block_desc,
                make_multi_index(0, 0, 0));
...
@@ -364,7 +362,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                          NPerBlock % (NPerWave * NRepeat) == 0,
                      "wrong!");

-        constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor(
+        constexpr auto a_k0_m0_m1_k1_block_desc = transform_tensor_descriptor(
            a_k0_m_k1_block_desc,
            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
                       make_unmerge_transform(
...
@@ -373,7 +371,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));

-        constexpr auto b_k0_n0_n1_k1_block_desc = transform_dynamic_tensor_descriptor(
+        constexpr auto b_k0_n0_n1_k1_block_desc = transform_tensor_descriptor(
            b_k0_n_k1_block_desc,
            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
                       make_unmerge_transform(
...
@@ -399,8 +397,8 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        static_assert(NumBlks == 1 && NumXdlops == 1, "K Reduction Mfma only");

-        constexpr auto c_mr_nr_blk_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<MRepeat>{}, Number<NRepeat>{}));
+        constexpr auto c_mr_nr_blk_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{}, Number<NRepeat>{}));

        StaticBuffer<AddressSpaceEnum_t::Vgpr,
                     vector_type<FloatAcc, BlkSize>,
...
@@ -492,7 +490,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        constexpr index_t N1 = CLayout.N0();

        constexpr auto c_m0_m1_m2_n_thread_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(Number<MRepeat>{},
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
                                                Number<NRepeat>{},
                                                Number<1>{},
                                                Number<1>{},
...
@@ -533,7 +531,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat);
        constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat);

-        ThreadwiseDynamicTensorSliceTransfer_v1r3<FloatC,
+        ThreadwiseTensorSliceTransfer_v1r3<FloatC,
                                           FloatC,
                                           decltype(c_m0_m1_m2_n_thread_desc),
...
@@ -567,9 +565,8 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        constexpr index_t M1 = CLayout.N1();
        constexpr index_t M2 = CLayout.M0();

        constexpr auto c_m0_m1_m2_n_thread_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(
+            make_naive_tensor_descriptor_packed(
                make_tuple(I1, I1, I1, I1, Number<M0>{}, Number<1>{}, Number<M2>{}, Number<1>{}));

        // calculate origin of thread output tensor on global memory
        //     blockwise GEMM c matrix starting index
...
@@ -585,7 +582,7 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        constexpr auto c_m0_m1_m2_n_grid_tensor_iterator_hacks = CGridIteratorHacks{};

        auto c_thread_copy =
-            ThreadwiseDynamicTensorSliceTransfer_v1r3<FloatC,
+            ThreadwiseTensorSliceTransfer_v1r3<FloatC,
                                               FloatC,
                                               decltype(c_m0_m1_m2_n_thread_desc),
                                               decltype(c_m0_m1_m2_n_grid_desc),
...
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp → composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp
-#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SET_HPP
-#define CK_THREADWISE_DYNAMIC_TENSOR_SET_HPP
+#ifndef CK_THREADWISE_TENSOR_SET_HPP
+#define CK_THREADWISE_TENSOR_SET_HPP

#include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"

namespace ck {
...
@@ -16,7 +16,7 @@ template <typename Data,
          typename Desc,
          typename SliceLengths,
          typename std::enable_if<Desc::IsKnownAtCompileTime(), bool>::type = false>
-struct ThreadwiseDynamicTensorSliceSet_v1
+struct ThreadwiseTensorSliceSet_v1
{
    static constexpr index_t nDim = SliceLengths::Size();
...
@@ -40,7 +40,7 @@ struct ThreadwiseDynamicTensorSliceSet_v1
        constexpr auto origin_idx = to_multi_index(OriginIdx{});

        static_ford<SliceLengths>{}([&](auto access_idx) {
-            constexpr auto coord = make_dynamic_tensor_coordinate(desc, origin_idx + access_idx);
+            constexpr auto coord = make_tensor_coordinate(desc, origin_idx + access_idx);

            constexpr bool is_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(desc, coord);
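The static_ford loop above sweeps every index in the slice at compile time, converts it to a coordinate, and writes only where the descriptor reports a valid offset (e.g. skipping padding). A runtime sketch of the same control flow (not CK code; the 2-D lengths and strides are made-up):

#include <cstdio>
#include <vector>

int main()
{
    const int len0 = 2, len1 = 3;       // SliceLengths stand-in
    const int stride0 = 4, stride1 = 1; // descriptor stand-in
    std::vector<float> buf(8, -1.0f);

    for(int i0 = 0; i0 < len0; ++i0)
        for(int i1 = 0; i1 < len1; ++i1)
        {
            const int offset = i0 * stride0 + i1 * stride1;
            if(offset < static_cast<int>(buf.size())) // validity check stand-in
                buf[offset] = 0.0f;
        }

    for(float v : buf)
        std::printf("%.0f ", v);
    std::printf("\n");
}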
...
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp → composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
(diff collapsed)
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer_v2.hpp → composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp

-#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP
-#define CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP
+#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V2_HPP
+#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V2_HPP

#include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"

namespace ck {
...
@@ -30,7 +30,7 @@ template <typename SliceLengths,
          bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
                                           // RunWrite(), will be fused with MoveDstSliceWindow to
                                           // save addr computation
-struct ThreadwiseDynamicTensorSliceTransfer_v3r1
+struct ThreadwiseTensorSliceTransfer_v3r1
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
...
@@ -38,18 +38,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

-    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
-    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
+    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
+    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

-    using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{}));
-    using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{}));
+    using SrcCoordIterator = decltype(make_tensor_coordinate_iterator(SrcDesc{}, Index{}));
+    using DstCoordIterator = decltype(make_tensor_coordinate_iterator(DstDesc{}, Index{}));

-    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3r1(const SrcDesc& src_desc,
+    __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(const SrcDesc& src_desc,
                                                            const Index& src_slice_origin,
                                                            const DstDesc& dst_desc,
                                                            const Index& dst_slice_origin)
-        : src_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
-          dst_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
+        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
+          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
    {
        // TODO: fix this
        static_assert(is_same<SrcData, DstData>::value,
...
@@ -64,12 +64,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
-        src_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
+        src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
-        dst_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
+        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    template <typename SrcBuffer, typename SrcIteratorHacks>
...
@@ -96,8 +96,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                I1),
            SrcVectorTensorContiguousDimOrder{});

-        constexpr auto src_vector_desc = make_dynamic_naive_tensor_descriptor_v2(
-            sequence_to_tuple_of_number(src_vector_tensor_lengths),
+        constexpr auto src_vector_desc =
+            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(src_vector_tensor_lengths),
            sequence_to_tuple_of_number(src_vector_tensor_strides));

        // access order and lengths
...
@@ -117,7 +117,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                    forward_step(j) = (i.value == j.value) ? src_vector_tensor_lengths[i] : 0;
                });

-                return make_dynamic_tensor_coordinate_iterator(
+                return make_tensor_coordinate_iterator(
                    src_desc, forward_step, src_iterator_hacks[I0][i]);
            },
            Number<nDim>{});
...
@@ -131,7 +131,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                    backward_step(j) = (i.value == j.value) ? -src_vector_tensor_lengths[i] : 0;
                });

-                return make_dynamic_tensor_coordinate_iterator(
+                return make_tensor_coordinate_iterator(
                    src_desc, backward_step, src_iterator_hacks[I1][i]);
            },
            Number<nDim>{});
...
@@ -219,12 +219,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
        {
            if constexpr(forward_sweep[i])
            {
-                move_dynamic_tensor_coordinate(
+                move_tensor_coordinate(
                    src_desc, src_coord_, src_forward_iterators[src_dim_access_order[i]]);
            }
            else
            {
-                move_dynamic_tensor_coordinate(
+                move_tensor_coordinate(
                    src_desc, src_coord_, src_backward_iterators[src_dim_access_order[i]]);
            }
        }
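The forward_sweep bookkeeping above implements a snake (boustrophedon) traversal: the sweep direction alternates per dimension, so consecutive accesses always differ by a single coordinate step and the address update stays a small increment. A sketch of the visit order (not CK code):

#include <cstdio>

int main()
{
    const int L0 = 3, L1 = 4;
    for(int i0 = 0; i0 < L0; ++i0)
    {
        const bool forward = (i0 % 2 == 0); // forward_sweep stand-in
        for(int k = 0; k < L1; ++k)
        {
            const int i1 = forward ? k : L1 - 1 - k;
            std::printf("(%d,%d) ", i0, i1);
        }
    }
    std::printf("\n");
}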
...
@@ -235,9 +235,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_reset_iterator =
-                make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep());
+                make_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep());

-            move_dynamic_tensor_coordinate(src_desc, src_coord_, src_reset_iterator);
+            move_tensor_coordinate(src_desc, src_coord_, src_reset_iterator);
        }
    }
...
@@ -265,8 +265,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                I1),
            DstVectorTensorContiguousDimOrder{});

-        constexpr auto dst_vector_desc = make_dynamic_naive_tensor_descriptor_v2(
-            sequence_to_tuple_of_number(dst_vector_tensor_lengths),
+        constexpr auto dst_vector_desc =
+            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(dst_vector_tensor_lengths),
            sequence_to_tuple_of_number(dst_vector_tensor_strides));

        // dst access order and lengths
...
@@ -286,7 +286,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                    forward_step(j) = (i.value == j.value) ? dst_vector_tensor_lengths[i] : 0;
                });

-                const auto forward_iterator = make_dynamic_tensor_coordinate_iterator(
+                const auto forward_iterator = make_tensor_coordinate_iterator(
                    dst_desc, forward_step, dst_iterator_hacks[I0][i]);

                return forward_iterator;
...
@@ -302,7 +302,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                    backward_step(j) = (i.value == j.value) ? -dst_vector_tensor_lengths[i] : 0;
                });

-                const auto backward_iterator = make_dynamic_tensor_coordinate_iterator(
+                const auto backward_iterator = make_tensor_coordinate_iterator(
                    dst_desc, backward_step, dst_iterator_hacks[I1][i]);

                return backward_iterator;
...
@@ -394,12 +394,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
        {
            if constexpr(forward_sweep[i])
            {
-                move_dynamic_tensor_coordinate(
+                move_tensor_coordinate(
                    dst_desc, dst_coord_, dst_forward_iterators[dst_dim_access_order[i]]);
            }
            else
            {
-                move_dynamic_tensor_coordinate(
+                move_tensor_coordinate(
                    dst_desc, dst_coord_, dst_backward_iterators[dst_dim_access_order[i]]);
            }
        }
...
@@ -410,9 +410,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_iterator =
-                make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep());
+                make_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep());

-            move_dynamic_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator);
+            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator);
        }
    }
...
@@ -564,10 +564,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                                          : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
-        const auto adjusted_step =
-            make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx);
+        const auto adjusted_step = make_tensor_coordinate_iterator(src_desc, adjusted_step_idx);

-        move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step);
+        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
...
@@ -583,10 +582,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                                          : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_dynamic_tensor_coordinate_iterator(
+        const auto adjusted_step = make_tensor_coordinate_iterator(
            src_desc, adjusted_step_idx, src_move_slice_window_iterator_hack);

-        move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step);
+        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
...
@@ -598,15 +597,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3r1
                                          : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

        // is it OK to construct a new step every time?
-        const auto adjusted_step =
-            make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx);
+        const auto adjusted_step = make_tensor_coordinate_iterator(dst_desc, adjusted_step_idx);

-        move_dynamic_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
+        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    static constexpr auto buffer_desc_ =
-        make_dynamic_naive_tensor_descriptor_packed_v2(sequence_to_tuple_of_number(SliceLengths{}));
+        make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{}));

    static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
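The adjusted_step_idx selection in these Move*SliceWindow members folds two motions into one coordinate update: if the preceding RunRead()/RunWrite() did not reset the coordinate to the slice origin, the window move must also retrace that reset. A scalar sketch of the accounting along one dimension (not CK code):

#include <cstdio>

int main()
{
    const int origin      = 10; // slice origin
    const int after_read  = 17; // coordinate after a RunRead() that did not reset
    const int reset_step  = origin - after_read; // -7, what a reset would apply
    const int window_step = 4;                   // requested slice-window move

    // coordinate already reset: move by window_step alone;
    // otherwise fold the pending reset into the same move
    const int adjusted = window_step + reset_step;
    std::printf("adjusted step without prior reset: %d\n", adjusted);
}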
...
@@ -640,7 +638,7 @@ template <
    typename SrcVectorTensorContiguousDimOrder,
    typename std::enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                            bool>::type = false>
-struct ThreadwiseDynamicTensorSliceTransfer_v4r1
+struct ThreadwiseTensorSliceTransfer_v4r1
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
...
@@ -649,12 +647,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1
    using Index = MultiIndex<nDim>;

-    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
+    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));

-    using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{}));
+    using SrcCoordIterator = decltype(make_tensor_coordinate_iterator(SrcDesc{}, Index{}));

-    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v4r1(const Index& src_ref_idx)
-        : src_ref_coord_(make_dynamic_tensor_coordinate(SrcDesc{}, src_ref_idx))
+    __device__ constexpr ThreadwiseTensorSliceTransfer_v4r1(const Index& src_ref_idx)
+        : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx))
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to known at compile-time");
...
@@ -712,8 +710,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1
                I1),
            SrcVectorTensorContiguousDimOrder{});

-        constexpr auto src_vector_desc = make_dynamic_naive_tensor_descriptor_v2(
-            sequence_to_tuple_of_number(src_vector_tensor_lengths),
+        constexpr auto src_vector_desc =
+            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(src_vector_tensor_lengths),
            sequence_to_tuple_of_number(src_vector_tensor_strides));

        // access order and lengths
...
@@ -735,12 +733,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1
            src_ref_to_origin_disp_idx + data_to_origin_disp_idx;

        constexpr auto src_ref_to_data_disp_coord_iterator =
-            make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx);
+            make_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx);

        auto src_data_coord = src_ref_coord_;

-        move_dynamic_tensor_coordinate(
-            src_desc, src_data_coord, src_ref_to_data_disp_coord_iterator);
+        move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_iterator);

        vector_type_maker_t<SrcData, src_vector_desc.GetElementSpaceSize()> src_vector;
@@ -775,10 +772,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1
...
@@ -775,10 +772,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4r1
{
{
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
const
auto
src_slice_move_step_iter
=
make_dynamic_tensor_coordinate_iterator
(
const
auto
src_slice_move_step_iter
=
src_desc
,
to_multi_index
(
src_slice_move_step_idx
));
make_tensor_coordinate_iterator
(
src_desc
,
to_multi_index
(
src_slice_move_step_idx
));
move_
dynamic_
tensor_coordinate
(
SrcDesc
{},
src_ref_coord_
,
src_slice_move_step_iter
);
move_tensor_coordinate
(
SrcDesc
{},
src_ref_coord_
,
src_slice_move_step_iter
);
}
}
private:
private:
...
composable_kernel/include/utility/config.hpp
...
@@ -99,8 +99,8 @@
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
-#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
+#ifndef CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif

// workaround for compiler crash when compiling recursive lambda
...
composable_kernel/include/utility/dynamic_buffer.hpp
-#ifndef CK_DYNAMIC_BUFFER_HPP
-#define CK_DYNAMIC_BUFFER_HPP
+#ifndef CK_BUFFER_HPP
+#define CK_BUFFER_HPP

#include "amd_buffer_addressing.hpp"
#include "c_style_pointer_cast.hpp"
...
composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp → composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp
#include "common_header.hpp"
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
gemm_dlops_v1r2.hpp"
#include "gridwise_gemm_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
using
namespace
ck
;
using
namespace
ck
;
...
@@ -64,8 +64,7 @@ constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDs
constexpr bool HasMainKBlockLoop       = static_cast<bool>(CK_PARAM_HAS_MAIN_KBLOCK_LOOP);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP);

extern "C" __global__ void
-dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare(
+convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare(
    int n,
    int c,
    int hi,
...
@@ -93,12 +92,9 @@ dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare(
    const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
    const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;

-    const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi));
-    const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x));
-    const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo));
+    const auto in_n_c_hi_wi_desc  = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi));
+    const auto wei_k_c_y_x_desc   = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x));
+    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo));

    const auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(
        wei_k_c_y_x_desc,
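The ho/wo lines use the standard convolution output-size formula, ho = (hi + padL + padR - dilation*(y - 1) - 1) / stride + 1. A quick numeric check (not CK code) with the 28x28, 3x3, pad-1, stride-1 shape this wrapper exercises elsewhere:

#include <cstdio>

int main()
{
    const int hi = 28, leftPadH = 1, rightPadH = 1, convDilationY = 1, y = 3, convStrideH = 1;
    const int ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
    std::printf("ho = %d\n", ho); // 28: 3x3, pad 1, stride 1 preserves the size
}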
...
@@ -151,7 +147,7 @@ dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare(
    using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;

    using GridwiseGemm =
-        GridwiseDynamicGemmDlops_km_kn_mn_v1r2<BlockSize,
+        GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
                                        FloatAB,
                                        FloatAcc,
                                        FloatC,
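For orientation, the GEMM this wrapper instantiates comes from the usual implicit-GEMM view of NCHW forward convolution: weights form the A matrix, unfolded input patches the B matrix, and the output the C matrix. A sketch of the resulting shape (not CK code; the numbers mirror the 256/3x3/28x28 case used below):

#include <cstdio>

int main()
{
    const int n = 256, c = 256, k = 256, y = 3, x = 3, ho = 28, wo = 28;
    const int GemmM = k;           // one row per output channel
    const int GemmK = c * y * x;   // reduction over input channels and filter taps
    const int GemmN = n * ho * wo; // one column per output pixel
    std::printf("implicit GEMM: M=%d N=%d K=%d\n", GemmM, GemmN, GemmK);
}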
...
@@ -216,7 +212,7 @@ extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
-    dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
+    convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
        FloatC* __restrict__ p_c_grid,
...
@@ -230,11 +226,11 @@ extern "C" __global__ void
    constexpr auto I2 = Number<2>{};

    constexpr auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
    constexpr auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
    constexpr auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));

    constexpr auto descs =
        transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
...
@@ -287,7 +283,7 @@ extern "C" __global__ void
    using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;

    using GridwiseGemm =
-        GridwiseDynamicGemmDlops_km_kn_mn_v1r2<BlockSize,
+        GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
                                        FloatAB,
                                        FloatAcc,
                                        FloatC,
...
composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp → composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp
#include "common_header.hpp"
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
gemm_xdlops_v2r3.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
using
namespace
ck
;
using
namespace
ck
;
...
@@ -60,8 +60,7 @@ using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDst
constexpr index_t CThreadTransferSrcDstVectorDim    = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;

extern "C" __global__ void
-dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare(
+convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare(
    int n,
    int c,
    int hi,
...
@@ -89,12 +88,9 @@ dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare(
    const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
    const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;

-    const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi));
-    const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x));
-    const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo));
+    const auto in_n_c_hi_wi_desc  = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi));
+    const auto wei_k_c_y_x_desc   = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x));
+    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo));

    const auto descs = transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
        wei_k_c_y_x_desc,
...
@@ -148,7 +144,7 @@ dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare(
    using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;

    using GridwiseGemm =
-        GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
                                                FloatAB,
                                                FloatAcc,
                                                FloatC,
...
@@ -212,7 +208,7 @@ extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
-    dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
+    convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
        FloatC* __restrict__ p_c_grid,
...
@@ -227,11 +223,11 @@ extern "C" __global__ void
    constexpr auto I2 = Number<2>{};

    constexpr auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
    constexpr auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
    constexpr auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));

    constexpr auto descs =
        transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
...
@@ -285,7 +281,7 @@ extern "C" __global__ void
    using CMNGridDesc = decltype(c_m_n_grid_desc);

    using GridwiseGemm =
-        GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
                                                FloatAB,
                                                FloatAcc,
                                                FloatC,
...
composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp → composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp
#include "common_header.hpp"
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
gemm_xdlops_v2r3.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
using
namespace
ck
;
using
namespace
ck
;
...
@@ -60,8 +60,7 @@ using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDst
constexpr index_t CThreadTransferSrcDstVectorDim    = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;

extern "C" __global__ void
-dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare(
+convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare(
    int n,
    int hi,
    int wi,
...
@@ -89,12 +88,9 @@ dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare(
    const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
    const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;

-    const auto in_n_hi_wi_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, hi, wi, c));
-    const auto wei_k_y_x_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, y, x, c));
-    const auto out_n_ho_wo_k_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, ho, wo, k));
+    const auto in_n_hi_wi_c_desc  = make_naive_tensor_descriptor_packed(make_tuple(n, hi, wi, c));
+    const auto wei_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(make_tuple(k, y, x, c));
+    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(make_tuple(n, ho, wo, k));

    const auto descs = transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(
        in_n_hi_wi_c_desc,
...
@@ -148,7 +144,7 @@ dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare(
    using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>;

    using GridwiseGemm =
-        GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
                                                FloatAB,
                                                FloatAcc,
                                                FloatC,
...
@@ -212,7 +208,7 @@ extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
-    dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
+    convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
        FloatC* __restrict__ p_c_grid,
...
@@ -227,11 +223,11 @@ extern "C" __global__ void
    constexpr auto I2 = Number<2>{};

    constexpr auto in_n_hi_wi_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 28, 28, 256));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));
    constexpr auto wei_k_y_x_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 3, 3, 256));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 3, 3, 256));
    constexpr auto out_n_ho_wo_k_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 28, 28, 256));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));

    constexpr auto descs =
        transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc,
...
@@ -285,7 +281,7 @@ extern "C" __global__ void
    using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>;

    using GridwiseGemm =
-        GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
                                                FloatAB,
                                                FloatAcc,
                                                FloatC,
...
composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp → composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp
#include "common_header.hpp"
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
contraction_dlops_v1r2.hpp"
#include "gridwise_contraction_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
using
namespace
ck
;
using
namespace
ck
;
...
@@ -62,7 +62,7 @@ constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HasMainKBloc
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HasDoubleTailKBlockLoop);

extern "C" __global__ void
-dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(index_t N,
+convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(index_t N,
                                                                    index_t C,
                                                                    index_t Hi,
                                                                    index_t Wi,
...
@@ -88,12 +88,9 @@ dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(inde
    const index_t Wo =
        (Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1;

-    const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C, Hi, Wi));
-    const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C, Y, X));
-    const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo));
+    const auto in_n_c_hi_wi_desc  = make_naive_tensor_descriptor_packed(make_tuple(N, C, Hi, Wi));
+    const auto wei_k_c_y_x_desc   = make_naive_tensor_descriptor_packed(make_tuple(K, C, Y, X));
+    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));

    const auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(
        wei_k_c_y_x_desc,
...
@@ -160,7 +157,7 @@ dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(inde
        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;

    using GridwiseContraction =
-        GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
+        GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
            BlockSize,
            FloatAB,
            FloatAcc,
...
@@ -220,7 +217,7 @@ extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
-    dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
+    convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
        FloatC* __restrict__ p_c_grid,
...
@@ -232,11 +229,11 @@ extern "C" __global__ void
    constexpr auto I3 = Number<3>{};

    constexpr auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
    constexpr auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
    constexpr auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28));
+        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));

    constexpr auto descs =
        transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
...
@@ -303,7 +300,7 @@ extern "C" __global__ void
        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;

    using GridwiseContraction =
-        GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
+        GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
            BlockSize,
            FloatAB,
            FloatAcc,
...
host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp → host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp
...
@@ -2,7 +2,7 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp"
-#include "driver_dynamic_gemm_xdlops_v2r3.hpp"
+#include "driver_gemm_xdlops_v2r3.hpp"

template <typename TInWei,
          typename TAcc,
...
@@ -14,7 +14,7 @@ template <typename TInWei,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
-void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk(
+void device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk(
    const InLengths& in_n_hi_wi_c_lengths,
    const WeiLengths& wei_k_y_x_c_lengths,
    const OutLengths& out_n_ho_wo_k_lengths,
...
@@ -44,12 +44,9 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyx
    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());

-    const auto in_n_hi_wi_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
-    const auto wei_k_y_x_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
-    const auto out_n_ho_wo_k_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
+    const auto in_n_hi_wi_c_desc  = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
+    const auto wei_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
+    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);

#if 1
    // [M, N, K0, K1] = [128, 128, 4, 4] for fp32
...
@@ -254,7 +251,7 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyx
    for(index_t i = 0; i < 5; ++i)
    {
-        float ave_time = driver_dynamic_gemm_xdlops_v2r3<
+        float ave_time = driver_gemm_xdlops_v2r3<
            BlockSize,
            TInWei,
            TAcc,
...
host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp → host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp
...
@@ -2,7 +2,7 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp"
-#include "driver_dynamic_gemm_xdlops_v2r3.hpp"
+#include "driver_gemm_xdlops_v2r3.hpp"

template <typename TInWei,
          typename TAcc,
...
@@ -14,7 +14,7 @@ template <typename TInWei,
...
@@ -14,7 +14,7 @@ template <typename TInWei,
typename
ConvDilations
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InLeftPads
,
typename
InRightPads
>
typename
InRightPads
>
void
device_
dynamic_
convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
(
void
device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
(
const
InLengths
&
in_n_hi_wi_c_lengths
,
const
InLengths
&
in_n_hi_wi_c_lengths
,
const
WeiLengths
&
wei_k_y_x_c_lengths
,
const
WeiLengths
&
wei_k_y_x_c_lengths
,
const
OutLengths
&
out_n_ho_wo_k_lengths
,
const
OutLengths
&
out_n_ho_wo_k_lengths
,
...
@@ -44,12 +44,9 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k
...
@@ -44,12 +44,9 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k
wei_k_y_x_c_device_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
wei_k_y_x_c_device_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
const
auto
in_n_hi_wi_c_desc
=
const
auto
in_n_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
in_n_hi_wi_c_lengths
);
make_dynamic_naive_tensor_descriptor_packed_v2
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_naive_tensor_descriptor_packed
(
wei_k_y_x_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
const
auto
out_n_ho_wo_k_desc
=
make_naive_tensor_descriptor_packed
(
out_n_ho_wo_k_lengths
);
make_dynamic_naive_tensor_descriptor_packed_v2
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
out_n_ho_wo_k_lengths
);
#if 0
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
...
@@ -226,7 +223,7 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k
...
@@ -226,7 +223,7 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
{
float
ave_time
=
driver_
dynamic_
gemm_xdlops_v2r3
<
float
ave_time
=
driver_gemm_xdlops_v2r3
<
BlockSize
,
BlockSize
,
TInWei
,
TInWei
,
TAcc
,
TAcc
,
...
...
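Because the commit is a pure rename, any out-of-tree code still using the old "dynamic_" spellings stops compiling. A hypothetical forwarding shim (not part of this commit) that would keep such call sites building during a migration, assuming composable_kernel's headers are in scope:

    #include <utility>

    // Hypothetical compatibility alias: forward the retired spelling to the
    // renamed descriptor factory so old call sites keep compiling.
    template <typename... Ts>
    constexpr auto make_dynamic_naive_tensor_descriptor_packed_v2(Ts&&... ts)
    {
        return make_naive_tensor_descriptor_packed(std::forward<Ts>(ts)...);
    }

The same pattern would cover driver_dynamic_gemm_xdlops_v2r3 and the device_dynamic_convolution_* entry points.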
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp → host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp

@@ -2,7 +2,7 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "driver_dynamic_gemm_dlops_v1r2.hpp"
+#include "driver_gemm_dlops_v1r2.hpp"

 template <typename TInWei,
           typename TAcc,
@@ -14,7 +14,7 @@ template <typename TInWei,
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
-void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
+void device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
     const InLengths& in_n_c_hi_wi_lengths,
     const WeiLengths& wei_k_c_y_x_lengths,
     const OutLengths& out_n_k_ho_wo_lengths,
@@ -43,12 +43,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
     wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
     out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());

-    const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
-    const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
-    const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
+    const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
+    const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
+    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);

 #if 1
     // cdata = 64, BlockSize = 256, 128x128x8
@@ -136,7 +133,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
     for(index_t i = 0; i < 5; ++i)
     {
-        float ave_time = driver_dynamic_gemm_dlops_v1r2<
+        float ave_time = driver_gemm_dlops_v1r2<
             BlockSize,
             TInWei,
             TAcc,
...
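These drivers all measure the launched kernel over five iterations, as in the loop above. A generic sketch of that measurement pattern (the harness and its callable parameter are illustrative, not the repository's API):

    #include <algorithm>
    #include <limits>

    // Run a launcher N times and keep the best reported average time in ms,
    // mirroring the "for(index_t i = 0; i < 5; ++i)" loops in these drivers.
    template <typename Launch>
    float best_average_time_ms(Launch&& launch, int repeats = 5)
    {
        float best = std::numeric_limits<float>::max();
        for(int i = 0; i < repeats; ++i)
        {
            best = std::min(best, launch());
        }
        return best;
    }

Taking the best of several runs filters out warm-up and scheduling noise; whether this driver keeps the best or the last value is not visible in the excerpt.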
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp → host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp

 #include <unistd.h>
 #include "device.hpp"
 #include "host_tensor.hpp"
-#include "driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
+#include "driver_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"

 template <typename TInWei,
           typename TAcc,
@@ -13,7 +13,7 @@ template <typename TInWei,
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
-void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
+void device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
     const InLengths& in_n_c_hi_wi_lengths,
     const WeiLengths& wei_k_c_y_x_lengths,
     const OutLengths& out_n_k_ho_wo_lengths,
@@ -48,12 +48,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
     wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
     out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());

-    const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
-    const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
-    const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
+    const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
+    const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
+    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);

 #if 0
     constexpr index_t BlockSize = 256;
@@ -212,9 +209,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
     for(index_t i = 0; i < 5; ++i)
     {
 #if 0
-        float ave_time = launch_kernel_dynamic_gemm_xdlops_v1
+        float ave_time = launch_kernel_gemm_xdlops_v1
 #else
-        float ave_time = launch_kernel_dynamic_gemm_xdlops_v2
+        float ave_time = launch_kernel_gemm_xdlops_v2
 #endif
             <BlockSize,
              TInWei,
...
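This wrapper picks between the v1 and v2 xdlops launchers with the preprocessor. A hypothetical compile-time alternative using if constexpr (the stub launchers below stand in for launch_kernel_gemm_xdlops_v1/v2, which in the real code are templates over BlockSize, TInWei, and friends):

    // Stand-ins for the two launchers named in the diff, reduced to plain
    // functions so the dispatch sketch is self-contained.
    float launch_kernel_gemm_xdlops_v1_stub() { return 1.0f; }
    float launch_kernel_gemm_xdlops_v2_stub() { return 2.0f; }

    // Compile-time selection replacing the "#if 0 / #else / #endif" toggle.
    template <bool UseV1>
    float launch_gemm_xdlops()
    {
        if constexpr(UseV1)
            return launch_kernel_gemm_xdlops_v1_stub();
        else
            return launch_kernel_gemm_xdlops_v2_stub();
    }

Making the wrapper a template matters: inside a template, the discarded if constexpr branch is never instantiated, so the untaken launcher costs nothing.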
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp → host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp

@@ -2,7 +2,7 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
-#include "driver_dynamic_gemm_dlops_v1r3.hpp"
+#include "driver_gemm_dlops_v1r3.hpp"

 template <typename TInWei,
           typename TAcc,
@@ -14,7 +14,7 @@ template <typename TInWei,
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
-void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
+void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
     const InLengths& in_n_hi_wi_c_lengths,
     const WeiLengths& wei_k_y_x_c_lengths,
     const OutLengths& out_n_ho_wo_k_lengths,
@@ -44,12 +44,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
     wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
     out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());

-    const auto in_n_hi_wi_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
-    const auto wei_k_y_x_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
-    const auto out_n_ho_wo_k_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
+    const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
+    const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
+    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);

 #if 0
     // [M, N, K0, K1] = [128, 128, 8, 1] for fp32
@@ -200,7 +197,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
     for(index_t i = 0; i < 5; ++i)
     {
-        float ave_time = driver_dynamic_gemm_dlops_v1r3<
+        float ave_time = driver_gemm_dlops_v1r3<
             BlockSize,
             TInWei,
             TAcc,
...
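These forward drivers treat the convolution as an implicit GEMM. As background (the mapping below is the standard one for forward convolution, stated here rather than quoted from this file): GemmM = K, GemmN = N * Ho * Wo, GemmK = C * Y * X. A tiny helper to make the arithmetic concrete:

    // Implicit-GEMM problem sizes for forward convolution, assuming the
    // standard mapping: output channels form M, output pixels form N,
    // and the reduction runs over input channels times the filter window.
    struct GemmSizes
    {
        long M, N, K;
    };

    GemmSizes implicit_gemm_sizes(long N, long K, long C, long Y, long X, long Ho, long Wo)
    {
        return {K, N * Ho * Wo, C * Y * X};
    }

For N = 128, K = 256, C = 192, Y = X = 3, Ho = Wo = 14, this gives M = 256, N = 25088, K = 1728; the "[M, N, K0, K1]" comments above then carve such a problem into block tiles, with the GEMM K dimension split as K0 * K1.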
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp → host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp

@@ -2,7 +2,7 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
-#include "driver_dynamic_gemm_xdlops_v2r3.hpp"
+#include "driver_gemm_xdlops_v2r3.hpp"

 template <typename TInWei,
           typename TAcc,
@@ -14,7 +14,7 @@ template <typename TInWei,
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
-void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
+void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
     const InLengths& in_n_c_hi_wi_lengths,
     const WeiLengths& wei_k_c_y_x_lengths,
     const OutLengths& out_n_k_ho_wo_lengths,
@@ -43,12 +43,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
     wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
     out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());

-    const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
-    const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
-    const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
+    const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
+    const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
+    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);

 #if 1
     // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
@@ -134,7 +131,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
     for(index_t i = 0; i < 5; ++i)
     {
-        float ave_time = driver_dynamic_gemm_xdlops_v2r3<
+        float ave_time = driver_gemm_xdlops_v2r3<
             BlockSize,
             TInWei,
             TAcc,
...
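Before any launch, each driver stages its tensors with DeviceMem::ToDevice. A minimal HIP equivalent of that copy (the helper and its vector argument are illustrative; the repository's DeviceMem API is not shown in this excerpt):

    #include <hip/hip_runtime.h>
    #include <vector>

    // Copy a host vector into an already-allocated device buffer, as the
    // ToDevice(...) calls above do for the in/wei/out tensors.
    template <typename T>
    void to_device(T* device_ptr, const std::vector<T>& host)
    {
        hipMemcpy(device_ptr, host.data(), host.size() * sizeof(T), hipMemcpyHostToDevice);
    }

Error checking of the returned hipError_t is omitted for brevity.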