Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
583aab02
Commit
583aab02
authored
Dec 15, 2021
by
Chao Liu
Browse files
clean up coordinate step hack
parent
69e771f6
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
16 additions
and
62 deletions
+16
-62
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r5.hpp
...el/include/tensor_operation/gridwise_gemm_xdlops_v2r5.hpp
+6
-29
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r6.hpp
...el/include/tensor_operation/gridwise_gemm_xdlops_v2r6.hpp
+6
-25
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v1r4.hpp
...ensor_operation/threadwise_tensor_slice_transfer_v1r4.hpp
+2
-4
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v1r5.hpp
...ensor_operation/threadwise_tensor_slice_transfer_v1r5.hpp
+2
-4
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r5.hpp
View file @
583aab02
...
@@ -107,11 +107,6 @@ template <index_t BlockSize,
...
@@ -107,11 +107,6 @@ template <index_t BlockSize,
typename
CThreadTransferSrcDstAccessOrder
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
index_t
CThreadTransferDstScalarPerVector
,
typename
AGridStepHacks
,
typename
BGridStepHacks
,
typename
CGridStepHacks
,
typename
AGridMoveSliceWindowStepHacks
,
typename
BGridMoveSliceWindowStepHacks
,
bool
CAccessOrderMRepeatNRepeat
,
bool
CAccessOrderMRepeatNRepeat
,
bool
ABlockLdsExtraM
,
bool
ABlockLdsExtraM
,
bool
BBlockLdsExtraN
>
bool
BBlockLdsExtraN
>
...
@@ -502,15 +497,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
...
@@ -502,15 +497,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
K0PerBlock
,
0
,
0
);
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
K0PerBlock
,
0
,
0
);
constexpr
auto
b_block_slice_copy_step
=
make_multi_index
(
K0PerBlock
,
0
,
0
);
constexpr
auto
b_block_slice_copy_step
=
make_multi_index
(
K0PerBlock
,
0
,
0
);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
AGridStepHacks
{};
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
BGridStepHacks
{};
// hack to control index calculation when move slice window for A and B matrix for
// threadwise copy
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hack
=
AGridMoveSliceWindowStepHacks
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hack
=
BGridMoveSliceWindowStepHacks
{};
auto
a_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
auto
a_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
p_a_block
,
a_block_desc_k0_m_k1
.
GetElementSpaceSize
());
p_a_block
,
a_block_desc_k0_m_k1
.
GetElementSpaceSize
());
auto
b_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
auto
b_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
...
@@ -518,8 +504,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
...
@@ -518,8 +504,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
// preload data into LDS
// preload data into LDS
{
{
a_blockwise_copy
.
RunRead
(
a_grid_desc_k0_m_k1
,
a_grid_buf
,
a_k0_m_k1_grid_step_hacks
);
a_blockwise_copy
.
RunRead
(
a_grid_desc_k0_m_k1
,
a_grid_buf
);
b_blockwise_copy
.
RunRead
(
b_grid_desc_k0_n_k1
,
b_grid_buf
,
b_k0_n_k1_grid_step_hacks
);
b_blockwise_copy
.
RunRead
(
b_grid_desc_k0_n_k1
,
b_grid_buf
);
a_blockwise_copy
.
RunWrite
(
a_block_desc_k0_m_k1
,
a_block_buf
);
a_blockwise_copy
.
RunWrite
(
a_block_desc_k0_m_k1
,
a_block_buf
);
b_blockwise_copy
.
RunWrite
(
b_block_desc_k0_n_k1
,
b_block_buf
);
b_blockwise_copy
.
RunWrite
(
b_block_desc_k0_n_k1
,
b_block_buf
);
...
@@ -535,20 +521,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
...
@@ -535,20 +521,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
do
do
{
{
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_m_k1
,
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_m_k1
,
a_block_slice_copy_step
);
a_block_slice_copy_step
,
b_blockwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_n_k1
,
b_block_slice_copy_step
);
a_k0_m_k1_grid_move_slice_window_step_hack
);
b_blockwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_n_k1
,
b_block_slice_copy_step
,
b_k0_n_k1_grid_move_slice_window_step_hack
);
a_blockwise_copy
.
RunRead
(
a_blockwise_copy
.
RunRead
(
a_grid_desc_k0_m_k1
,
a_grid_buf
);
a_grid_desc_k0_m_k1
,
a_grid_buf
,
a_k0_m_k1_grid_step_hacks
);
block_sync_lds
();
block_sync_lds
();
b_blockwise_copy
.
RunRead
(
b_blockwise_copy
.
RunRead
(
b_grid_desc_k0_n_k1
,
b_grid_buf
);
b_grid_desc_k0_n_k1
,
b_grid_buf
,
b_k0_n_k1_grid_step_hacks
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_block_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_block_buf
,
c_thread_buf
);
...
@@ -597,8 +577,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
...
@@ -597,8 +577,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
const
index_t
n_thread_data_on_grid
=
const
index_t
n_thread_data_on_grid
=
n_block_data_idx_on_grid
+
c_thread_mtx_on_block
[
I1
];
n_block_data_idx_on_grid
+
c_thread_mtx_on_block
[
I1
];
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks
=
CGridStepHacks
{};
const
auto
m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor
=
const
auto
m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor
=
make_single_stage_tensor_adaptor
(
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
M0
,
M1
,
M2
,
M3
,
M4
))),
make_tuple
(
make_merge_transform
(
make_tuple
(
M0
,
M1
,
M2
,
M3
,
M4
))),
...
@@ -651,7 +629,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
...
@@ -651,7 +629,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
c_thread_buf
,
c_thread_buf
,
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c_grid_buf
,
c_grid_buf
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks
,
c0_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c0_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c0_grid_buf
,
c0_grid_buf
,
c1_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c1_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
...
...
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r6.hpp
View file @
583aab02
...
@@ -101,11 +101,6 @@ template <index_t BlockSize,
...
@@ -101,11 +101,6 @@ template <index_t BlockSize,
typename
CThreadTransferSrcDstAccessOrder
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
index_t
CThreadTransferDstScalarPerVector
,
typename
AGridStepHacks
,
typename
BGridStepHacks
,
typename
CGridStepHacks
,
typename
AGridMoveSliceWindowStepHacks
,
typename
BGridMoveSliceWindowStepHacks
,
bool
CAccessOrderMRepeatNRepeat
,
bool
CAccessOrderMRepeatNRepeat
,
bool
ABlockLdsExtraM
,
bool
ABlockLdsExtraM
,
bool
BBlockLdsExtraN
>
bool
BBlockLdsExtraN
>
...
@@ -488,15 +483,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
...
@@ -488,15 +483,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
K0PerBlock
,
0
,
0
);
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
K0PerBlock
,
0
,
0
);
constexpr
auto
b_block_slice_copy_step
=
make_multi_index
(
K0PerBlock
,
0
,
0
);
constexpr
auto
b_block_slice_copy_step
=
make_multi_index
(
K0PerBlock
,
0
,
0
);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
AGridStepHacks
{};
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
BGridStepHacks
{};
// hack to control index calculation when move slice window for A and B matrix for
// threadwise copy
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hack
=
AGridMoveSliceWindowStepHacks
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hack
=
BGridMoveSliceWindowStepHacks
{};
auto
a_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
auto
a_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
p_a_block
,
a_block_desc_k0_m_k1
.
GetElementSpaceSize
());
p_a_block
,
a_block_desc_k0_m_k1
.
GetElementSpaceSize
());
auto
b_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
auto
b_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
...
@@ -504,8 +490,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
...
@@ -504,8 +490,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
// preload data into LDS
// preload data into LDS
{
{
a_blockwise_copy
.
RunRead
(
a_grid_desc_k0_m_k1
,
a_grid_buf
,
a_k0_m_k1_grid_step_hacks
);
a_blockwise_copy
.
RunRead
(
a_grid_desc_k0_m_k1
,
a_grid_buf
);
b_blockwise_copy
.
RunRead
(
b_grid_desc_k0_n_k1
,
b_grid_buf
,
b_k0_n_k1_grid_step_hacks
);
b_blockwise_copy
.
RunRead
(
b_grid_desc_k0_n_k1
,
b_grid_buf
);
a_blockwise_copy
.
RunWrite
(
a_block_desc_k0_m_k1
,
a_block_buf
);
a_blockwise_copy
.
RunWrite
(
a_block_desc_k0_m_k1
,
a_block_buf
);
b_blockwise_copy
.
RunWrite
(
b_block_desc_k0_n_k1
,
b_block_buf
);
b_blockwise_copy
.
RunWrite
(
b_block_desc_k0_n_k1
,
b_block_buf
);
...
@@ -522,19 +508,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
...
@@ -522,19 +508,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
do
do
{
{
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_m_k1
,
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_m_k1
,
a_block_slice_copy_step
,
a_block_slice_copy_step
);
a_k0_m_k1_grid_move_slice_window_step_hack
);
b_blockwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_n_k1
,
b_blockwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_n_k1
,
b_block_slice_copy_step
,
b_block_slice_copy_step
);
b_k0_n_k1_grid_move_slice_window_step_hack
);
a_blockwise_copy
.
RunRead
(
a_blockwise_copy
.
RunRead
(
a_grid_desc_k0_m_k1
,
a_grid_buf
,
a_k0_m_k1_grid_step_hacks
);
a_grid_desc_k0_m_k1
,
a_grid_buf
);
block_sync_lds
();
block_sync_lds
();
b_blockwise_copy
.
RunRead
(
b_blockwise_copy
.
RunRead
(
b_grid_desc_k0_n_k1
,
b_grid_buf
,
b_k0_n_k1_grid_step_hacks
);
b_grid_desc_k0_n_k1
,
b_grid_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_block_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_block_buf
,
c_thread_buf
);
...
@@ -583,8 +567,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
...
@@ -583,8 +567,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
const
index_t
n_thread_data_on_grid
=
const
index_t
n_thread_data_on_grid
=
n_block_data_idx_on_grid
+
c_thread_mtx_on_block
[
I1
];
n_block_data_idx_on_grid
+
c_thread_mtx_on_block
[
I1
];
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks
=
CGridStepHacks
{};
const
auto
m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor
=
const
auto
m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor
=
make_single_stage_tensor_adaptor
(
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
M0
,
M1
,
M2
,
M3
,
M4
))),
make_tuple
(
make_merge_transform
(
make_tuple
(
M0
,
M1
,
M2
,
M3
,
M4
))),
...
@@ -635,7 +617,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
...
@@ -635,7 +617,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
c_thread_buf
,
c_thread_buf
,
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c_grid_buf
,
c_grid_buf
,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks
,
c0_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c0_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c0_grid_buf
);
c0_grid_buf
);
}
}
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v1r4.hpp
View file @
583aab02
...
@@ -397,14 +397,12 @@ struct ThreadwiseTensorSliceTransfer_v1r4
...
@@ -397,14 +397,12 @@ struct ThreadwiseTensorSliceTransfer_v1r4
typename
SrcBuffer
,
typename
SrcBuffer
,
typename
DstBuffer
,
typename
DstBuffer
,
typename
Dst0Buffer
,
typename
Dst0Buffer
,
typename
Dst1Buffer
,
typename
Dst1Buffer
>
typename
DstStepHacks
>
__device__
void
Run
(
const
SrcDesc
&
,
__device__
void
Run
(
const
SrcDesc
&
,
const
SrcSliceOriginIdx
&
,
const
SrcSliceOriginIdx
&
,
const
SrcBuffer
&
src_buf
,
const
SrcBuffer
&
src_buf
,
const
DstDesc
&
dst_desc
,
const
DstDesc
&
dst_desc
,
DstBuffer
&
dst_buf
,
DstBuffer
&
dst_buf
,
const
DstStepHacks
&
dst_step_hacks
,
const
Dst0Desc
&
dst0_desc
,
const
Dst0Desc
&
dst0_desc
,
const
Dst0Buffer
&
dst0_buf
,
const
Dst0Buffer
&
dst0_buf
,
const
Dst1Desc
&
dst1_desc
,
const
Dst1Desc
&
dst1_desc
,
...
@@ -427,7 +425,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4
...
@@ -427,7 +425,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4
src_buf
,
src_buf
,
dst_desc
,
dst_desc
,
dst_buf
,
dst_buf
,
dst
_step_hacks
,
f
_step_hacks
(
dst_desc
)
,
dst0_desc
,
dst0_desc
,
dst0_buf
,
dst0_buf
,
f_step_hacks
(
dst0_desc
),
f_step_hacks
(
dst0_desc
),
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v1r5.hpp
View file @
583aab02
...
@@ -339,14 +339,12 @@ struct ThreadwiseTensorSliceTransfer_v1r5
...
@@ -339,14 +339,12 @@ struct ThreadwiseTensorSliceTransfer_v1r5
template
<
typename
SrcSliceOriginIdx
,
template
<
typename
SrcSliceOriginIdx
,
typename
SrcBuffer
,
typename
SrcBuffer
,
typename
DstBuffer
,
typename
DstBuffer
,
typename
Dst0Buffer
,
typename
Dst0Buffer
>
typename
DstStepHacks
>
__device__
void
Run
(
const
SrcDesc
&
,
__device__
void
Run
(
const
SrcDesc
&
,
const
SrcSliceOriginIdx
&
,
const
SrcSliceOriginIdx
&
,
const
SrcBuffer
&
src_buf
,
const
SrcBuffer
&
src_buf
,
const
DstDesc
&
dst_desc
,
const
DstDesc
&
dst_desc
,
DstBuffer
&
dst_buf
,
DstBuffer
&
dst_buf
,
const
DstStepHacks
&
dst_step_hacks
,
const
Dst0Desc
&
dst0_desc
,
const
Dst0Desc
&
dst0_desc
,
const
Dst0Buffer
&
dst0_buf
)
const
Dst0Buffer
&
dst0_buf
)
{
{
...
@@ -367,7 +365,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5
...
@@ -367,7 +365,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5
src_buf
,
src_buf
,
dst_desc
,
dst_desc
,
dst_buf
,
dst_buf
,
dst
_step_hacks
,
f
_step_hacks
(
dst_desc
)
,
dst0_desc
,
dst0_desc
,
dst0_buf
,
dst0_buf
,
f_step_hacks
(
dst0_desc
));
f_step_hacks
(
dst0_desc
));
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment