Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
15713b20
"profiler/src/profile_grouped_gemm.cpp" did not exist on "65d67fb7738eac6f8ec4b27544eb8d88307dfbef"
Commit
15713b20
authored
Jul 07, 2023
by
danyao12
Browse files
rename functions
parent
8defa341
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
21 additions
and
24 deletions
+21
-24
include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
...pl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
+1
-1
include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
...pl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
+1
-1
include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
...pl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
+1
-1
include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
...pl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
+1
-1
include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_ydotygrad.hpp
...gridwise_batched_mha_bwd_xdl_cshuffle_qloop_ydotygrad.hpp
+17
-20
No files found.
include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
View file @
15713b20
...
...
@@ -881,7 +881,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_Light_V1
c_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3_
=
GridwiseGemm
::
MakeCGridDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3
(
z_grid_desc_m_n_
);
d_y_grid_desc_mblock_mperblock_oblock_operblock_
=
GridwiseYDotYGrad
::
Make
C
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
GridwiseYDotYGrad
::
Make
Y
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
y_grid_desc_m_o_
);
// Print();
...
...
include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
View file @
15713b20
...
...
@@ -894,7 +894,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_Light_V2
c_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3_
=
GridwiseGemm
::
MakeCGridDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3
(
z_grid_desc_m_n_
);
d_y_grid_desc_mblock_mperblock_oblock_operblock_
=
GridwiseYDotYGrad
::
Make
C
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
GridwiseYDotYGrad
::
Make
Y
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
y_grid_desc_m_o_
);
// Print();
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
View file @
15713b20
...
...
@@ -949,7 +949,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Xdl_CShuffle_Light_V1
const
auto
d_block_2_ctile_map
=
GridwiseYDotYGrad
::
MakeDefaultBlock2CTileMap
(
y_grid_desc_m_o
);
const
auto
d_y_grid_desc_mblock_mperblock_nblock_nperblock
=
GridwiseYDotYGrad
::
Make
C
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
GridwiseYDotYGrad
::
Make
Y
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
y_grid_desc_m_o
);
index_t
d_num_blocks_per_batch
=
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
View file @
15713b20
...
...
@@ -951,7 +951,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Xdl_CShuffle_Light_V2
const
auto
d_block_2_ctile_map
=
GridwiseYDotYGrad
::
MakeDefaultBlock2CTileMap
(
y_grid_desc_m_o
);
const
auto
d_y_grid_desc_mblock_mperblock_nblock_nperblock
=
GridwiseYDotYGrad
::
Make
C
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
GridwiseYDotYGrad
::
Make
Y
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
y_grid_desc_m_o
);
index_t
d_num_blocks_per_batch
=
...
...
include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_ydotygrad.hpp
View file @
15713b20
...
...
@@ -22,7 +22,7 @@ namespace ck {
template
<
typename
InputDataType
,
typename
FloatD
,
typename
C
GridDesc_M_N
,
typename
Y
GridDesc_M_N
,
typename
DGridDesc_M
,
index_t
BlockSize
,
index_t
MPerBlock
,
...
...
@@ -32,23 +32,21 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
WaveSize
=
64
;
static_assert
(
BlockSize
==
MPerBlock
,
"BlockSize must be same with MPerBlock"
);
// block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
template
<
typename
Block2CTileMap
>
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
C
GridDesc_M_N
&
c
_grid_desc_m_n
,
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
Y
GridDesc_M_N
&
y
_grid_desc_m_n
,
const
Block2CTileMap
&
block_2_ctile_map
)
{
if
(
!
block_2_ctile_map
.
CheckValidity
(
c
_grid_desc_m_n
))
if
(
!
block_2_ctile_map
.
CheckValidity
(
y
_grid_desc_m_n
))
{
return
false
;
}
// const auto M =
c
_grid_desc_m_n.GetLength(I0);
const
auto
N
=
c
_grid_desc_m_n
.
GetLength
(
I1
);
// const auto M =
y
_grid_desc_m_n.GetLength(I0);
const
auto
N
=
y
_grid_desc_m_n
.
GetLength
(
I1
);
if
(
N
<
NPerBlock
)
{
return
false
;
...
...
@@ -62,21 +60,20 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
// {
// return false;
// }
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return
true
;
}
__host__
__device__
static
constexpr
auto
Make
C
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
const
C
GridDesc_M_N
&
c
_grid_desc_m_n
)
Make
Y
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
const
Y
GridDesc_M_N
&
y
_grid_desc_m_n
)
{
const
auto
M
=
c
_grid_desc_m_n
.
GetLength
(
I0
);
const
auto
N
=
c
_grid_desc_m_n
.
GetLength
(
I1
);
const
auto
M
=
y
_grid_desc_m_n
.
GetLength
(
I0
);
const
auto
N
=
y
_grid_desc_m_n
.
GetLength
(
I1
);
const
auto
MBlock
=
M
/
MPerBlock
;
const
auto
NBlock
=
N
/
NPerBlock
;
const
auto
y_grid_desc_mblock_mperblock_nblock_nperblock
=
transform_tensor_descriptor
(
c
_grid_desc_m_n
,
y
_grid_desc_m_n
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
MBlock
,
Number
<
MPerBlock
>
{})),
make_unmerge_transform
(
make_tuple
(
NBlock
,
Number
<
NPerBlock
>
{}))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
...
...
@@ -86,7 +83,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
}
__host__
__device__
static
constexpr
auto
Make
ORS
GridDescriptor_MBlock_MPerBlock
(
const
DGridDesc_M
&
d_grid_desc_m
)
Make
D
GridDescriptor_MBlock_MPerBlock
(
const
DGridDesc_M
&
d_grid_desc_m
)
{
const
index_t
M
=
d_grid_desc_m
.
GetLength
(
I0
);
const
index_t
MBlock
=
M
/
MPerBlock
;
...
...
@@ -100,19 +97,19 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
return
d_grid_desc_mblock_mperblock
;
}
// return block_id to
C
matrix tile idx (m0, n0) mapping
// return block_id to
Y
matrix tile idx (m0, n0) mapping
__host__
__device__
static
constexpr
auto
MakeDefaultBlock2CTileMap
(
const
C
GridDesc_M_N
&
c
_grid_desc_m_n
)
MakeDefaultBlock2CTileMap
(
const
Y
GridDesc_M_N
&
y
_grid_desc_m_n
)
{
return
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
NPerBlock
,
C
GridDesc_M_N
>
(
c
_grid_desc_m_n
);
return
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
NPerBlock
,
Y
GridDesc_M_N
>
(
y
_grid_desc_m_n
);
}
using
YGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
Make
C
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
C
GridDesc_M_N
{}))
>
;
Make
Y
GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
Y
GridDesc_M_N
{}))
>
;
using
DefaultBlock2CTileMap
=
remove_cvref_t
<
decltype
(
MakeDefaultBlock2CTileMap
(
C
GridDesc_M_N
{}))
>
;
remove_cvref_t
<
decltype
(
MakeDefaultBlock2CTileMap
(
Y
GridDesc_M_N
{}))
>
;
template
<
index_t
BlockSize_
,
index_t
BlockSliceLength_M_
,
index_t
BlockSliceLength_O_
>
struct
YDotYGrad_M_N_
...
...
@@ -240,7 +237,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
oblock_idx
++
;
}
while
(
oblock_idx
<
y_grid_desc_mblock_mperblock_nblock_nperblock
.
GetLength
(
I2
));
auto
d_grid_desc_mblock_mperblock
=
Make
ORS
GridDescriptor_MBlock_MPerBlock
(
d_grid_desc_m
);
auto
d_grid_desc_mblock_mperblock
=
Make
D
GridDescriptor_MBlock_MPerBlock
(
d_grid_desc_m
);
auto
d_thread_copy_vgpr_to_global
=
ThreadwiseTensorSliceTransfer_v1r3
<
FloatD
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment