Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
182e7480
Commit
182e7480
authored
Feb 18, 2025
by
mtgu0705
Browse files
Split the blockwise pipeline for fp8xint4.
parent
966f9051
Changes
9
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
696 additions
and
135 deletions
+696
-135
example/65_gemm_multiply_multiply/moe_gemm1.cpp
example/65_gemm_multiply_multiply/moe_gemm1.cpp
+2
-2
example/65_gemm_multiply_multiply/moe_gemm2.cpp
example/65_gemm_multiply_multiply/moe_gemm2.cpp
+2
-2
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
...lockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
+547
-0
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
...lockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
+21
-21
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
.../blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
+77
-41
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
.../block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
+18
-58
include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+25
-7
library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
...ary/reference_tensor_operation/cpu/reference_moe_gemm.hpp
+2
-2
library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
...ry/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
+2
-2
No files found.
example/65_gemm_multiply_multiply/moe_gemm1.cpp
View file @
182e7480
...
@@ -268,8 +268,8 @@ int main(int argc, char* argv[])
...
@@ -268,8 +268,8 @@ int main(int argc, char* argv[])
expert_ids
.
savetxt
(
"expert_ids.txt"
,
"int"
);
expert_ids
.
savetxt
(
"expert_ids.txt"
,
"int"
);
sorted_token_ids
.
savetxt
(
"sorted_token_ids.txt"
,
"int"
);
sorted_token_ids
.
savetxt
(
"sorted_token_ids.txt"
,
"int"
);
Tensor
<
A0DataType
>
a0_t_k
(
HostTensorDescriptor
({
tokens
,
K
},
{
K
,
1
}));
Tensor
<
A0DataType
>
a0_t_k
(
HostTensorDescriptor
({
tokens
,
K
},
{
K
,
1
}));
Tensor
<
B0DataType
>
b0_e_n_k
(
HostTensorDescriptor
({
experts
,
N
,
K
},
{
N
*
K
,
K
,
1
}));
Tensor
<
B0DataType
>
b0_e_n_k
(
HostTensorDescriptor
({
experts
,
K
,
N
},
{
N
*
K
,
1
,
K
}));
Tensor
<
B0DataType
>
b0_preshuffled
(
HostTensorDescriptor
({
experts
,
N
,
K
},
{
N
*
K
,
K
,
1
}));
Tensor
<
B0DataType
>
b0_preshuffled
(
HostTensorDescriptor
({
experts
,
K
,
N
},
{
N
*
K
,
1
,
K
}));
Tensor
<
D0DataType
>
d0_t_n
(
HostTensorDescriptor
({
tokens
,
N
},
{
StrideDs
[
0
],
0
}));
Tensor
<
D0DataType
>
d0_t_n
(
HostTensorDescriptor
({
tokens
,
N
},
{
StrideDs
[
0
],
0
}));
Tensor
<
D1DataType
>
d1_e_n
(
HostTensorDescriptor
({
experts
,
N
},
{
1
,
StrideDs
[
1
]}));
Tensor
<
D1DataType
>
d1_e_n
(
HostTensorDescriptor
({
experts
,
N
},
{
1
,
StrideDs
[
1
]}));
Tensor
<
EDataType
>
e_t_n_host_result
(
HostTensorDescriptor
({
tokens
,
topk
,
N
},
{
topk
*
N
,
N
,
1
}));
Tensor
<
EDataType
>
e_t_n_host_result
(
HostTensorDescriptor
({
tokens
,
topk
,
N
},
{
topk
*
N
,
N
,
1
}));
...
...
example/65_gemm_multiply_multiply/moe_gemm2.cpp
View file @
182e7480
...
@@ -268,8 +268,8 @@ int main(int argc, char* argv[])
...
@@ -268,8 +268,8 @@ int main(int argc, char* argv[])
expert_ids
.
savetxt
(
"expert_ids.txt"
,
"int"
);
expert_ids
.
savetxt
(
"expert_ids.txt"
,
"int"
);
sorted_token_ids
.
savetxt
(
"sorted_token_ids.txt"
,
"int"
);
sorted_token_ids
.
savetxt
(
"sorted_token_ids.txt"
,
"int"
);
Tensor
<
A0DataType
>
a0_t_k_k
(
HostTensorDescriptor
({
tokens
,
topk
,
K
},
{
topk
*
K
,
K
,
1
}));
Tensor
<
A0DataType
>
a0_t_k_k
(
HostTensorDescriptor
({
tokens
,
topk
,
K
},
{
topk
*
K
,
K
,
1
}));
Tensor
<
B0DataType
>
b0_e_n_k
(
HostTensorDescriptor
({
experts
,
N
,
K
},
{
N
*
K
,
K
,
1
}));
Tensor
<
B0DataType
>
b0_e_n_k
(
HostTensorDescriptor
({
experts
,
K
,
N
},
{
N
*
K
,
1
,
N
}));
Tensor
<
B0DataType
>
b0_preshuffled
(
HostTensorDescriptor
({
experts
,
N
,
K
},
{
N
*
K
,
K
,
1
}));
Tensor
<
B0DataType
>
b0_preshuffled
(
HostTensorDescriptor
({
experts
,
K
,
N
},
{
N
*
K
,
1
,
N
}));
Tensor
<
D0DataType
>
d0_t_n
(
HostTensorDescriptor
({
tokens
,
N
},
{
StrideDs
[
0
],
0
}));
Tensor
<
D0DataType
>
d0_t_n
(
HostTensorDescriptor
({
tokens
,
N
},
{
StrideDs
[
0
],
0
}));
Tensor
<
D1DataType
>
d1_e_n
(
HostTensorDescriptor
({
experts
,
N
},
{
1
,
StrideDs
[
1
]}));
Tensor
<
D1DataType
>
d1_e_n
(
HostTensorDescriptor
({
experts
,
N
},
{
1
,
StrideDs
[
1
]}));
Tensor
<
D2DataType
>
d2_e_n
(
HostTensorDescriptor
({
sorted_size
,
N
},
{
1
,
0
}));
Tensor
<
D2DataType
>
d2_e_n
(
HostTensorDescriptor
({
sorted_size
,
N
},
{
1
,
0
}));
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
0 → 100644
View file @
182e7480
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
→
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_
dequant_
v3.hpp
View file @
182e7480
...
@@ -33,7 +33,7 @@ template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
...
@@ -33,7 +33,7 @@ template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
index_t
MRepeat
,
index_t
MRepeat
,
index_t
NRepeat
,
index_t
NRepeat
,
index_t
KPacks
>
index_t
KPacks
>
struct
BlockwiseGemmXdlops_pipeline_bpreshuffle_v3
struct
BlockwiseGemmXdlops_pipeline_bpreshuffle_
bdequant_
v3
{
{
};
};
...
@@ -58,26 +58,26 @@ template <index_t BlockSize,
...
@@ -58,26 +58,26 @@ template <index_t BlockSize,
index_t
KPack
index_t
KPack
// ,bool TransposeC //disable transposec right now...
// ,bool TransposeC //disable transposec right now...
>
>
struct
BlockwiseGemmXdlops_pipeline_bpreshuffle_v3
<
BlockGemmPipelineScheduler
::
Intrawave
,
struct
BlockwiseGemmXdlops_pipeline_bpreshuffle_
bdequant_
v3
<
BlockGemmPipelineScheduler
::
Intrawave
,
BlockSize
,
BlockSize
,
ADataType
,
ADataType
,
BDataType
,
BDataType
,
ComputeDataType
,
ComputeDataType
,
AccDataType
,
AccDataType
,
ATileDesc
,
ATileDesc
,
BTileDesc
,
BTileDesc
,
AMmaTileDesc
,
AMmaTileDesc
,
BMmaTileDesc
,
BMmaTileDesc
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferSrcScalarPerVector
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferSrcScalarPerVector
,
MPerBlock
,
MPerBlock
,
NPerBlock
,
NPerBlock
,
KPerBlock
,
KPerBlock
,
MPerXDL
,
MPerXDL
,
NPerXDL
,
NPerXDL
,
MRepeat
,
MRepeat
,
NRepeat
,
NRepeat
,
KPack
>
KPack
>
:
BlockwiseGemmXdlops_pipeline_base
<
BlockSize
,
:
BlockwiseGemmXdlops_pipeline_base
<
BlockSize
,
ADataType
,
ADataType
,
BDataType
,
BDataType
,
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
View file @
182e7480
...
@@ -4,8 +4,9 @@
...
@@ -4,8 +4,9 @@
#pragma once
#pragma once
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_
dequant_
v3.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -34,26 +35,53 @@ constexpr auto BlockGemmBPreshufflePipeline_Selector()
...
@@ -34,26 +35,53 @@ constexpr auto BlockGemmBPreshufflePipeline_Selector()
{
{
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v1
)
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v1
)
{
{
return
BlockwiseGemmXdlops_pipeline_bpreshuffle_v1
<
BlkGemmPipeSche
,
if
(
std
::
is_same
<
ADataType
,
BDataType
>::
value
)
BlockSize
,
{
ADataType
,
return
BlockwiseGemmXdlops_pipeline_bpreshuffle_v1
<
BlkGemmPipeSche
,
BDataType
,
BlockSize
,
ComputeDataType
,
ADataType
,
AccDataType
,
BDataType
,
ATileDesc
,
ComputeDataType
,
BTileDesc
,
AccDataType
,
AMmaTileDesc
,
ATileDesc
,
BMmaTileDesc
,
BTileDesc
,
ABlockTransferSrcScalarPerVector
,
AMmaTileDesc
,
BBlockTransferSrcScalarPerVector
,
BMmaTileDesc
,
MPerBlock
,
ABlockTransferSrcScalarPerVector
,
NPerBlock
,
BBlockTransferSrcScalarPerVector
,
KPerBlock
,
MPerBlock
,
MPerXDL
,
NPerBlock
,
NPerXDL
,
KPerBlock
,
MRepeat
,
MPerXDL
,
NRepeat
,
NPerXDL
,
KPack
>
{};
MRepeat
,
NRepeat
,
KPack
>
{};
}
else
{
return
BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1
<
BlkGemmPipeSche
,
BlockSize
,
ADataType
,
BDataType
,
ComputeDataType
,
AccDataType
,
ATileDesc
,
BTileDesc
,
AMmaTileDesc
,
BMmaTileDesc
,
ABlockTransferSrcScalarPerVector
,
BBlockTransferSrcScalarPerVector
,
MPerBlock
,
NPerBlock
,
KPerBlock
,
MPerXDL
,
NPerXDL
,
MRepeat
,
NRepeat
,
KPack
>
{};
}
}
}
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v2
)
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v2
)
{
{
...
@@ -81,26 +109,34 @@ constexpr auto BlockGemmBPreshufflePipeline_Selector()
...
@@ -81,26 +109,34 @@ constexpr auto BlockGemmBPreshufflePipeline_Selector()
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v3
)
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v3
)
{
{
static_assert
(
MRepeat
>=
4
,
"MRepeat should at least be 4 in BlockGemmPipelineVersion::v3"
);
static_assert
(
MRepeat
>=
4
,
"MRepeat should at least be 4 in BlockGemmPipelineVersion::v3"
);
return
BlockwiseGemmXdlops_pipeline_bpreshuffle_v3
<
BlkGemmPipeSche
,
if
(
std
::
is_same
<
ADataType
,
BDataType
>::
value
)
BlockSize
,
{
ADataType
,
std
::
cerr
<<
"BlockGemmPipeline v3 configuration is not available"
<<
std
::
endl
;
BDataType
,
}
ComputeDataType
,
else
AccDataType
,
{
ATileDesc
,
return
BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3
<
BTileDesc
,
BlkGemmPipeSche
,
AMmaTileDesc
,
BlockSize
,
BMmaTileDesc
,
ADataType
,
ABlockTransferSrcScalarPerVector
,
BDataType
,
BBlockTransferSrcScalarPerVector
,
ComputeDataType
,
MPerBlock
,
AccDataType
,
NPerBlock
,
ATileDesc
,
KPerBlock
,
BTileDesc
,
MPerXDL
,
AMmaTileDesc
,
NPerXDL
,
BMmaTileDesc
,
MRepeat
,
ABlockTransferSrcScalarPerVector
,
NRepeat
,
BBlockTransferSrcScalarPerVector
,
KPack
>
{};
MPerBlock
,
NPerBlock
,
KPerBlock
,
MPerXDL
,
NPerXDL
,
MRepeat
,
NRepeat
,
KPack
>
{};
}
}
}
else
else
{
{
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
View file @
182e7480
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -144,7 +144,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -144,7 +144,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
static
constexpr
index_t
PrefetchStages
=
2
;
static
constexpr
index_t
PrefetchStages
=
2
;
static
constexpr
index_t
PrefillStages
=
1
;
static
constexpr
index_t
PrefillStages
=
1
;
static
constexpr
index_t
GlobalBufferNum
=
2
;
static
constexpr
index_t
GlobalBufferNum
=
1
;
template
<
typename
TileDesc_M0_M1_M2_K
>
template
<
typename
TileDesc_M0_M1_M2_K
>
__host__
__device__
static
constexpr
auto
MakeAGemmMmaTileDescriptor
(
const
TileDesc_M0_M1_M2_K
&
)
__host__
__device__
static
constexpr
auto
MakeAGemmMmaTileDescriptor
(
const
TileDesc_M0_M1_M2_K
&
)
...
@@ -235,7 +235,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -235,7 +235,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
const
BGridBuffer
&
b_grid_buf
,
const
BGridBuffer
&
b_grid_buf
,
BBlockBuffer
&
b_block_buf
,
BBlockBuffer
&
b_block_buf
,
const
BBlockTransferStep
&
b_block_copy_step
,
const
BBlockTransferStep
&
b_block_copy_step
,
// BThreadTransfer& b_thread_dequant_copy,
CThreadBuffer
&
c_thread_buf
,
CThreadBuffer
&
c_thread_buf
,
index_t
num_loop
)
const
index_t
num_loop
)
const
{
{
...
@@ -243,19 +242,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -243,19 +242,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
__builtin_amdgcn_sched_barrier
(
0
);
__builtin_amdgcn_sched_barrier
(
0
);
auto
a_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
ComputeDataType
>
(
auto
a_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
ComputeDataType
>
(
a_thread_desc_
.
GetElementSpaceSize
());
a_thread_desc_
.
GetElementSpaceSize
());
auto
b_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
BDataType
>
(
auto
b_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
ComputeDataType
>
(
b_thread_desc_
.
GetElementSpaceSize
());
auto
b_thread_dequant_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
ComputeDataType
>
(
b_thread_desc_
.
GetElementSpaceSize
());
b_thread_desc_
.
GetElementSpaceSize
());
StaticallyIndexedArray
<
decltype
(
b_thread_buf
),
Number
<
2
>
{}
>
b_thread_bufs
;
StaticallyIndexedArray
<
decltype
(
b_thread_buf
),
Number
<
2
>
{}
>
b_thread_bufs
;
constexpr
auto
b_block_origin_idx
=
make_tuple
(
I0
,
I0
,
I0
,
I0
);
constexpr
auto
b_block_origin_idx
=
make_tuple
(
I0
,
I0
,
I0
,
I0
);
StaticallyIndexedArray
<
decltype
(
b_thread_dequant_buf
),
Number
<
2
>
{}
>
b_thread_dequant_bufs
;
// Global prefetch A1 B1
// Global prefetch A1 B1
a_blockwise_copy
.
RunRead
(
a_grid_desc
,
a_grid_buf
,
I0
);
a_blockwise_copy
.
RunRead
(
a_grid_desc
,
a_grid_buf
);
b_blockwise_copy
.
Run
(
b_grid_desc
,
b_blockwise_copy
.
Run
(
b_grid_desc
,
b_grid_buf
,
b_grid_buf
,
b_block_desc_n0_n1_k0_k1
,
b_block_desc_n0_n1_k0_k1
,
...
@@ -264,13 +258,12 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -264,13 +258,12 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc
,
a_block_copy_step
);
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc
,
a_block_copy_step
);
b_blockwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc
,
b_block_copy_step
);
b_blockwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc
,
b_block_copy_step
);
__builtin_amdgcn_sched_barrier
(
0
);
// // Local prefill A1
// // Local prefill A1
a_blockwise_copy
.
RunWrite
(
a_block_desc
,
a_block_buf
,
I0
);
a_blockwise_copy
.
RunWrite
(
a_block_desc
,
a_block_buf
);
// // Global prefetch A2
// // Global prefetch A2
a_blockwise_copy
.
RunRead
(
a_grid_desc
,
a_grid_buf
,
I0
);
a_blockwise_copy
.
RunRead
(
a_grid_desc
,
a_grid_buf
);
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc
,
a_block_copy_step
);
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc
,
a_block_copy_step
);
// Local prefetch A1
// Local prefetch A1
...
@@ -285,13 +278,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -285,13 +278,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
a_thread_buf
);
a_thread_buf
);
});
});
});
});
// B VGPR->VGPR dequant
b_thread_dequant_copy_
.
Run
(
b_block_desc_n0_n1_k0_k1
,
b_block_origin_idx
,
b_thread_bufs
(
I0
),
b_thread_desc_
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_dequant_bufs
(
I0
));
// Initialize C
// Initialize C
c_thread_buf
.
Clear
();
c_thread_buf
.
Clear
();
...
@@ -310,14 +296,18 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -310,14 +296,18 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
b_block_desc_n0_n1_k0_k1
,
b_block_desc_n0_n1_k0_k1
,
b_block_origin_idx
,
b_block_origin_idx
,
b_thread_bufs
(
local_read_buf
));
b_thread_bufs
(
local_read_buf
));
b_blockwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc
,
b_block_copy_step
);
b_blockwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc
,
b_block_copy_step
);
block_sync_lds
();
block_sync_lds
();
a_blockwise_copy
.
RunWrite
(
a_block_desc
,
a_block_buf
,
mfma_reg_buf
);
a_blockwise_copy
.
RunWrite
(
a_block_desc
,
a_block_buf
);
a_blockwise_copy
.
RunRead
(
a_grid_desc
,
a_grid_buf
);
a_blockwise_copy
.
RunRead
(
a_grid_desc
,
a_grid_buf
,
local_read_buf
);
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc
,
a_block_copy_step
);
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc
,
a_block_copy_step
);
// printf("bid %d tid %d %f %f\n", blockIdx.x, threadIdx.x,
// type_convert<float>(a_thread_buf[I0]),
// type_convert<float>(b_thread_bufs[mfma_reg_buf][I0]));
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
m0
)
{
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
m0
)
{
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
static_for
<
0
,
KRepeat
,
1
>
{}([
&
](
auto
k0
)
{
static_for
<
0
,
KRepeat
,
1
>
{}([
&
](
auto
k0
)
{
...
@@ -329,9 +319,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -329,9 +319,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
a_thread_buf
[
Number
<
a_thread_desc_
.
CalculateOffset
(
a_thread_buf
[
Number
<
a_thread_desc_
.
CalculateOffset
(
make_tuple
(
m0
,
I0
,
I0
,
k0
,
I0
,
ik
))
>
{}];
make_tuple
(
m0
,
I0
,
I0
,
k0
,
I0
,
ik
))
>
{}];
b_thread_vec
.
template
AsType
<
ComputeDataType
>()(
ik
)
=
b_thread_vec
.
template
AsType
<
ComputeDataType
>()(
ik
)
=
b_thread_
dequant_
bufs
[
mfma_reg_buf
]
b_thread_bufs
[
mfma_reg_buf
]
[
Number
<
b_thread_desc_
.
CalculateOffset
(
[
Number
<
b_thread_desc_
.
CalculateOffset
(
make_tuple
(
n0
,
I0
,
k0
,
ik
))
>
{}];
make_tuple
(
n0
,
I0
,
k0
,
ik
))
>
{}];
});
});
using
mfma_input_type
=
using
mfma_input_type
=
typename
vector_type
<
ComputeDataType
,
typename
vector_type
<
ComputeDataType
,
...
@@ -360,13 +350,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -360,13 +350,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
a_thread_buf
);
a_thread_buf
);
});
});
});
});
// B VGPR->VGPR dequant
b_thread_dequant_copy_
.
Run
(
b_block_desc_n0_n1_k0_k1
,
b_block_origin_idx
,
b_thread_bufs
(
local_read_buf
),
b_thread_desc_
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_dequant_bufs
(
local_read_buf
));
HotLoopScheduler
();
HotLoopScheduler
();
__builtin_amdgcn_sched_barrier
(
0
);
__builtin_amdgcn_sched_barrier
(
0
);
...
@@ -401,7 +384,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -401,7 +384,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
a_thread_buf
[
Number
<
a_thread_desc_
.
CalculateOffset
(
a_thread_buf
[
Number
<
a_thread_desc_
.
CalculateOffset
(
make_tuple
(
m0
,
I0
,
I0
,
k0
,
I0
,
ik
))
>
{}];
make_tuple
(
m0
,
I0
,
I0
,
k0
,
I0
,
ik
))
>
{}];
b_thread_vec
.
template
AsType
<
ComputeDataType
>()(
ik
)
=
b_thread_vec
.
template
AsType
<
ComputeDataType
>()(
ik
)
=
b_thread_
dequant_
bufs
[
I0
][
Number
<
b_thread_desc_
.
CalculateOffset
(
b_thread_bufs
[
I0
][
Number
<
b_thread_desc_
.
CalculateOffset
(
make_tuple
(
n0
,
I0
,
k0
,
ik
))
>
{}];
make_tuple
(
n0
,
I0
,
k0
,
ik
))
>
{}];
});
});
...
@@ -430,13 +413,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -430,13 +413,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
a_thread_buf
);
a_thread_buf
);
});
});
});
});
// B VGPR->VGPR dequant
b_thread_dequant_copy_
.
Run
(
b_block_desc_n0_n1_k0_k1
,
b_block_origin_idx
,
b_thread_bufs
(
I1
),
b_thread_desc_
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_dequant_bufs
(
I1
));
__builtin_amdgcn_sched_barrier
(
0
);
__builtin_amdgcn_sched_barrier
(
0
);
...
@@ -451,7 +427,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -451,7 +427,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
a_thread_buf
[
Number
<
a_thread_desc_
.
CalculateOffset
(
a_thread_buf
[
Number
<
a_thread_desc_
.
CalculateOffset
(
make_tuple
(
m0
,
I0
,
I0
,
k0
,
I0
,
ik
))
>
{}];
make_tuple
(
m0
,
I0
,
I0
,
k0
,
I0
,
ik
))
>
{}];
b_thread_vec
.
template
AsType
<
ComputeDataType
>()(
ik
)
=
b_thread_vec
.
template
AsType
<
ComputeDataType
>()(
ik
)
=
b_thread_
dequant_
bufs
[
I1
][
Number
<
b_thread_desc_
.
CalculateOffset
(
b_thread_bufs
[
I1
][
Number
<
b_thread_desc_
.
CalculateOffset
(
make_tuple
(
n0
,
I0
,
k0
,
ik
))
>
{}];
make_tuple
(
n0
,
I0
,
k0
,
ik
))
>
{}];
});
});
...
@@ -484,7 +460,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -484,7 +460,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
a_thread_buf
[
Number
<
a_thread_desc_
.
CalculateOffset
(
a_thread_buf
[
Number
<
a_thread_desc_
.
CalculateOffset
(
make_tuple
(
m0
,
I0
,
I0
,
k0
,
I0
,
ik
))
>
{}];
make_tuple
(
m0
,
I0
,
I0
,
k0
,
I0
,
ik
))
>
{}];
b_thread_vec
.
template
AsType
<
ComputeDataType
>()(
ik
)
=
b_thread_vec
.
template
AsType
<
ComputeDataType
>()(
ik
)
=
b_thread_
dequant_
bufs
[
I0
][
Number
<
b_thread_desc_
.
CalculateOffset
(
b_thread_bufs
[
I0
][
Number
<
b_thread_desc_
.
CalculateOffset
(
make_tuple
(
n0
,
I0
,
k0
,
ik
))
>
{}];
make_tuple
(
n0
,
I0
,
k0
,
ik
))
>
{}];
});
});
...
@@ -527,22 +503,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
...
@@ -527,22 +503,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
static
constexpr
BTileDesc
b_block_desc_n0_n1_k0_k1
;
static
constexpr
BTileDesc
b_block_desc_n0_n1_k0_k1
;
using
Base
::
c_thread_desc_
;
using
Base
::
c_thread_desc_
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BThreadDequantCopy
=
ThreadwiseTensorSliceTransfer_StaticToStatic
<
BDataType
,
ComputeDataType
,
decltype
(
b_block_desc_n0_n1_k0_k1
),
decltype
(
b_block_desc_n0_n1_k0_k1
),
tensor_operation
::
element_wise
::
PassThrough
,
Sequence
<
Number
<
NRepeat
>
{},
I1
,
Number
<
KRepeat
>
{},
Number
<
KPack
>
{}
>
,
Sequence
<
1
,
2
,
0
,
3
>
,
3
,
KPack
>
;
const
PassThrough
b_element_op
{};
BThreadDequantCopy
b_thread_dequant_copy_
{
b_element_op
};
};
};
}
// namespace ck
}
// namespace ck
include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
View file @
182e7480
...
@@ -196,6 +196,20 @@ struct GridwiseMoeGemm
...
@@ -196,6 +196,20 @@ struct GridwiseMoeGemm
using
ThisThreadBlock
=
ThisThreadBlock
<
BlockSize
>
;
using
ThisThreadBlock
=
ThisThreadBlock
<
BlockSize
>
;
static
constexpr
index_t
APackedSize
=
[]()
{
if
constexpr
(
is_same_v
<
remove_cvref_t
<
ADataType
>
,
pk_i4_t
>
)
return
2
;
else
return
1
;
}();
static
constexpr
index_t
BPackedSize
=
[]()
{
if
constexpr
(
is_same_v
<
remove_cvref_t
<
BDataType
>
,
pk_i4_t
>
)
return
2
;
else
return
1
;
}();
__host__
static
auto
CalculateGridSize
(
index_t
M
,
index_t
N
)
__host__
static
auto
CalculateGridSize
(
index_t
M
,
index_t
N
)
{
{
const
index_t
nblock
=
math
::
integer_divide_ceil
(
N
,
NPerBlock
);
const
index_t
nblock
=
math
::
integer_divide_ceil
(
N
,
NPerBlock
);
...
@@ -385,6 +399,10 @@ struct GridwiseMoeGemm
...
@@ -385,6 +399,10 @@ struct GridwiseMoeGemm
using
GemmSpecialization
=
tensor_operation
::
device
::
GemmSpecialization
;
using
GemmSpecialization
=
tensor_operation
::
device
::
GemmSpecialization
;
static_assert
(
!
(
is_same_v
<
remove_cvref_t
<
ADataType
>
,
pk_i4_t
>
&&
GemmSpec
!=
GemmSpecialization
::
Default
),
"pk_i4_t does not support padding"
);
if
constexpr
(
GemmSpec
==
GemmSpecialization
::
NKPadding
||
if
constexpr
(
GemmSpec
==
GemmSpecialization
::
NKPadding
||
GemmSpec
==
GemmSpecialization
::
MNKPadding
)
GemmSpec
==
GemmSpecialization
::
MNKPadding
)
{
{
...
@@ -681,7 +699,7 @@ struct GridwiseMoeGemm
...
@@ -681,7 +699,7 @@ struct GridwiseMoeGemm
{
{
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
ALayout
>
)
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
ALayout
>
)
{
{
a_k_split_offset
=
k_id
*
karg
.
KRead
;
a_k_split_offset
=
k_id
*
karg
.
KRead
/
APackedSize
;
}
}
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
ALayout
>
)
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
ALayout
>
)
{
{
...
@@ -695,7 +713,7 @@ struct GridwiseMoeGemm
...
@@ -695,7 +713,7 @@ struct GridwiseMoeGemm
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
BLayout
>
)
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
BLayout
>
)
{
{
// KPack * NLane * KLane * K0 * N0
// KPack * NLane * KLane * K0 * N0
b_k_split_offset
=
k_id
*
karg
.
KRead
*
NLane
;
b_k_split_offset
=
k_id
*
karg
.
KRead
*
NLane
/
BPackedSize
;
}
}
if
(
k_id
<
karg
.
KBatch
-
1
)
if
(
k_id
<
karg
.
KBatch
-
1
)
...
@@ -725,7 +743,7 @@ struct GridwiseMoeGemm
...
@@ -725,7 +743,7 @@ struct GridwiseMoeGemm
// in some cases.
// in some cases.
else
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
RowMajor
,
ALayout
>::
value
)
else
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
RowMajor
,
ALayout
>::
value
)
{
{
constexpr
auto
MLdsLayer
=
32
*
4
/
KPerBlock
/
sizeof
(
LDSTypeA
)
<
1
constexpr
auto
MLdsLayer
=
32
*
4
/
KPerBlock
/
sizeof
(
LDSTypeA
)
/
APackedSize
<
1
?
1
?
1
:
32
*
4
/
KPerBlock
/
sizeof
(
LDSTypeA
);
:
32
*
4
/
KPerBlock
/
sizeof
(
LDSTypeA
);
constexpr
auto
a_lds_block_desc
=
make_naive_tensor_descriptor
(
constexpr
auto
a_lds_block_desc
=
make_naive_tensor_descriptor
(
...
@@ -875,8 +893,8 @@ struct GridwiseMoeGemm
...
@@ -875,8 +893,8 @@ struct GridwiseMoeGemm
BlkGemmPipelineVer
,
BlkGemmPipelineVer
,
BlkGemmPipeSched
,
BlkGemmPipeSched
,
BlockSize
,
BlockSize
,
LDS
Type
A
,
AData
Type
,
LDS
Type
B
,
BData
Type
,
ComputeTypeA
,
ComputeTypeA
,
AccDataType
,
AccDataType
,
decltype
(
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1
()),
decltype
(
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1
()),
...
@@ -913,7 +931,7 @@ struct GridwiseMoeGemm
...
@@ -913,7 +931,7 @@ struct GridwiseMoeGemm
constexpr
auto
c_block_size
=
constexpr
auto
c_block_size
=
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
();
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
();
return
math
::
max
(
a_block_space_size_aligned
*
sizeof
(
LDSTypeA
),
return
math
::
max
(
a_block_space_size_aligned
*
sizeof
(
LDSTypeA
)
/
APackedSize
,
c_block_size
*
sizeof
(
CShuffleDataType
));
c_block_size
*
sizeof
(
CShuffleDataType
));
}
}
...
@@ -1209,7 +1227,7 @@ struct GridwiseMoeGemm
...
@@ -1209,7 +1227,7 @@ struct GridwiseMoeGemm
const
auto
a_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
const
auto
a_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_a_grid
,
a_grid_desc_ak0_m_ak1
.
GetElementSpaceSize
());
p_a_grid
,
a_grid_desc_ak0_m_ak1
.
GetElementSpaceSize
());
const
auto
b_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
const
auto
b_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_b_grid
+
expert_id
*
expert_stride
,
b_grid_desc_bpreshuffled
.
GetElementSpaceSize
());
p_b_grid
+
expert_id
*
expert_stride
/
BPackedSize
,
b_grid_desc_bpreshuffled
.
GetElementSpaceSize
());
// if(threadIdx.x==0)
// if(threadIdx.x==0)
// printf("tid %d eid %d expert_stride %d bufsize %d\n",
// printf("tid %d eid %d expert_stride %d bufsize %d\n",
// threadIdx.x, expert_id, expert_stride, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
// threadIdx.x, expert_id, expert_stride, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
View file @
182e7480
...
@@ -86,7 +86,7 @@ struct ReferenceMoeGemm : public device::BaseOperator
...
@@ -86,7 +86,7 @@ struct ReferenceMoeGemm : public device::BaseOperator
if
constexpr
(
is_same_v
<
ADataType
,
pk_i4_t
>
)
if
constexpr
(
is_same_v
<
ADataType
,
pk_i4_t
>
)
{
{
uint8_t
i4x2
=
arg
.
a_t_k_
(
t
,
k
).
data
;
uint8_t
i4x2
=
arg
.
a_t_k_
(
t
,
k
).
data
;
uint8_t
i4
=
0
;
uint8_t
i4
=
0
;
if
(
k
%
2
==
1
)
if
(
k
%
2
==
1
)
i4
=
(
i4x2
>>
0
)
&
0xf
;
i4
=
(
i4x2
>>
0
)
&
0xf
;
else
else
...
@@ -101,7 +101,7 @@ struct ReferenceMoeGemm : public device::BaseOperator
...
@@ -101,7 +101,7 @@ struct ReferenceMoeGemm : public device::BaseOperator
if
constexpr
(
is_same_v
<
BDataType
,
pk_i4_t
>
)
if
constexpr
(
is_same_v
<
BDataType
,
pk_i4_t
>
)
{
{
uint8_t
i4x2
=
arg
.
b_e_n_k_
(
e
,
k
,
n
).
data
;
uint8_t
i4x2
=
arg
.
b_e_n_k_
(
e
,
k
,
n
).
data
;
uint8_t
i4
=
0
;
uint8_t
i4
=
0
;
if
(
k
%
2
==
1
)
if
(
k
%
2
==
1
)
i4
=
(
i4x2
>>
0
)
&
0xf
;
i4
=
(
i4x2
>>
0
)
&
0xf
;
else
else
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
View file @
182e7480
...
@@ -100,7 +100,7 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
...
@@ -100,7 +100,7 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
{
{
if
constexpr
(
is_same_v
<
ADataType
,
pk_i4_t
>
)
if
constexpr
(
is_same_v
<
ADataType
,
pk_i4_t
>
)
{
{
uint8_t
i4x2
=
arg
.
a_t_k_
(
m
,
k
).
data
;
uint8_t
i4x2
=
arg
.
a_t_k_
(
t
,
topk_id
,
k
).
data
;
uint8_t
i4
=
0
;
uint8_t
i4
=
0
;
if
(
k
%
2
==
1
)
if
(
k
%
2
==
1
)
i4
=
(
i4x2
>>
0
)
&
0xf
;
i4
=
(
i4x2
>>
0
)
&
0xf
;
...
@@ -124,7 +124,7 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
...
@@ -124,7 +124,7 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
}
}
else
else
{
{
arg
.
b_element_op_
(
v_b
,
arg
.
b_e_n_k_
(
e
,
n
,
k
));
arg
.
b_element_op_
(
v_b
,
arg
.
b_e_n_k_
(
e
,
k
,
n
));
}
}
v_acc
+=
v_acc
+=
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment