gaoqiong / composable_kernel_ROCM · Commits · 363b6744

Commit 363b6744, authored Jan 14, 2025 by mtgu0705

add instance for gemm_ab_scale

parent 9dac9713
Showing 11 changed files with 237 additions and 218 deletions:

library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp (+23, -23)
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp (+21, -21)
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp (+3, -3)
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp (+3, -3)
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instance.cpp (+3, -3)
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instance.cpp (+3, -3)
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp (+3, -3)
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp (+3, -3)
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instance.cpp (+3, -3)
profiler/src/CMakeLists.txt (+151, -151)
profiler/src/profile_gemm_ab_scale.cpp (+21, -2)
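The renamed instances target an FP8 x FP8 -> BF16 GEMM in which A and B carry per-block FP32 scale factors, and the commit changes the A-scale granularity along M from a 128-row block to a single row (ScaleBlockM 128 -> 1). The sketch below is a plain CPU reference of that block-scaled GEMM semantics, not CK code; the matrix sizes, the toy scale-block sizes, and the use of float in place of FP8/BF16 are illustrative assumptions.

// Reference semantics of an A/B block-scaled GEMM (illustrative sketch, not CK code).
// A is row-major (M x K), B is stored as (N x K) to mirror the "mk_nk_mn" layout,
// and one scale applies to each (ScaleBlockM x ScaleBlockK) tile of A and
// (ScaleBlockN x ScaleBlockK) tile of B.
#include <cstdio>
#include <vector>

int main()
{
    const int M = 4, N = 4, K = 8;
    const int ScaleBlockM = 1, ScaleBlockN = 128, ScaleBlockK = 4; // toy granularities

    std::vector<float> A(M * K, 1.0f), B(N * K, 2.0f), D(M * N, 0.0f);
    const int AScaleRows = (M + ScaleBlockM - 1) / ScaleBlockM;
    const int AScaleCols = (K + ScaleBlockK - 1) / ScaleBlockK;
    const int BScaleRows = (N + ScaleBlockN - 1) / ScaleBlockN;
    const int BScaleCols = (K + ScaleBlockK - 1) / ScaleBlockK;
    std::vector<float> a_scale(AScaleRows * AScaleCols, 0.5f);
    std::vector<float> b_scale(BScaleRows * BScaleCols, 0.25f);

    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.0f;
            for(int k = 0; k < K; ++k)
            {
                // With ScaleBlockM == 1, every row of A gets its own scale.
                const float sa = a_scale[(m / ScaleBlockM) * AScaleCols + k / ScaleBlockK];
                const float sb = b_scale[(n / ScaleBlockN) * BScaleCols + k / ScaleBlockK];
                acc += (A[m * K + k] * sa) * (B[n * K + k] * sb);
            }
            D[m * N + n] = acc;
        }

    std::printf("D[0][0] = %f\n", D[0][0]); // 8 * (1 * 0.5) * (2 * 0.25) = 2
    return 0;
}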
library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp  --  View file @ 363b6744

All declarations guarded by #if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8)) are renamed from the 128_128_128 scale-block naming to 1_128_128, and the A-scale block size in the DeviceGemmMultipleD_ABScale interface type changes from 128 to 1. The same edit is applied to the comp_default, comp_kpadding, comp_mnpadding, comp_mnkpadding, mem_v1_default, mem_v1_kpadding and mem_v1_mnkpadding declarations, to the DeviceOperationInstanceFactory specialization (hunks at lines 163 and 180), and to the calls the factory makes. Representative hunks:

@@ -17,7 +17,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,

@@ -28,14 +28,14 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_i
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
                                                             PassThrough,
                                                             PassThrough>>>& instances);

@@ -198,20 +198,20 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
         if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                      is_same_v<CLayout, Row>)
         {
-            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instances(op_ptrs);
-            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instances(op_ptrs);
-            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instances(op_ptrs);
-            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instances(op_ptrs);
-            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instances(op_ptrs);
-            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instances(op_ptrs);
-            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instances(op_ptrs);
+            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_default_instances(op_ptrs);
+            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_kpadding_instances(op_ptrs);
+            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_mnpadding_instances(op_ptrs);
+            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_mnkpadding_instances(op_ptrs);
+            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_default_instances(op_ptrs);
+            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_kpadding_instances(op_ptrs);
+            add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_mnkpadding_instances(op_ptrs);
         }
     }
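The factory hunk above dispatches on the deduced layouts with if constexpr and is_same_v before registering the row/column/row instances. The snippet below is a standalone illustration of that compile-time dispatch pattern with made-up Row/Col tag types and a toy instance list; it is not the CK implementation.

// Standalone sketch of if-constexpr layout dispatch (hypothetical types, not CK code).
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>

struct Row {};
struct Col {};

template <typename ALayout, typename BLayout, typename CLayout>
std::vector<std::string> get_instances()
{
    std::vector<std::string> op_ptrs;
    // Only the layout combination that has registered instances adds anything;
    // every other combination compiles to an empty list instead of failing.
    if constexpr(std::is_same_v<ALayout, Row> && std::is_same_v<BLayout, Col> &&
                 std::is_same_v<CLayout, Row>)
    {
        op_ptrs.push_back("gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_default");
        op_ptrs.push_back("gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_default");
    }
    return op_ptrs;
}

int main()
{
    std::cout << get_instances<Row, Col, Row>().size() << " instances for (Row, Col, Row)\n"; // 2
    std::cout << get_instances<Row, Row, Row>().size() << " instances for (Row, Row, Row)\n"; // 0
    return 0;
}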
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp  --  View file @ 363b6744

The two instance-list aliases are renamed from 128_128_128 to 1_128_128, and every active DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 entry in both tuples has its A-scale block size changed from 128 to 1. The remaining tile parameters, the commented-out "Spill in current compiler" entries, and the "Latency friendly" / "Memory friendly" grouping of the mem list are unchanged. Representative hunks (the other entries differ only in block size, tile shape and pipeline version, and receive the same 128 -> 1 edit):

@@ -34,7 +34,7 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;

 template <GemmSpecialization GemmSpec>
-using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_instances = std::tuple<
+using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_instances = std::tuple<
     // clang-format off

@@ -45,15 +45,15 @@ using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_instances =
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3< Row, Col, Tuple<>, Row, F8, F32, F8, F32, Tuple<>, BF16, F32, F32, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3< Row, Col, Tuple<>, Row, F8, F32, F8, F32, Tuple<>, BF16, F32, F32, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 1, 128, 128, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,

 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
-using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_instances = std::tuple<
+using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_instances = std::tuple<
     // clang-format off

@@ -61,22 +61,22 @@ using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_instances = s
     // Latency friendly
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3< Row, Col, Tuple<>, Row, F8, F32, F8, F32, Tuple<>, BF16, F32, F32, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3< Row, Col, Tuple<>, Row, F8, F32, F8, F32, Tuple<>, BF16, F32, F32, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 1, 128, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<2, 2, 1>, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
     // Memory friendly
     ...
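These std::tuple aliases are what the per-specialization .cpp files below hand to add_device_operation_instances. The snippet below sketches, in generic terms, how a tuple of operation types can be expanded into a vector of base-class pointers at compile time; the OpBase/OpA/OpB types and the helper name are hypothetical and stand in for, rather than reproduce, the CK helper.

// Generic sketch of registering every type in a std::tuple of operation classes
// into a vector of base-class pointers (hypothetical types, not the CK helper itself).
#include <iostream>
#include <memory>
#include <string>
#include <tuple>
#include <vector>

struct OpBase
{
    virtual ~OpBase() = default;
    virtual std::string GetTypeString() const = 0;
};
struct OpA : OpBase { std::string GetTypeString() const override { return "OpA"; } };
struct OpB : OpBase { std::string GetTypeString() const override { return "OpB"; } };

template <typename... Ops>
void add_instances(std::vector<std::unique_ptr<OpBase>>& instances, std::tuple<Ops...>)
{
    // Default-construct one object per tuple element and hand ownership to the vector.
    (instances.push_back(std::make_unique<Ops>()), ...);
}

int main()
{
    std::vector<std::unique_ptr<OpBase>> instances;
    add_instances(instances, std::tuple<OpA, OpB>{});
    for(const auto& op : instances)
        std::cout << op->GetTypeString() << '\n';
    return 0;
}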
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp  --  View file @ 363b6744

Three one-line changes track the rename: the definition of the add_* function (hunk @@ -8,7), the A-scale block size in its DeviceGemmMultipleD_ABScale parameter, 128 -> 1 (hunk @@ -19,7), and the instance-list alias it registers (hunk @@ -28,7):

@@ -8,7 +8,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_default_instances(

@@ -28,7 +28,7 @@
 {
     add_device_operation_instances(
         instances,
-        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_instances<GemmDefault>{});
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_instances<GemmDefault>{});
 }
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp  --  View file @ 363b6744

Same three-line change as the default-specialization file: the function is renamed to add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_kpadding_instances, the A-scale block size in its DeviceGemmMultipleD_ABScale parameter becomes 1, and the body now registers device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_instances<GemmKPadding>{}.
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instance.cpp  --  View file @ 363b6744

Same three-line change: the function is renamed to add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_mnkpadding_instances, the A-scale block size becomes 1, and the body registers device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_instances<GemmMNKPadding>{}.
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instance.cpp  --  View file @ 363b6744

Same three-line change: the function is renamed to add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_mnpadding_instances, the A-scale block size becomes 1, and the body registers device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_instances<GemmMNPadding>{}.
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp  --  View file @ 363b6744

Same three-line change for the memory-bound v1 list: the function is renamed to add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_default_instances, the A-scale block size becomes 1, and the body registers device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_instances<Intrawave, GemmDefault>{}.
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp  --  View file @ 363b6744

Same three-line change: the function is renamed to add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_kpadding_instances, the A-scale block size becomes 1, and the body registers device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_instances<Intrawave, GemmKPadding>{}.
library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instance.cpp  --  View file @ 363b6744

Same three-line change: the function is renamed to add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_mnkpadding_instances, the A-scale block size becomes 1, and the body registers device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_instances<Intrawave, GemmMNKPadding>{}.
profiler/src/CMakeLists.txt
View file @
363b6744
# ckProfiler
# ckProfiler
set
(
PROFILER_SOURCES
set
(
PROFILER_SOURCES
profiler.cpp
profiler.cpp
profile_gemm.cpp
#
profile_gemm.cpp
profile_reduce.cpp
#
profile_reduce.cpp
profile_groupnorm_bwd_data.cpp
#
profile_groupnorm_bwd_data.cpp
profile_groupnorm_fwd.cpp
#
profile_groupnorm_fwd.cpp
profile_layernorm_bwd_data.cpp
#
profile_layernorm_bwd_data.cpp
profile_layernorm_bwd_gamma_beta.cpp
#
profile_layernorm_bwd_gamma_beta.cpp
profile_groupnorm_bwd_gamma_beta.cpp
#
profile_groupnorm_bwd_gamma_beta.cpp
profile_layernorm_fwd.cpp
#
profile_layernorm_fwd.cpp
profile_max_pool2d_fwd.cpp
#
profile_max_pool2d_fwd.cpp
profile_pool3d_fwd.cpp
#
profile_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
#
profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp
#
profile_max_pool3d_bwd.cpp
profile_avg_pool2d_bwd.cpp
#
profile_avg_pool2d_bwd.cpp
profile_max_pool2d_bwd.cpp
#
profile_max_pool2d_bwd.cpp
profile_softmax.cpp
#
profile_softmax.cpp
profile_batchnorm_fwd.cpp
#
profile_batchnorm_fwd.cpp
profile_batchnorm_bwd.cpp
#
profile_batchnorm_bwd.cpp
profile_batchnorm_infer.cpp
#
profile_batchnorm_infer.cpp
profile_conv_tensor_rearrange.cpp
#
profile_conv_tensor_rearrange.cpp
profile_transpose.cpp
#
profile_transpose.cpp
profile_permute_scale.cpp
#
profile_permute_scale.cpp
)
)
if
(
SUPPORTED_GPU_TARGETS MATCHES
"gfx9"
)
#
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
#
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
list
(
APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp
)
#
list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
list
(
APPEND PROFILER_SOURCES profile_contraction_scale.cpp
)
#
list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
endif
()
#
endif()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
#
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list
(
APPEND PROFILER_SOURCES profile_gemm_reduce.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_reduce.cpp)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp
)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp
)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_streamk.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
endif
()
#
endif()
list
(
    list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
    if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
        list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
        list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
    endif()
    list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
    list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
    list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
    list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp)
    list(APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp)
    list(APPEND PROFILER_SOURCES profile_conv_fwd.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd_outelementop.cpp)
endif()
if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9")
    if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
        list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
    endif()
    list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
endif()
if(DL_KERNELS)
    list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
endif()
set(PROFILER_EXECUTABLE ckProfiler)
...
@@ -93,89 +93,89 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
    target_compile_options(${PROFILER_EXECUTABLE} PRIVATE --offload-compress)
endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool2d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool2d_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
    if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
    endif()
    if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_tile_loop_instance)
    endif()
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
    if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
    endif()
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convscale_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convinvscale_instance)
endif()
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
    if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
        target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
    endif()
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
endif()
if(DL_KERNELS)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
endif()
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
profiler/src/profile_gemm_ab_scale.cpp
View file @
363b6744
...
@@ -32,8 +32,10 @@ enum struct GemmDataType
enum struct ScaleBlockTile
{
    Tile_128_128_128, // 0
    Tile_1_128_128,   // 1
};
#define OP_NAME "gemm_ab_scale"
#define OP_DESC "GEMM_AB_Scale"
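The two presets appear to encode the (ScaleBlockM, ScaleBlockN, ScaleBlockK) granularity of the A/B scale tensors, judging from the `ck::Number<1>{}` / `ck::Number<128>{}` / `ck::Number<128>{}` arguments passed to `profile()` in the hunk below. A minimal sketch of the scale-tensor shapes this implies, assuming one f32 scale value per (ScaleBlockM x ScaleBlockK) tile of A and per (ScaleBlockN x ScaleBlockK) tile of B; the helper below is illustrative only and not part of this commit:

```cpp
// Illustrative only (not from this commit): scale-tensor shapes implied by a
// (ScaleBlockM, ScaleBlockN, ScaleBlockK) choice, under the per-tile assumption above.
#include <cstdint>
#include <iostream>

using index_t = std::int64_t;

struct ScaleDims
{
    index_t a_rows, a_cols; // a_scale: ceil(M / ScaleBlockM) x ceil(K / ScaleBlockK)
    index_t b_rows, b_cols; // b_scale: ceil(N / ScaleBlockN) x ceil(K / ScaleBlockK)
};

ScaleDims scale_dims(index_t M, index_t N, index_t K,
                     index_t ScaleBlockM, index_t ScaleBlockN, index_t ScaleBlockK)
{
    auto ceil_div = [](index_t x, index_t y) { return (x + y - 1) / y; };
    return {ceil_div(M, ScaleBlockM), ceil_div(K, ScaleBlockK),
            ceil_div(N, ScaleBlockN), ceil_div(K, ScaleBlockK)};
}

int main()
{
    // Tile_1_128_128 on a 3840 x 4096 x 4096 problem: per-row (1 x 128) scales for A,
    // 128 x 128 block scales for B.
    const ScaleDims d = scale_dims(3840, 4096, 4096, 1, 128, 128);
    std::cout << "a_scale: " << d.a_rows << " x " << d.a_cols << "\n"
              << "b_scale: " << d.b_rows << " x " << d.b_cols << "\n";
}
```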
...
@@ -154,8 +156,25 @@ int profile_gemm_ab_scale(int argc, char* argv[])
        return pass ? 0 : 1;
    };

    // if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN &&
    //    scale_block_tile == ScaleBlockTile::Tile_128_128_128)
    // {
    //     return profile(F8{},
    //                    F32{},
    //                    F8{},
    //                    F32{},
    //                    F8{},
    //                    F32{},
    //                    BF16{},
    //                    ck::Number<128>{},
    //                    ck::Number<128>{},
    //                    ck::Number<128>{},
    //                    Row{},
    //                    Col{},
    //                    Row{});
    // }

    if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN &&
       scale_block_tile == ScaleBlockTile::Tile_128_128_128)
    if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN &&
       scale_block_tile == ScaleBlockTile::Tile_1_128_128)
    {
        return profile(F8{},
                       F32{},
...
@@ -164,7 +183,7 @@ int profile_gemm_ab_scale(int argc, char* argv[])
                       F8{},
                       F32{},
                       BF16{},
                       ck::Number<128>{},
                       ck::Number<1>{},
                       ck::Number<128>{},
                       ck::Number<128>{},
                       Row{},
...
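For reference, the computation these instances profile: with an M x K f8 matrix A, an N x K f8 matrix B (MK_NK_MN layout) and f32 scale tensors, the result is a plain GEMM in which each A element is multiplied by the scale of its (ScaleBlockM x ScaleBlockK) tile and each B element by the scale of its (ScaleBlockN x ScaleBlockK) tile. A rough host-side sketch under that indexing assumption, with float standing in for the f8 inputs and bf16 output; this is an illustration, not code from the commit:

```cpp
// Illustrative only (not from this commit): naive host-side reference for a
// blockwise AB-scaled GEMM in MK_NK_MN layout (A is M x K row-major, B is N x K
// row-major, C is M x N row-major). Scale indexing is an assumption: one scale
// per (ScaleBlockM x ScaleBlockK) tile of A, one per (ScaleBlockN x ScaleBlockK)
// tile of B.
#include <cstdint>
#include <vector>

using index_t = std::int64_t;

void reference_gemm_ab_scale(const std::vector<float>& a,       // M x K
                             const std::vector<float>& a_scale, // ceil(M/SBM) x ceil(K/SBK)
                             const std::vector<float>& b,       // N x K
                             const std::vector<float>& b_scale, // ceil(N/SBN) x ceil(K/SBK)
                             std::vector<float>& c,             // M x N
                             index_t M, index_t N, index_t K,
                             index_t ScaleBlockM, index_t ScaleBlockN, index_t ScaleBlockK)
{
    const index_t scale_k = (K + ScaleBlockK - 1) / ScaleBlockK;
    for(index_t m = 0; m < M; ++m)
        for(index_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(index_t k = 0; k < K; ++k)
            {
                const float sa = a_scale[(m / ScaleBlockM) * scale_k + k / ScaleBlockK];
                const float sb = b_scale[(n / ScaleBlockN) * scale_k + k / ScaleBlockK];
                acc += a[m * K + k] * sa * b[n * K + k] * sb;
            }
            c[m * N + n] = acc;
        }
}

int main()
{
    // Tiny smoke test with ScaleBlock = (1, 4, 4) standing in for (1, 128, 128).
    const index_t M = 4, N = 4, K = 8, SBM = 1, SBN = 4, SBK = 4;
    std::vector<float> a(M * K, 1.f), b(N * K, 1.f), c(M * N, 0.f);
    std::vector<float> a_scale(M * (K / SBK), 0.5f);
    std::vector<float> b_scale((N / SBN) * (K / SBK), 2.f);
    reference_gemm_ab_scale(a, a_scale, b, b_scale, c, M, N, K, SBM, SBN, SBK);
    return c[0] == 8.f ? 0 : 1; // each of the 8 k-terms contributes 1 * 0.5 * 1 * 2
}
```

With the Tile_1_128_128 preset exercised by the updated `profile()` call, this reduces to one scale per row of A for every 128-wide K block and one scale per 128 x 128 block of B.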