Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c579740a
"test/vscode:/vscode.git/clone" did not exist on "afa316212881dfa0e54aa36b51cf94fde8da6859"
Commit
c579740a
authored
Mar 15, 2023
by
rocking
Browse files
Add more xdl instances
parent
da1495f7
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
117 additions
and
6 deletions
+117
-6
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
...ice_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
+61
-4
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
...quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+14
-0
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
...quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+14
-0
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
...quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+14
-1
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
...quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+14
-1
No files found.
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
View file @
c579740a
...
...
@@ -16,7 +16,22 @@ using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances = std::
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
256
,
64
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
256
,
64
,
16
,
16
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
128
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
64
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
64
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
64
,
128
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
64
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
64
,
128
,
64
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
64
,
128
,
64
,
16
,
16
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
// clang-format on
>
;
...
...
@@ -27,7 +42,22 @@ using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances = std::
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
4
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
4
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
256
,
64
,
4
,
16
,
32
,
32
,
2
,
4
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
256
,
64
,
16
,
16
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
128
,
64
,
4
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
128
,
64
,
4
,
16
,
32
,
32
,
2
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
128
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
64
,
64
,
4
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
64
,
128
,
64
,
4
,
16
,
32
,
32
,
2
,
2
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
64
,
128
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
64
,
64
,
4
,
16
,
32
,
32
,
2
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
64
,
128
,
64
,
4
,
16
,
32
,
32
,
1
,
2
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
64
,
128
,
64
,
16
,
16
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
// clang-format on
>
;
...
...
@@ -38,7 +68,22 @@ using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances = std::
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
16
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
16
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
256
,
64
,
16
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
256
,
64
,
16
,
16
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
128
,
64
,
16
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
128
,
64
,
16
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
128
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
64
,
64
,
16
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
64
,
128
,
64
,
16
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
64
,
128
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
64
,
64
,
16
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
64
,
128
,
64
,
16
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
0
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
64
,
128
,
64
,
16
,
16
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
16
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
// clang-format on
>
;
...
...
@@ -49,7 +94,19 @@ using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances = std::
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
256
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
256
,
64
,
16
,
16
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
128
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
64
,
128
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
64
,
64
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
128
,
64
,
64
,
16
,
16
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
256
,
64
,
128
,
64
,
16
,
16
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
128
,
32
,
64
,
16
,
16
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
128
,
32
,
128
,
64
,
16
,
16
,
32
,
32
,
1
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
64
,
64
,
32
,
64
,
16
,
16
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
,
DeviceGemmMultipleD_Xdl_CShuffle
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
OutElementOp
,
MNKPadding
,
1
,
64
,
32
,
64
,
64
,
16
,
16
,
32
,
32
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
16
,
GemmLoopScheduler
,
GemmPipeline
>
// clang-format on
>
;
...
...
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
View file @
c579740a
...
...
@@ -27,6 +27,20 @@ void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Default
,
PipelineVersion
::
v1
>
{});
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
add_device_operation_instances
(
instances
,
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Interwave
,
PipelineVersion
::
v1
>
{});
#endif
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
add_device_operation_instances
(
instances
,
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Default
,
PipelineVersion
::
v2
>
{});
#endif
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
View file @
c579740a
...
...
@@ -27,6 +27,20 @@ void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Default
,
PipelineVersion
::
v1
>
{});
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
add_device_operation_instances
(
instances
,
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Interwave
,
PipelineVersion
::
v1
>
{});
#endif
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
add_device_operation_instances
(
instances
,
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Default
,
PipelineVersion
::
v2
>
{});
#endif
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
View file @
c579740a
...
...
@@ -27,9 +27,22 @@ void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Default
,
PipelineVersion
::
v1
>
{});
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
add_device_operation_instances
(
instances
,
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Interwave
,
PipelineVersion
::
v1
>
{});
#endif
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
add_device_operation_instances
(
instances
,
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Default
,
PipelineVersion
::
v2
>
{});
#endif
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
View file @
c579740a
...
...
@@ -27,9 +27,22 @@ void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Default
,
PipelineVersion
::
v1
>
{});
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
add_device_operation_instances
(
instances
,
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Interwave
,
PipelineVersion
::
v1
>
{});
#endif
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
add_device_operation_instances
(
instances
,
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
<
Mul_Clamp
,
LoopScheduler
::
Default
,
PipelineVersion
::
v2
>
{});
#endif
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment