Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7b01dbee
"...composable_kernel_rocm.git" did not exist on "9e80cdceb79f2bb4521d2d46eb585bde3fe21e26"
Commit
7b01dbee
authored
Dec 30, 2021
by
ltqin
Browse files
change disired grid size to parameters
parent
cca0ceee
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
104 additions
and
93 deletions
+104
-93
device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_kn_mn.cpp
.../device_gemm_xdl_splitk_instance_f32_f32_f32_km_kn_mn.cpp
+8
-8
device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_nk_mn.cpp
.../device_gemm_xdl_splitk_instance_f32_f32_f32_km_nk_mn.cpp
+8
-8
device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_kn_mn.cpp
.../device_gemm_xdl_splitk_instance_f32_f32_f32_mk_kn_mn.cpp
+9
-9
device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_nk_mn.cpp
.../device_gemm_xdl_splitk_instance_f32_f32_f32_mk_nk_mn.cpp
+13
-13
device_operation/include/device_gemm.hpp
device_operation/include/device_gemm.hpp
+2
-1
device_operation/include/device_gemm_instance.hpp
device_operation/include/device_gemm_instance.hpp
+1
-1
device_operation/include/device_gemm_splitk_xdl.hpp
device_operation/include/device_gemm_splitk_xdl.hpp
+19
-12
device_operation/include/device_gemm_xdl.hpp
device_operation/include/device_gemm_xdl.hpp
+2
-1
device_operation/include/device_gemm_xdl_instance.hpp
device_operation/include/device_gemm_xdl_instance.hpp
+24
-21
profiler/include/profile_gemm_impl.hpp
profiler/include/profile_gemm_impl.hpp
+0
-4
test/split_k/main.cpp
test/split_k/main.cpp
+18
-15
No files found.
device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_kn_mn.cpp
View file @
7b01dbee
...
@@ -28,14 +28,14 @@ using device_gemm_xdl_instance_f32_f32_f32_km_kn_mn = std::tuple<
...
@@ -28,14 +28,14 @@ using device_gemm_xdl_instance_f32_f32_f32_km_kn_mn = std::tuple<
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
256
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
256
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
256
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
256
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
64
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
64
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
64
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
64
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
// clang-format on
// clang-format on
>
;
>
;
...
...
device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_nk_mn.cpp
View file @
7b01dbee
...
@@ -28,14 +28,14 @@ using device_gemm_xdl_instance_f32_f32_f32_km_nk_mn = std::tuple<
...
@@ -28,14 +28,14 @@ using device_gemm_xdl_instance_f32_f32_f32_km_nk_mn = std::tuple<
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
256
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
256
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
256
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
256
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
64
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
64
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
64
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
64
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
// clang-format on
// clang-format on
>
;
>
;
...
...
device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_kn_mn.cpp
View file @
7b01dbee
...
@@ -28,15 +28,15 @@ using device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn = std::tuple<
...
@@ -28,15 +28,15 @@ using device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn = std::tuple<
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
96
,
128
,
4
,
8
,
16
,
16
,
3
,
4
,
S
<
1
,
1
,
3
,
4
>
,
S
<
1
,
4
,
32
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
8
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
96
,
128
,
4
,
8
,
16
,
16
,
3
,
4
,
S
<
1
,
1
,
3
,
4
>
,
S
<
1
,
4
,
32
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
8
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
256
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
256
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
256
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
256
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
64
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
64
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
64
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
,
720
>
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
64
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
4
,
7
,
1
,
true
,
true
>
>
;
>
;
template
<
>
template
<
>
...
...
device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_nk_mn.cpp
View file @
7b01dbee
...
@@ -28,19 +28,19 @@ using device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn = std::tuple<
...
@@ -28,19 +28,19 @@ using device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn = std::tuple<
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
256
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
256
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
256
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
256
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
128
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
64
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
64
,
128
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
64
,
64
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
64
,
64
,
64
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
128
,
64
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
64
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
256
,
64
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
128
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
32
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
128
,
32
,
128
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
1
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
64
,
64
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
64
,
64
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
,
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
64
,
32
,
64
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
,
720
>
DeviceGemmSplitKXdl
<
F32
,
F32
,
F32
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
64
,
32
,
64
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
1
,
1
,
2
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
4
,
4
,
7
,
1
,
true
,
true
>
// clang-format on
// clang-format on
>
;
>
;
...
...
device_operation/include/device_gemm.hpp
View file @
7b01dbee
...
@@ -25,7 +25,8 @@ struct DeviceGemm : public BaseOperator
...
@@ -25,7 +25,8 @@ struct DeviceGemm : public BaseOperator
ck
::
index_t
StrideC
,
ck
::
index_t
StrideC
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
=
0
;
CElementwiseOperation
c_element_op
,
ck
::
index_t
desired_gride_size
=
1
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
};
...
...
device_operation/include/device_gemm_instance.hpp
View file @
7b01dbee
...
@@ -12,7 +12,7 @@ namespace device_gemm_instance {
...
@@ -12,7 +12,7 @@ namespace device_gemm_instance {
using
DeviceGemmNoOpPtr
=
DeviceGemmPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
using
DeviceGemmNoOpPtr
=
DeviceGemmPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
template
<
typename
ADataType
,
template
<
typename
ADataType
,
typename
BDataType
,
typename
BDataType
,
typename
CDataType
,
typename
CDataType
,
...
...
device_operation/include/device_gemm_splitk_xdl.hpp
View file @
7b01dbee
...
@@ -56,8 +56,7 @@ template <typename ADataType,
...
@@ -56,8 +56,7 @@ template <typename ADataType,
ck
::
index_t
CThreadTransferSrcDstVectorDim
,
ck
::
index_t
CThreadTransferSrcDstVectorDim
,
ck
::
index_t
CThreadTransferDstScalarPerVector
,
ck
::
index_t
CThreadTransferDstScalarPerVector
,
bool
ABlockLdsAddExtraM
,
bool
ABlockLdsAddExtraM
,
bool
BBlockLdsAddExtraN
,
bool
BBlockLdsAddExtraN
>
ck
::
index_t
DesiredGridSize
>
struct
DeviceGemmSplitKXdl
struct
DeviceGemmSplitKXdl
:
public
DeviceGemm
<
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
>
:
public
DeviceGemm
<
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
>
{
{
...
@@ -147,7 +146,7 @@ struct DeviceGemmSplitKXdl
...
@@ -147,7 +146,7 @@ struct DeviceGemmSplitKXdl
}
}
}
}
static
auto
GetKBatchAndKPad
(
index_t
M
,
index_t
N
,
index_t
K
)
static
auto
GetKBatchAndKPad
(
index_t
M
,
index_t
N
,
index_t
K
,
index_t
DesiredGridSize
)
{
{
const
auto
GridMN
=
M
*
N
/
(
MPerBlock
*
NPerBlock
);
const
auto
GridMN
=
M
*
N
/
(
MPerBlock
*
NPerBlock
);
const
index_t
KBatch
=
std
::
max
(
DesiredGridSize
/
GridMN
,
1
);
const
index_t
KBatch
=
std
::
max
(
DesiredGridSize
/
GridMN
,
1
);
...
@@ -195,7 +194,7 @@ struct DeviceGemmSplitKXdl
...
@@ -195,7 +194,7 @@ struct DeviceGemmSplitKXdl
BBlockTransferSrcVectorDim
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_K1
,
BBlockTransferDstScalarPerVector_K1
,
false
,
// BThreadTransferSrcResetCoordinateAfterRun,
false
,
// BThreadTransferSrcResetCoordinateAfterRun,
BBlockLdsAddExtraN
,
BBlockLdsAddExtraN
,
Sequence
<
0
,
2
,
4
,
5
,
6
,
1
,
3
,
7
>
,
// CThreadTransferSrcDstAccessOrder,
Sequence
<
0
,
2
,
4
,
5
,
6
,
1
,
3
,
7
>
,
// CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim
,
CThreadTransferSrcDstVectorDim
,
...
@@ -236,7 +235,7 @@ struct DeviceGemmSplitKXdl
...
@@ -236,7 +235,7 @@ struct DeviceGemmSplitKXdl
BBlockTransferSrcVectorDim
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_K1
,
BBlockTransferDstScalarPerVector_K1
,
false
,
// BThreadTransferSrcResetCoordinateAfterRun,
false
,
// BThreadTransferSrcResetCoordinateAfterRun,
BBlockLdsAddExtraN
,
BBlockLdsAddExtraN
,
Sequence
<
0
,
2
,
4
,
5
,
6
,
1
,
3
,
7
>
,
// CThreadTransferSrcDstAccessOrder,
Sequence
<
0
,
2
,
4
,
5
,
6
,
1
,
3
,
7
>
,
// CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim
,
CThreadTransferSrcDstVectorDim
,
...
@@ -264,7 +263,8 @@ struct DeviceGemmSplitKXdl
...
@@ -264,7 +263,8 @@ struct DeviceGemmSplitKXdl
index_t
N01
,
index_t
N01
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
CElementwiseOperation
c_element_op
,
index_t
desired_grid_Size
)
:
p_a_grid_
{
p_a_grid
},
:
p_a_grid_
{
p_a_grid
},
p_b_grid_
{
p_b_grid
},
p_b_grid_
{
p_b_grid
},
p_c_grid_
{
p_c_grid
},
p_c_grid_
{
p_c_grid
},
...
@@ -277,10 +277,12 @@ struct DeviceGemmSplitKXdl
...
@@ -277,10 +277,12 @@ struct DeviceGemmSplitKXdl
N01_
{
N01
},
N01_
{
N01
},
a_element_op_
{
a_element_op
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
b_element_op_
{
b_element_op
},
c_element_op_
{
c_element_op
}
c_element_op_
{
c_element_op
},
desired_grid_size_
{
desired_grid_Size
}
{
{
int
KBatch
=
1
,
KPad
=
K
;
int
KBatch
=
1
,
KPad
=
K
;
std
::
tie
(
KBatch
,
KPad
)
=
DeviceGemmSplitKXdl
::
GetKBatchAndKPad
(
M
,
N
,
K
);
std
::
tie
(
KBatch
,
KPad
)
=
DeviceGemmSplitKXdl
::
GetKBatchAndKPad
(
M
,
N
,
K
,
desired_grid_size_
);
a_grid_desc_kbatch_k0_m_k1_
=
DeviceGemmSplitKXdl
::
MakeAGridDescriptor_KBatch_K0_M_K1
(
a_grid_desc_kbatch_k0_m_k1_
=
DeviceGemmSplitKXdl
::
MakeAGridDescriptor_KBatch_K0_M_K1
(
M
,
K
,
StrideA
,
KBatch
,
KPad
);
M
,
K
,
StrideA
,
KBatch
,
KPad
);
...
@@ -316,6 +318,7 @@ struct DeviceGemmSplitKXdl
...
@@ -316,6 +318,7 @@ struct DeviceGemmSplitKXdl
AElementwiseOperation
a_element_op_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
BElementwiseOperation
b_element_op_
;
CElementwiseOperation
c_element_op_
;
CElementwiseOperation
c_element_op_
;
index_t
desired_grid_size_
;
};
};
// Invoker
// Invoker
...
@@ -524,7 +527,8 @@ struct DeviceGemmSplitKXdl
...
@@ -524,7 +527,8 @@ struct DeviceGemmSplitKXdl
index_t
StrideC
,
index_t
StrideC
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
CElementwiseOperation
c_element_op
,
index_t
desired_grid_Size
)
{
{
return
Argument
{
p_a
,
return
Argument
{
p_a
,
p_b
,
p_b
,
...
@@ -539,7 +543,8 @@ struct DeviceGemmSplitKXdl
...
@@ -539,7 +543,8 @@ struct DeviceGemmSplitKXdl
1
,
1
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
c_element_op
};
c_element_op
,
desired_grid_Size
};
}
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
...
@@ -556,7 +561,8 @@ struct DeviceGemmSplitKXdl
...
@@ -556,7 +561,8 @@ struct DeviceGemmSplitKXdl
index_t
StrideC
,
index_t
StrideC
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
override
CElementwiseOperation
c_element_op
,
ck
::
index_t
desired_gride_size
=
1
)
override
{
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
static_cast
<
const
BDataType
*>
(
p_b
),
static_cast
<
const
BDataType
*>
(
p_b
),
...
@@ -571,7 +577,8 @@ struct DeviceGemmSplitKXdl
...
@@ -571,7 +577,8 @@ struct DeviceGemmSplitKXdl
1
,
1
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
c_element_op
);
c_element_op
,
desired_gride_size
);
}
}
// polymorphic
// polymorphic
...
...
device_operation/include/device_gemm_xdl.hpp
View file @
7b01dbee
...
@@ -408,7 +408,8 @@ struct DeviceGemmXdl
...
@@ -408,7 +408,8 @@ struct DeviceGemmXdl
index_t
StrideC
,
index_t
StrideC
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
override
CElementwiseOperation
c_element_op
,
ck
::
index_t
)
override
{
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
static_cast
<
const
BDataType
*>
(
p_b
),
static_cast
<
const
BDataType
*>
(
p_b
),
...
...
device_operation/include/device_gemm_xdl_instance.hpp
View file @
7b01dbee
...
@@ -7,36 +7,39 @@ namespace device_gemm_instance {
...
@@ -7,36 +7,39 @@ namespace device_gemm_instance {
template
<
>
template
<
>
void
add_device_splitk_gemm_instance
<
float
,
void
add_device_splitk_gemm_instance
<
float
,
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
template
<
>
template
<
>
void
add_device_splitk_gemm_instance
<
float
,
void
add_device_splitk_gemm_instance
<
float
,
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
template
<
>
template
<
>
void
add_device_splitk_gemm_instance
<
float
,
void
add_device_splitk_gemm_instance
<
float
,
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
template
<
>
template
<
>
void
add_device_splitk_gemm_instance
<
float
,
void
add_device_splitk_gemm_instance
<
float
,
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
}
// namespace device_gemm_instance
}
// namespace device_gemm_instance
}
// namespace device
}
// namespace device
...
...
profiler/include/profile_gemm_impl.hpp
View file @
7b01dbee
...
@@ -6,10 +6,6 @@ namespace tensor_operation {
...
@@ -6,10 +6,6 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
device_gemm_instance
{
namespace
device_gemm_instance
{
using
DeviceGemmNoOpPtr
=
DeviceGemmPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
template
<
>
template
<
>
void
add_device_gemm_instance
<
float
,
void
add_device_gemm_instance
<
float
,
float
,
float
,
...
...
test/split_k/main.cpp
View file @
7b01dbee
...
@@ -25,12 +25,13 @@ using DeviceGemmNoOpPtr =
...
@@ -25,12 +25,13 @@ using DeviceGemmNoOpPtr =
ck
::
tensor_operation
::
device
::
DeviceGemmPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
device
::
DeviceGemmPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
using
GEMM_PTR
=
std
::
vector
<
DeviceGemmNoOpPtr
>
;
using
GEMM_PTR
=
std
::
vector
<
DeviceGemmNoOpPtr
>
;
static
std
::
vector
<
std
::
vector
<
bool
>>&
GetLayoutType
(){
static
std
::
vector
<
std
::
vector
<
bool
>>&
GetLayoutType
()
static
std
::
vector
<
std
::
vector
<
bool
>>
LayOut
=
{{
0
,
0
,
0
},
{
0
,
1
,
0
},
{
1
,
0
,
0
},
{
1
,
1
,
0
}};
{
static
std
::
vector
<
std
::
vector
<
bool
>>
LayOut
=
{{
0
,
0
,
0
},
{
0
,
1
,
0
},
{
1
,
0
,
0
},
{
1
,
1
,
0
}};
return
LayOut
;
return
LayOut
;
}
}
static
void
add_device_gemm_instance_mk_kn_mn
(
GEMM_PTR
&
gemm_ptrs
)
static
void
add_device_gemm_instance_mk_kn_mn
(
GEMM_PTR
&
gemm_ptrs
)
{
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_splitk_gemm_instance
<
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_splitk_gemm_instance
<
...
@@ -74,14 +75,14 @@ static void add_device_gemm_instance_km_nk_mn(GEMM_PTR& gemm_ptrs)
...
@@ -74,14 +75,14 @@ static void add_device_gemm_instance_km_nk_mn(GEMM_PTR& gemm_ptrs)
static
auto
&
GetAddDeviceGemmInstance
()
static
auto
&
GetAddDeviceGemmInstance
()
{
{
static
std
::
vector
<
void
(
*
)(
GEMM_PTR
&
)
>
AddDeviceGemmInstance
=
{
add_device_gemm_instance_mk_kn_mn
,
static
std
::
vector
<
void
(
*
)(
GEMM_PTR
&
)
>
AddDeviceGemmInstance
=
{
add_device_gemm_instance_mk_nk_mn
,
add_device_gemm_instance_mk_kn_mn
,
add_device_gemm_instance_km_kn_mn
,
add_device_gemm_instance_mk_nk_mn
,
add_device_gemm_instance_km_nk_mn
};
add_device_gemm_instance_km_kn_mn
,
add_device_gemm_instance_km_nk_mn
};
return
AddDeviceGemmInstance
;
return
AddDeviceGemmInstance
;
}
}
static
void
add_device_gemm_instance
(
GEMM_PTR
&
gemm_ptrs
,
int
layout
)
static
void
add_device_gemm_instance
(
GEMM_PTR
&
gemm_ptrs
,
int
layout
)
{
{
GetAddDeviceGemmInstance
()[
layout
](
gemm_ptrs
);
GetAddDeviceGemmInstance
()[
layout
](
gemm_ptrs
);
...
@@ -105,13 +106,13 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
...
@@ -105,13 +106,13 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
}
}
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
if
(
argc
!=
8
)
if
(
argc
!=
9
)
{
{
printf
(
"arg1: matrix layout (0: A[m, k] * B[k, n] = C[m, n];
\n
"
);
printf
(
"arg1: matrix layout (0: A[m, k] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 1: A[m, k] * B[n, k] = C[m, n];
\n
"
);
printf
(
" 1: A[m, k] * B[n, k] = C[m, n];
\n
"
);
printf
(
" 2: A[k, n] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 2: A[k, n] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 3: A[k, n] * B[n, k] = C[m, n])
\n
"
);
printf
(
" 3: A[k, n] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg2 to 7: M, N, K, StrideA, StrideB, StrideC
\n
"
);
printf
(
"arg2 to 7: M, N, K, StrideA, StrideB, StrideC
DesiredGridSize
\n
"
);
return
1
;
return
1
;
}
}
...
@@ -121,9 +122,10 @@ int main(int argc, char* argv[])
...
@@ -121,9 +122,10 @@ int main(int argc, char* argv[])
const
int
N
=
std
::
stoi
(
argv
[
3
]);
const
int
N
=
std
::
stoi
(
argv
[
3
]);
const
int
K
=
std
::
stoi
(
argv
[
4
]);
const
int
K
=
std
::
stoi
(
argv
[
4
]);
const
int
StrideA
=
std
::
stoi
(
argv
[
5
]);
const
int
StrideA
=
std
::
stoi
(
argv
[
5
]);
const
int
StrideB
=
std
::
stoi
(
argv
[
6
]);
const
int
StrideB
=
std
::
stoi
(
argv
[
6
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
7
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
7
]);
const
int
DesiredGridSize
=
std
::
stoi
(
argv
[
8
]);
if
(
layout
>
3
||
layout
<
0
)
if
(
layout
>
3
||
layout
<
0
)
{
{
...
@@ -191,7 +193,8 @@ int main(int argc, char* argv[])
...
@@ -191,7 +193,8 @@ int main(int argc, char* argv[])
StrideC
,
StrideC
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{});
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
DesiredGridSize
);
auto
invoker_ptr
=
gemm_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
gemm_ptr
->
MakeInvokerPointer
();
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment