add failed tuning params

a5aa963c · Jing Zhang · 852e9bd2 · a5aa963c
Commit a5aa963c authored Dec 07, 2021 by Jing Zhang
Show whitespace changes
Inline Side-by-side

Showing with 27 additions and 0 deletions

host/driver_offline/include/device_gemm_xdlops_mk_kn_mn.hpp host/driver_offline/include/device_gemm_xdlops_mk_kn_mn.hpp +27 -0

No files found.
--- a/host/driver_offline/include/device_gemm_xdlops_mk_kn_mn.hpp
+++ b/host/driver_offline/include/device_gemm_xdlops_mk_kn_mn.hpp
@@ -333,6 +333,33 @@ void device_gemm_xdlops_mk_kn_mn(const Tensor<ABType>& a_m_k,
 #elif 1
    constexpr index_t BlockSize = 64;
+    constexpr index_t MPerBlock = 48;
+    constexpr index_t NPerBlock = 32;
+    constexpr index_t KPerBlock = 4;
+    constexpr index_t MPerXDL = 16;
+    constexpr index_t NPerXDL = 16;
+    constexpr index_t K1      = 8;
+    constexpr index_t MRepeat = 3;
+    constexpr index_t NRepeat = 2;
+    using ABlockTransferThreadSliceLengths_K0_M_K1   = Sequence<4, 1, 8>;
+    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<1, 48, 1>;
+    constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
+    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
+    using BBlockTransferThreadSliceLengths_K0_N_K1   = Sequence<4, 1, 8>;
+    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<1, 32, 1>;
+    constexpr index_t BBlockTransferSrcScalarPerVector_N  = 1;
+    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
+    constexpr index_t CThreadTransferDstScalarPerVector = 1;
+#elif 1
+    constexpr index_t BlockSize = 64;
    constexpr index_t MPerBlock = 48;
    constexpr index_t NPerBlock = 16;
    constexpr index_t KPerBlock = 4;