Commit da21d4a9 authored by Chao Liu's avatar Chao Liu
Browse files

add gemm kernel: mk_kn_mn and km_kn_mn

parent fbc58d9b
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
/// @brief Host-side benchmark driver for an XDLOPS GEMM with layout
///        A = [K, M] (M contiguous), B = [K, N] (N contiguous), C = [M, N]:
///        C(m, n) = sum_k A(k, m) * B(k, n).
///
/// Copies the host tensors to device memory, splits K into K0 x K1
/// (K1 is the packed/vector dimension consumed by the xdlops pipeline),
/// launches driver_gemm_xdlops_v2r3 five times (each launch itself averaged
/// over `nrepeat` kernel runs), prints average time and TFlop/s, and copies
/// the device result back into c_m_n.
///
/// @param a_k_m_grid_desc tensor descriptor for A, lengths [K, M]
/// @param b_k_n_grid_desc tensor descriptor for B, lengths [K, N]
/// @param c_m_n_grid_desc tensor descriptor for C, lengths [M, N]
/// @param a_k_m           host A tensor (read only)
/// @param b_k_n           host B tensor (read only)
/// @param c_m_n           host C tensor (overwritten with the device result)
/// @param nrepeat         kernel launches averaged per timing measurement
///
/// @note K must be divisible by K1 (= 8 here); K0 = K / K1 is formed below
///       with no divisibility check -- TODO confirm callers guarantee this.
template <typename ABType,
typename AccType,
typename CType,
typename ADesc,
typename BDesc,
typename CDesc>
void device_gemm_xdlops_km_kn_mn(const ADesc& a_k_m_grid_desc,
const BDesc& b_k_n_grid_desc,
const CDesc& c_m_n_grid_desc,
const Tensor<ABType>& a_k_m,
const Tensor<ABType>& b_k_n,
Tensor<CType>& c_m_n,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// Allocate device buffers sized by element *space* (covers strided/padded
// layouts, not just the element count).
DeviceMem a_k_m_device_buf(sizeof(ABType) * a_k_m.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());
a_k_m_device_buf.ToDevice(a_k_m.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
// NOTE(review): C is uploaded even though the kernel runs with
// InMemoryDataOperationEnum_t::Set (C is fully overwritten) -- presumably
// just to avoid reading back uninitialized device memory; confirm.
c_m_n_device_buf.ToDevice(c_m_n.mData.data());
#if 1
// Tuning configuration (the only one compiled in for this layout).
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4; // K0 tiles consumed per main-loop iteration
constexpr index_t MPerXDL = 32;  // xdlops (MFMA) instruction tile sizes
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;        // packed K elements per vector
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
// Per-thread slice / thread-cluster shapes for the A and B global->LDS copies.
// A is M-contiguous in this layout, so the A source vector dimension is M.
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif
// Problem sizes read from the descriptors: A is [K, M], B is [K, N].
const auto K = a_k_m_grid_desc.GetLength(I0);
const auto M = a_k_m_grid_desc.GetLength(I1);
const auto N = b_k_n_grid_desc.GetLength(I1);
constexpr auto K1Number = Number<K1>{};
const auto K0 = K / K1Number; // assumes K % K1 == 0 (see @note above)
// Re-describe A as [K0, M, K1]: dim 0 of [K, M] unmerges into (K0, K1)
// mapped to output dims (0, 2); M passes through to output dim 1.
const auto a_k0_m_k1_grid_desc =
transform_tensor_descriptor(a_k_m_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(M)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
// Same unmerge for B: [K, N] -> [K0, N, K1].
const auto b_k0_n_k1_grid_desc =
transform_tensor_descriptor(b_k_n_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
// (all-zero here: no special-case index arithmetic is enabled for these
// plain naive descriptors).
constexpr auto a_k0_m_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: M
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: M
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: N
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: N
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N2
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
// Run the timed launch 5 times so throughput variance is visible in the log.
for(index_t i = 0; i < 5; ++i)
{
float ave_time =
driver_gemm_xdlops_v2r3<BlockSize,
ABType,
AccType,
CType,
InMemoryDataOperationEnum_t::Set, // C = A*B (no accumulate into prior C)
decltype(a_k0_m_k1_grid_desc),
decltype(b_k0_n_k1_grid_desc),
decltype(c_m_n_grid_desc),
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
Sequence<0, 2, 1>, // A thread cluster arrange order
Sequence<0, 2, 1>, // A src access order
1, // A src vector dim = M (dim 1): A is M-contiguous in KM layout
ABlockTransferSrcScalarPerVector_M,
ABlockTransferDstScalarPerVector_K1,
false, // don't move back src coordinate after threadwise copy
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
Sequence<0, 2, 1>, // B thread cluster arrange order
Sequence<0, 2, 1>, // B src access order
1, // B src vector dim = N (dim 1): B is N-contiguous
BBlockTransferSrcScalarPerVector_N,
BBlockTransferDstScalarPerVector_K1,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // C thread copy access order
7, // C dst vector dim = N2 (dim 7)
CThreadTransferDstScalarPerVector,
decltype(a_k0_m_k1_grid_step_hacks),
decltype(b_k0_n_k1_grid_step_hacks),
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
>(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m_n_grid_desc,
a_k0_m_k1_grid_step_hacks,
b_k0_n_k1_grid_step_hacks,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
a_k0_m_k1_grid_move_slice_window_step_hacks,
b_k0_n_k1_grid_move_slice_window_step_hacks,
nrepeat);
// 2*M*N*K flops / 1e9, divided by time in ms => TFlop/s.
float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
// copy result back to host
c_m_n_device_buf.FromDevice(c_m_n.mData.data());
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
/// @brief Host-side benchmark driver for an XDLOPS GEMM with layout
///        A = [M, K] (K contiguous), B = [K, N] (N contiguous), C = [M, N]:
///        C(m, n) = sum_k A(m, k) * B(k, n).
///
/// Copies the host tensors to device memory, splits K into K0 x K1
/// (K1 is the packed/vector dimension consumed by the xdlops pipeline),
/// launches driver_gemm_xdlops_v2r3 five times (each launch itself averaged
/// over `nrepeat` kernel runs), prints average time and TFlop/s, and copies
/// the device result back into c_m_n.
///
/// @param a_m_k_grid_desc tensor descriptor for A, lengths [M, K]
/// @param b_k_n_grid_desc tensor descriptor for B, lengths [K, N]
/// @param c_m_n_grid_desc tensor descriptor for C, lengths [M, N]
/// @param a_m_k           host A tensor (read only)
/// @param b_k_n           host B tensor (read only)
/// @param c_m_n           host C tensor (overwritten with the device result)
/// @param nrepeat         kernel launches averaged per timing measurement
///
/// @note K must be divisible by K1 (= 8 here); K0 = K / K1 is formed below
///       with no divisibility check -- TODO confirm callers guarantee this.
template <typename ABType,
typename AccType,
typename CType,
typename ADesc,
typename BDesc,
typename CDesc>
void device_gemm_xdlops_mk_kn_mn(const ADesc& a_m_k_grid_desc,
const BDesc& b_k_n_grid_desc,
const CDesc& c_m_n_grid_desc,
const Tensor<ABType>& a_m_k,
const Tensor<ABType>& b_k_n,
Tensor<CType>& c_m_n,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// Allocate device buffers sized by element *space* (covers strided/padded
// layouts, not just the element count).
DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
// NOTE(review): C is uploaded even though the kernel runs with
// InMemoryDataOperationEnum_t::Set (C is fully overwritten) -- presumably
// just to avoid reading back uninitialized device memory; confirm.
c_m_n_device_buf.ToDevice(c_m_n.mData.data());
#if 1
// Active tuning configuration; the #elif branch below is an alternative
// 128x256 tile kept for experiments (flip the preprocessor conditions to use it).
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4; // K0 tiles consumed per main-loop iteration
constexpr index_t MPerXDL = 32;  // xdlops (MFMA) instruction tile sizes
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;        // packed K elements per vector
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
// Per-thread slice / thread-cluster shapes for the A and B global->LDS copies.
// A is K-contiguous in this layout, so the A source vector dimension is K1.
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif
// Problem sizes read from the descriptors: A is [M, K], B is [K, N].
const auto K = a_m_k_grid_desc.GetLength(I1);
const auto M = a_m_k_grid_desc.GetLength(I0);
const auto N = b_k_n_grid_desc.GetLength(I1);
constexpr auto K1Number = Number<K1>{};
const auto K0 = K / K1Number; // assumes K % K1 == 0 (see @note above)
// Re-describe A as [K0, M, K1]: dim 1 of [M, K] unmerges into (K0, K1)
// mapped to output dims (0, 2); M passes through to output dim 1.
const auto a_k0_m_k1_grid_desc =
transform_tensor_descriptor(a_m_k_grid_desc,
make_tuple(make_pass_through_transform(M),
make_unmerge_transform(make_tuple(K0, K1Number))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
// Same unmerge for B: [K, N] -> [K0, N, K1].
const auto b_k0_n_k1_grid_desc =
transform_tensor_descriptor(b_k_n_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
// (all-zero here: no special-case index arithmetic is enabled for these
// plain naive descriptors).
constexpr auto a_k0_m_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: M
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: M
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto b_k0_n_k1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, // 0+: K0
Sequence<0, 0, 0>{}, // 1+: N
Sequence<0, 0, 0>{}), // 2+: K1
make_tuple(Sequence<0, 0, 0>{}, // 0-: K0
Sequence<0, 0, 0>{}, // 1-: N
Sequence<0, 0, 0>{})); // 2-: K1
constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N2
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0>{};
// Run the timed launch 5 times so throughput variance is visible in the log.
for(index_t i = 0; i < 5; ++i)
{
float ave_time =
driver_gemm_xdlops_v2r3<BlockSize,
ABType,
AccType,
CType,
InMemoryDataOperationEnum_t::Set, // C = A*B (no accumulate into prior C)
decltype(a_k0_m_k1_grid_desc),
decltype(b_k0_n_k1_grid_desc),
decltype(c_m_n_grid_desc),
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
Sequence<1, 0, 2>, // A thread cluster arrange order
Sequence<1, 0, 2>, // A src access order
2, // A src vector dim = K1 (dim 2): A is K-contiguous in MK layout
ABlockTransferSrcScalarPerVector_K1,
ABlockTransferDstScalarPerVector_K1,
false, // don't move back src coordinate after threadwise copy
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
Sequence<0, 2, 1>, // B thread cluster arrange order
Sequence<0, 2, 1>, // B src access order
1, // B src vector dim = N (dim 1): B is N-contiguous
BBlockTransferSrcScalarPerVector_N,
BBlockTransferDstScalarPerVector_K1,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // C thread copy access order
7, // C dst vector dim = N2 (dim 7)
CThreadTransferDstScalarPerVector,
decltype(a_k0_m_k1_grid_step_hacks),
decltype(b_k0_n_k1_grid_step_hacks),
decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
>(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m_n_grid_desc,
a_k0_m_k1_grid_step_hacks,
b_k0_n_k1_grid_step_hacks,
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
a_k0_m_k1_grid_move_slice_window_step_hacks,
b_k0_n_k1_grid_move_slice_window_step_hacks,
nrepeat);
// 2*M*N*K flops / 1e9, divided by time in ms => TFlop/s.
float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
// copy result back to host
c_m_n_device_buf.FromDevice(c_m_n.mData.data());
}
#pragma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
......
...@@ -12,14 +12,20 @@ ...@@ -12,14 +12,20 @@
#include "gemm_common.hpp" #include "gemm_common.hpp"
#include "host_gemm.hpp" #include "host_gemm.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_gemm_xdlops_mk_kn_mn.hpp"
#include "device_gemm_xdlops_mk_nk_mn.hpp" #include "device_gemm_xdlops_mk_nk_mn.hpp"
#include "device_gemm_xdlops_km_kn_mn.hpp"
#define USE_GEMM_XDL_MK_NK_MN 1 #define USE_GEMM_XDL_MK_KN_MN 0
#define USE_GEMM_XDL_MK_NK_MN 0
#define USE_GEMM_XDL_KM_KN_MN 1
enum GemmAlgo enum GemmAlgo
{ {
Xdl_MK_KN_MN, // 0 Xdl_MK_KN_MN, // 0
Xdl_MK_NK_MN, // 1 Xdl_MK_NK_MN, // 1
Xdl_KM_KN_MN, // 2
Xdl_KM_NK_MN, // 3
}; };
int main(int argc, char* argv[]) int main(int argc, char* argv[])
...@@ -66,11 +72,11 @@ int main(int argc, char* argv[]) ...@@ -66,11 +72,11 @@ int main(int argc, char* argv[])
std::vector<std::size_t> a_lengths_host(2), b_lengths_host(2), c_lengths_host(2); std::vector<std::size_t> a_lengths_host(2), b_lengths_host(2), c_lengths_host(2);
std::vector<std::size_t> a_strides_host(2), b_strides_host(2), c_strides_host(2); std::vector<std::size_t> a_strides_host(2), b_strides_host(2), c_strides_host(2);
if(layout == GemmMatrixLayout::KM_KN_MN) if(layout == GemmMatrixLayout::MK_KN_MN)
{ {
a_lengths_host[0] = static_cast<std::size_t>(K); a_lengths_host[0] = static_cast<std::size_t>(M);
a_lengths_host[1] = static_cast<std::size_t>(M); a_lengths_host[1] = static_cast<std::size_t>(K);
a_strides_host[0] = static_cast<std::size_t>(M); a_strides_host[0] = static_cast<std::size_t>(K);
a_strides_host[1] = static_cast<std::size_t>(1); a_strides_host[1] = static_cast<std::size_t>(1);
b_lengths_host[0] = static_cast<std::size_t>(K); b_lengths_host[0] = static_cast<std::size_t>(K);
...@@ -100,6 +106,23 @@ int main(int argc, char* argv[]) ...@@ -100,6 +106,23 @@ int main(int argc, char* argv[])
c_strides_host[0] = static_cast<std::size_t>(N); c_strides_host[0] = static_cast<std::size_t>(N);
c_strides_host[1] = static_cast<std::size_t>(1); c_strides_host[1] = static_cast<std::size_t>(1);
} }
else if(layout == GemmMatrixLayout::KM_KN_MN)
{
a_lengths_host[0] = static_cast<std::size_t>(K);
a_lengths_host[1] = static_cast<std::size_t>(M);
a_strides_host[0] = static_cast<std::size_t>(M);
a_strides_host[1] = static_cast<std::size_t>(1);
b_lengths_host[0] = static_cast<std::size_t>(K);
b_lengths_host[1] = static_cast<std::size_t>(N);
b_strides_host[0] = static_cast<std::size_t>(N);
b_strides_host[1] = static_cast<std::size_t>(1);
c_lengths_host[0] = static_cast<std::size_t>(M);
c_lengths_host[1] = static_cast<std::size_t>(N);
c_strides_host[0] = static_cast<std::size_t>(N);
c_strides_host[1] = static_cast<std::size_t>(1);
}
else else
{ {
std::runtime_error("wrong! not implemented"); std::runtime_error("wrong! not implemented");
...@@ -143,6 +166,14 @@ int main(int argc, char* argv[]) ...@@ -143,6 +166,14 @@ int main(int argc, char* argv[])
b.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread); b.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
} }
auto f_make_for_device_mk_kn_mn = [&]() {
const auto a_desc = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(K, I1));
const auto b_desc = make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(N, I1));
const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));
return make_tuple(a_desc, b_desc, c_desc);
};
auto f_make_for_device_mk_nk_mn = [&]() { auto f_make_for_device_mk_nk_mn = [&]() {
const auto a_desc = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(K, I1)); const auto a_desc = make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(K, I1));
const auto b_desc = make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(K, I1)); const auto b_desc = make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(K, I1));
...@@ -151,6 +182,29 @@ int main(int argc, char* argv[]) ...@@ -151,6 +182,29 @@ int main(int argc, char* argv[])
return make_tuple(a_desc, b_desc, c_desc); return make_tuple(a_desc, b_desc, c_desc);
}; };
auto f_make_for_device_km_kn_mn = [&]() {
const auto a_desc = make_naive_tensor_descriptor(make_tuple(K, M), make_tuple(M, I1));
const auto b_desc = make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(N, I1));
const auto c_desc = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(N, I1));
return make_tuple(a_desc, b_desc, c_desc);
};
#if USE_GEMM_XDL_MK_KN_MN
if(algo == GemmAlgo::Xdl_MK_KN_MN)
{
if(layout != GemmMatrixLayout::MK_KN_MN)
{
throw std::runtime_error("wrong! layout");
}
const auto descs = f_make_for_device_mk_kn_mn();
device_gemm_xdlops_mk_kn_mn<ab_data_t, acc_data_t, c_data_t>(
descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
}
#endif
#if USE_GEMM_XDL_MK_NK_MN #if USE_GEMM_XDL_MK_NK_MN
if(algo == GemmAlgo::Xdl_MK_NK_MN) if(algo == GemmAlgo::Xdl_MK_NK_MN)
{ {
...@@ -166,6 +220,21 @@ int main(int argc, char* argv[]) ...@@ -166,6 +220,21 @@ int main(int argc, char* argv[])
} }
#endif #endif
#if USE_GEMM_XDL_KM_KN_MN
if(algo == GemmAlgo::Xdl_KM_KN_MN)
{
if(layout != GemmMatrixLayout::KM_KN_MN)
{
throw std::runtime_error("wrong! layout");
}
const auto descs = f_make_for_device_km_kn_mn();
device_gemm_xdlops_km_kn_mn<ab_data_t, acc_data_t, c_data_t>(
descs[I0], descs[I1], descs[I2], a, b, c_device, nrepeat);
}
#endif
if(do_verification) if(do_verification)
{ {
host_gemm(a, b, c_host, layout); host_gemm(a, b, c_host, layout);
......
...@@ -4,9 +4,9 @@ ...@@ -4,9 +4,9 @@
enum GemmMatrixLayout enum GemmMatrixLayout
{ {
MK_KN_MN, // 0 MK_KN_MN, // 0
KM_KN_MN, // 1 MK_NK_MN, // 1
KM_NK_MN, // 2 KM_KN_MN, // 2
MK_NK_MN, // 3 KM_NK_MN, // 3
}; };
#endif #endif
...@@ -8,24 +8,60 @@ void host_gemm(const Tensor<AType>& a, ...@@ -8,24 +8,60 @@ void host_gemm(const Tensor<AType>& a,
Tensor<CType>& c, Tensor<CType>& c,
const GemmMatrixLayout layout) const GemmMatrixLayout layout)
{ {
auto f_mk_nk_mn = [&](auto m, auto n) { if(layout == GemmMatrixLayout::MK_KN_MN)
const int K = a.mDesc.GetLengths()[1]; {
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[1];
double v = 0; double v = 0;
for(int k = 0; k < K; ++k) for(int k = 0; k < K; ++k)
{ {
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k)); v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
} }
c(m, n) = v; c(m, n) = v;
}; };
if(layout == GemmMatrixLayout::MK_NK_MN) make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_NK_MN)
{ {
auto f_mk_nk_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])( make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
} }
else if(layout == GemmMatrixLayout::KM_KN_MN)
{
auto f_km_kn_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else else
{ {
throw std::runtime_error("wrong! not supported layout"); throw std::runtime_error("wrong! not supported layout");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment