Add new instances and support for small cases in DPP8 GEMM (#896)

547dbcfb · Bartlomiej Wroblewski · GitHub · 85e2e1e2 · 547dbcfb · 547dbcfb
Unverified Commit 547dbcfb authored Sep 12, 2023 by Bartlomiej Wroblewski Committed by GitHub Sep 12, 2023
11 changed files
--- a/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp
@@ -11,8 +11,14 @@ namespace ck {

 enum struct DppInstr
 {
-    dpp8_f16_16x16x2 = 0,
+    dpp8_f16_1x32x2 = 0,
+    dpp8_f16_2x16x2,
+    dpp8_f16_2x32x2,
+    dpp8_f16_4x16x2,
+    dpp8_f16_4x32x2,
+    dpp8_f16_8x16x2,
    dpp8_f16_8x32x2,
+    dpp8_f16_16x16x2,
    dpp8_f16_32x8x2
 };

@@ -101,6 +107,36 @@ struct dpp_type<DppInstr::dpp8_f16_8x32x2>
    }
 };

+template <>
+struct dpp_type<DppInstr::dpp8_f16_8x16x2>
+{
+    static constexpr index_t wave_size       = 32;
+    static constexpr index_t lanegroup_size  = 8;
+    static constexpr index_t m_per_wave      = 8;
+    static constexpr index_t n_per_wave      = 16;
+    static constexpr index_t m_per_lanegroup = 4;
+    static constexpr index_t n_per_lanegroup = 8;
+    static constexpr index_t m_per_thread    = 4;
+    static constexpr index_t n_per_thread    = 1;
+    static constexpr index_t k_per_dpp       = 2;
+    static constexpr bool share_a            = true;
+    using BaseType                           = half_t;
+
+    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
+    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
+    {
+        dpp8::DppLanegroupGemm<m_per_thread,
+                               n_per_thread,
+                               k_per_dpp,
+                               BaseType,
+                               ADataType,
+                               BDataType,
+                               CDataType,
+                               share_a>{}
+            .Run(a, b, reg_c);
+    }
+};
+
 template <>
 struct dpp_type<DppInstr::dpp8_f16_16x16x2>
 {
@@ -131,6 +167,156 @@ struct dpp_type<DppInstr::dpp8_f16_16x16x2>
    }
 };

+template <>
+struct dpp_type<DppInstr::dpp8_f16_4x32x2>
+{
+    static constexpr index_t wave_size       = 32;
+    static constexpr index_t lanegroup_size  = 8;
+    static constexpr index_t m_per_wave      = 4;
+    static constexpr index_t n_per_wave      = 32;
+    static constexpr index_t m_per_lanegroup = 4;
+    static constexpr index_t n_per_lanegroup = 8;
+    static constexpr index_t m_per_thread    = 4;
+    static constexpr index_t n_per_thread    = 1;
+    static constexpr index_t k_per_dpp       = 2;
+    static constexpr bool share_a            = true;
+    using BaseType                           = half_t;
+
+    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
+    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
+    {
+        dpp8::DppLanegroupGemm<m_per_thread,
+                               n_per_thread,
+                               k_per_dpp,
+                               BaseType,
+                               ADataType,
+                               BDataType,
+                               CDataType,
+                               share_a>{}
+            .Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct dpp_type<DppInstr::dpp8_f16_4x16x2>
+{
+    static constexpr index_t wave_size       = 32;
+    static constexpr index_t lanegroup_size  = 8;
+    static constexpr index_t m_per_wave      = 4;
+    static constexpr index_t n_per_wave      = 16;
+    static constexpr index_t m_per_lanegroup = 2;
+    static constexpr index_t n_per_lanegroup = 8;
+    static constexpr index_t m_per_thread    = 2;
+    static constexpr index_t n_per_thread    = 1;
+    static constexpr index_t k_per_dpp       = 2;
+    static constexpr bool share_a            = true;
+    using BaseType                           = half_t;
+
+    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
+    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
+    {
+        dpp8::DppLanegroupGemm<m_per_thread,
+                               n_per_thread,
+                               k_per_dpp,
+                               BaseType,
+                               ADataType,
+                               BDataType,
+                               CDataType,
+                               share_a>{}
+            .Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct dpp_type<DppInstr::dpp8_f16_1x32x2>
+{
+    static constexpr index_t wave_size       = 32;
+    static constexpr index_t lanegroup_size  = 8;
+    static constexpr index_t m_per_wave      = 1;
+    static constexpr index_t n_per_wave      = 32;
+    static constexpr index_t m_per_lanegroup = 1;
+    static constexpr index_t n_per_lanegroup = 8;
+    static constexpr index_t m_per_thread    = 1;
+    static constexpr index_t n_per_thread    = 1;
+    static constexpr index_t k_per_dpp       = 2;
+    static constexpr bool share_a            = true;
+    using BaseType                           = half_t;
+
+    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
+    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
+    {
+        dpp8::DppLanegroupGemm<m_per_thread,
+                               n_per_thread,
+                               k_per_dpp,
+                               BaseType,
+                               ADataType,
+                               BDataType,
+                               CDataType,
+                               share_a>{}
+            .Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct dpp_type<DppInstr::dpp8_f16_2x32x2>
+{
+    static constexpr index_t wave_size       = 32;
+    static constexpr index_t lanegroup_size  = 8;
+    static constexpr index_t m_per_wave      = 2;
+    static constexpr index_t n_per_wave      = 32;
+    static constexpr index_t m_per_lanegroup = 2;
+    static constexpr index_t n_per_lanegroup = 8;
+    static constexpr index_t m_per_thread    = 2;
+    static constexpr index_t n_per_thread    = 1;
+    static constexpr index_t k_per_dpp       = 2;
+    static constexpr bool share_a            = true;
+    using BaseType                           = half_t;
+
+    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
+    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
+    {
+        dpp8::DppLanegroupGemm<m_per_thread,
+                               n_per_thread,
+                               k_per_dpp,
+                               BaseType,
+                               ADataType,
+                               BDataType,
+                               CDataType,
+                               share_a>{}
+            .Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct dpp_type<DppInstr::dpp8_f16_2x16x2>
+{
+    static constexpr index_t wave_size       = 32;
+    static constexpr index_t lanegroup_size  = 8;
+    static constexpr index_t m_per_wave      = 2;
+    static constexpr index_t n_per_wave      = 16;
+    static constexpr index_t m_per_lanegroup = 1;
+    static constexpr index_t n_per_lanegroup = 8;
+    static constexpr index_t m_per_thread    = 1;
+    static constexpr index_t n_per_thread    = 1;
+    static constexpr index_t k_per_dpp       = 2;
+    static constexpr bool share_a            = true;
+    using BaseType                           = half_t;
+
+    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
+    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
+    {
+        dpp8::DppLanegroupGemm<m_per_thread,
+                               n_per_thread,
+                               k_per_dpp,
+                               BaseType,
+                               ADataType,
+                               BDataType,
+                               CDataType,
+                               share_a>{}
+            .Run(a, b, reg_c);
+    }
+};
+
 template <typename BaseType, index_t MPerDpp, index_t NPerDpp>
 struct DppSelector
 {
@@ -143,6 +329,12 @@ struct DppSelector
        return DppInstr::dpp8_f16_8x32x2;
    }

+    template <>
+    static constexpr auto GetDpp<half_t, 8, 16>()
+    {
+        return DppInstr::dpp8_f16_8x16x2;
+    }
+
    template <>
    static constexpr auto GetDpp<half_t, 16, 16>()
    {
@@ -155,6 +347,36 @@ struct DppSelector
        return DppInstr::dpp8_f16_32x8x2;
    }

+    template <>
+    static constexpr auto GetDpp<half_t, 1, 32>()
+    {
+        return DppInstr::dpp8_f16_1x32x2;
+    }
+
+    template <>
+    static constexpr auto GetDpp<half_t, 2, 32>()
+    {
+        return DppInstr::dpp8_f16_2x32x2;
+    }
+
+    template <>
+    static constexpr auto GetDpp<half_t, 2, 16>()
+    {
+        return DppInstr::dpp8_f16_2x16x2;
+    }
+
+    template <>
+    static constexpr auto GetDpp<half_t, 4, 16>()
+    {
+        return DppInstr::dpp8_f16_4x16x2;
+    }
+
+    template <>
+    static constexpr auto GetDpp<half_t, 4, 32>()
+    {
+        return DppInstr::dpp8_f16_4x32x2;
+    }
+
    static constexpr auto selected_dpp = dpp_type<GetDpp<BaseType, MPerDpp, NPerDpp>()>{};

    __host__ __device__ constexpr DppSelector()
@@ -191,7 +413,6 @@ struct DppSelector
        // in the future when the implementation is more generalized.
        static_assert(selected_dpp.share_a);
        static_assert(selected_dpp.n_per_thread == 1);
-        static_assert(selected_dpp.m_per_thread == selected_dpp.lanegroup_size);
        static_assert(selected_dpp.m_per_lanegroup == selected_dpp.m_per_thread);
        static_assert(selected_dpp.n_per_lanegroup ==
                      selected_dpp.n_per_thread * selected_dpp.lanegroup_size);
@@ -215,11 +436,6 @@ struct DppGemm

    __host__ __device__ constexpr DppGemm()
    {
-        static_assert(MPerDpp == 8 || MPerDpp == 16 || MPerDpp == 32,
-                      "MPerDpp must be either 8, 16 or 32.");
-        static_assert(NPerDpp == 8 || NPerDpp == 16 || NPerDpp == 32,
-                      "NPerDpp must be either 8, 16 or 32.");
-
        static_assert(KPack % dpp_instr.k_per_dpp == 0, "KPack must be divisible by k_per_dpp.");
    }


--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -23,12 +23,17 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

+void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
@@ -38,12 +43,17 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

+void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
@@ -53,12 +63,17 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

+void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
@@ -68,12 +83,17 @@ void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

+void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
@@ -375,6 +395,7 @@ struct DeviceOperationInstanceFactory<
                add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
            }
@@ -386,6 +407,7 @@ struct DeviceOperationInstanceFactory<
                add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
@@ -398,6 +420,7 @@ struct DeviceOperationInstanceFactory<
                add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
            }
@@ -409,6 +432,7 @@ struct DeviceOperationInstanceFactory<
                add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
            }

--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -32,9 +32,13 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp)
     list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp)
     list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp)
     list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp)
     list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp)
     list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp)
+     list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp)
   endif()
   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp)
   list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp)

--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp
@@ -36,11 +36,21 @@ using device_gemm_dpp_f16_f16_f16_km_kn_mn_instances = std::tuple<
    // ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   4,   4,   16,   16,       2,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   4,   4,   32,    8,       2,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,    64,    64,   4,   4,   16,   16,       1,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,    64,    64,   4,   4,   32,    8,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    64,    64,   4,   4,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    32,    32,   4,   4,   32,    8,       1,       1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,    64,   4,   4,   32,    8,       2,       4,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,    32,   4,   4,   32,    8,       1,       4,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    16,    16,    16,   4,   4,   16,   16,       1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,     1,   128,    64,   4,   4,    1,   32,       1,       1,      S<4, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    32,    64,   4,   4,    8,   32,       1,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    16,    64,   4,   4,    8,   16,       1,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     4,    32,    64,   4,   4,    2,   32,       1,       1,      S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     2,    64,    64,   4,   4,    2,   32,       1,       1,      S<4, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     2,    32,    64,   4,   4,    1,   32,       1,       1,      S<4, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     1,    64,    64,   4,   4,    1,   32,       1,       1,      S<4, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    16,    64,   4,   4,    4,   16,       2,       1,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    16,    32,   4,   4,    8,   16,       1,       1,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     4,    32,    32,   4,   4,    4,   32,       1,       1,      S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     2,    16,    32,   4,   4,    2,   16,       1,       1,      S<4, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>
    >;
 // clang-format on


--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+// clang-format off
+using device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances = std::tuple<
+    // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MDpp|    NDpp|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+    // ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    |  Dpp|  Dpp| PerWave| PerWave|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+    // ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+    // ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   256,   128,   128,    64,   4,   4,   16,   16,       2,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   256,    64,    64,    64,   4,   4,   16,   16,       1,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    64,    64,    64,   4,   4,   32,    8,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    32,    32,    64,   4,   4,   32,    8,       1,       1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,     4,    32,    64,   4,   4,    1,   32,       1,       1,      S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    64,    64,    32,   4,   4,   32,    8,       2,       4,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    16,    32,    64,   4,   4,    8,   32,       1,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    16,    16,    64,   4,   4,    8,   16,       1,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,     2,    32,    64,   4,   4,    1,   32,       1,       1,      S<4, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,    32,    16,    64,   4,   4,   16,   16,       2,       1,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,     8,    16,    64,   4,   4,    4,   16,       2,       1,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,     1,    32,    64,   4,   4,    1,   32,       1,       1,      S<4, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>
+    >;
+// clang-format on
+
+void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp
@@ -36,11 +36,21 @@ using device_gemm_dpp_f16_f16_f16_km_nk_mn_instances = std::tuple<
    // ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   4,   8,   16,   16,       2,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   4,   8,   32,    8,       2,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,    64,    64,   4,   8,   16,   16,       1,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,    64,    64,   4,   8,   32,    8,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    64,    64,   4,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    32,    32,   4,   8,   32,    8,       1,       1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,    64,   4,   8,   32,    8,       2,       4,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,    32,   4,   8,   32,    8,       1,       4,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    16,    16,    16,   4,   8,   16,   16,       1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,     1,   128,    64,   4,   8,    1,   32,       1,       1,      S<4, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    32,    64,   4,   8,    8,   32,       1,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    16,    64,   4,   8,    8,   16,       1,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     4,    32,    64,   4,   8,    2,   32,       1,       1,      S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     2,    64,    64,   4,   8,    2,   32,       1,       1,      S<4, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     2,    32,    64,   4,   8,    1,   32,       1,       1,      S<4, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     1,    64,    64,   4,   8,    1,   32,       1,       1,      S<4, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    16,    64,   4,   8,    4,   16,       2,       1,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    16,    32,   4,   8,    8,   16,       1,       1,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     4,    32,    32,   4,   8,    4,   32,       1,       1,      S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     2,    16,    32,   4,   8,    2,   16,       1,       1,      S<4, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>
    >;
 // clang-format on


--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+// clang-format off
+using device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances = std::tuple<
+    // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MDpp|    NDpp|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+    // ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    |  Dpp|  Dpp| PerWave| PerWave|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+    // ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+    // ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   256,   128,   128,    64,   4,   8,   16,   16,       2,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    64,    64,    64,   4,   8,   32,    8,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    32,    64,    64,   4,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    32,    32,    64,   4,   8,   32,    8,       1,       1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,     4,    32,    64,   4,   8,    1,   32,       1,       1,      S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    64,    64,    32,   4,   8,   32,    8,       2,       4,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    16,    32,    64,   4,   8,    8,   32,       1,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    16,    16,    64,   4,   8,    8,   16,       1,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,     2,    32,    64,   4,   8,    1,   32,       1,       1,      S<4, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,    32,    16,    64,   4,   8,   16,   16,       2,       1,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,     8,    16,    64,   4,   8,    4,   16,       2,       1,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,     1,    32,    64,   4,   8,    1,   32,       1,       1,      S<4, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>
+    >;
+// clang-format on
+
+void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -35,11 +35,21 @@ using device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances = std::tuple<
    // ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   8,   4,   16,   16,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   8,   4,   32,    8,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,    64,    64,   8,   4,   16,   16,       1,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,    64,    64,   8,   4,   32,    8,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    64,    64,   8,   4,   16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    32,    32,   8,   4,   32,    8,       1,       1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,    64,   8,   4,   32,    8,       2,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,    32,   8,   4,   32,    8,       1,       4,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    16,    16,    16,   8,   4,   16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,     1,   128,    64,   8,   4,    1,   32,       1,       1,      S<4, 1, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    32,    64,   8,   4,    8,   32,       1,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    16,    64,   8,   4,    8,   16,       1,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     4,    32,    64,   8,   4,    2,   32,       1,       1,      S<4, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     2,    64,    64,   8,   4,    2,   32,       1,       1,      S<4, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     2,    32,    64,   8,   4,    1,   32,       1,       1,      S<4, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     1,    64,    64,   8,   4,    1,   32,       1,       1,      S<4, 1, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    16,    64,   8,   4,    4,   16,       2,       1,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    16,    32,   8,   4,    8,   16,       1,       1,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     4,    32,    32,   8,   4,    4,   32,       1,       1,      S<4, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     2,    16,    32,   8,   4,    2,   16,       1,       1,      S<4, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>
    >;
 // clang-format on


--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+// clang-format off
+using device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances = std::tuple<
+    // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MDpp|    NDpp|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+    // ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    |  Dpp|  Dpp| PerWave| PerWave|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+    // ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+    // ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   256,   128,   128,    64,   8,   4,   16,   16,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   256,    64,    64,    64,   8,   4,   16,   16,       1,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    64,    64,    64,   8,   4,   32,    8,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    32,    32,    64,   8,   4,   32,    8,       1,       1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,     4,    32,    64,   8,   4,    1,   32,       1,       1,      S<4, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    64,    64,    32,   8,   4,   32,    8,       2,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    16,    32,    64,   8,   4,    8,   32,       1,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    16,    16,    64,   8,   4,    8,   16,       1,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,     2,    32,    64,   8,   4,    1,   32,       1,       1,      S<4, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,    32,    16,    64,   8,   4,   16,   16,       2,       1,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,     8,    16,    64,   8,   4,    4,   16,       2,       1,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,     1,    32,    64,   8,   4,    1,   32,       1,       1,      S<4, 1, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               5,               1>
+    >;
+// clang-format on
+
+void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -36,11 +36,21 @@ using device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances = std::tuple<
    // ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   8,   8,   16,   16,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    64,   8,   8,   32,    8,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,    64,    64,   8,   8,   16,   16,       1,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,    64,    64,   8,   8,   32,    8,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    64,    64,   8,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    32,    32,   8,   8,   32,    8,       1,       1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,    64,   8,   8,   32,    8,       2,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,    32,   8,   8,   32,    8,       1,       4,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
-    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    16,    16,    16,   8,   8,   16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,     1,   128,    64,   8,   8,    1,   32,       1,       1,      S<4, 1, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    32,    64,   8,   8,    8,   32,       1,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    16,    64,   8,   8,    8,   16,       1,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     4,    32,    64,   8,   8,    2,   32,       1,       1,      S<4, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     2,    64,    64,   8,   8,    2,   32,       1,       1,      S<4, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     2,    32,    64,   8,   8,    1,   32,       1,       1,      S<4, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     1,    64,    64,   8,   8,    1,   32,       1,       1,      S<4, 1, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    16,    64,   8,   8,    4,   16,       2,       1,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     8,    16,    32,   8,   8,    8,   16,       1,       1,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     4,    32,    32,   8,   8,    4,   32,       1,       1,      S<4, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,     2,    16,    32,   8,   8,    2,   16,       1,       1,      S<4, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>
    >;
 // clang-format on


--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+// clang-format off
+using device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances = std::tuple<
+    // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MDpp|    NDpp|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+    // ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    |  Dpp|  Dpp| PerWave| PerWave|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+    // ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+    // ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   256,   128,   128,    64,   8,   8,   16,   16,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   256,    64,    64,    64,   8,   8,   16,   16,       1,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    64,    64,    64,   8,   8,   32,    8,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,    32,    32,    64,   8,   8,   32,    8,       1,       1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,   128,     4,    32,    64,   8,   8,    1,   32,       1,       1,      S<4, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    64,    64,    32,   8,   8,   32,    8,       2,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    16,    32,    64,   8,   8,    8,   32,       1,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,    16,    16,    64,   8,   8,    8,   16,       1,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    64,     2,    32,    64,   8,   8,    1,   32,       1,       1,      S<4, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,    32,    16,    64,   8,   8,   16,   16,       2,       1,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,     8,    16,    32,   8,   8,    8,   16,       1,       1,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>,
+    DeviceGemmDpp< F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      MNPadding,    32,     1,    32,    64,   8,   8,    1,   32,       1,       1,      S<4, 1, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               5,               1>
+    >;
+// clang-format on
+
+void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck