More memory-bound instances.

235903ed · Adam Osewski · 73adb83d
Commit 235903ed authored Jun 27, 2024 by Adam Osewski
9 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -17,83 +17,83 @@ namespace instance {

 #if defined(CK_USE_XDL)
 #if defined(CK_ENABLE_FP16)
-void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Col,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Col,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
-                                                  Col,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+//                                                   Col,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Col,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Col,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
@@ -108,116 +108,116 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
                                                  PassThrough,
                                                  PassThrough>>>& instances);

-void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// void add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);
 #endif

-#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)
-void add_device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F8,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// #if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)
+// void add_device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F16,
+//                                                   F8,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F8,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
-#endif
+// void add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   F8,
+//                                                   F16,
+//                                                   Empty_Tuple,
+//                                                   F16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);
+// #endif

-#if defined(CK_ENABLE_BF16)
-void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  BF16,
-                                                  BF16,
-                                                  Empty_Tuple,
-                                                  BF16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// #if defined(CK_ENABLE_BF16)
+// void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   BF16,
+//                                                   BF16,
+//                                                   Empty_Tuple,
+//                                                   BF16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Col,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  BF16,
-                                                  BF16,
-                                                  Empty_Tuple,
-                                                  BF16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
-#endif
+// void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Col,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   BF16,
+//                                                   BF16,
+//                                                   Empty_Tuple,
+//                                                   BF16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);
+// #endif

-#if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
-void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  BF16,
-                                                  I8,
-                                                  Empty_Tuple,
-                                                  BF16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
+// #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
+// void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_kn_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Row,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   BF16,
+//                                                   I8,
+//                                                   Empty_Tuple,
+//                                                   BF16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);

-void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_nk_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Col,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  BF16,
-                                                  I8,
-                                                  Empty_Tuple,
-                                                  BF16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances);
-#endif
+// void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_nk_mn_instances(
+//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+//                                                   Col,
+//                                                   Empty_Tuple,
+//                                                   Row,
+//                                                   BF16,
+//                                                   I8,
+//                                                   Empty_Tuple,
+//                                                   BF16,
+//                                                   PassThrough,
+//                                                   PassThrough,
+//                                                   PassThrough>>>& instances);
+// #endif
 #endif // CK_USE_XDL
 template <typename ALayout,
          typename BLayout,
@@ -260,30 +260,30 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
-                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
-                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
-                    op_ptrs);
-                add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
-                    op_ptrs);
+                //                 add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                //                 add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                //                 add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+                //                     op_ptrs);
+                //                 add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
+                //                     op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
-                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                //                 add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                //                 add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                              is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                //                 add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                //                 add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
            }
        }
 #endif
@@ -294,7 +294,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instances(op_ptrs);
+                //                 add_device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instances(op_ptrs);
            }
        }
        else if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
@@ -303,7 +303,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
+                //                 add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
            }
        }
 #endif
@@ -314,14 +314,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_kn_mn_instances(
-                    op_ptrs);
+                //                 add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_kn_mn_instances(
+                //                     op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_nk_mn_instances(
-                    op_ptrs);
+                //                 add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_nk_mn_instances(
+                //                     op_ptrs);
            }
        }
 #endif
@@ -332,14 +332,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instances(
-                    op_ptrs);
+                //                 add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instances(
+                //                     op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
-                add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instances(
-                    op_ptrs);
+                //                 add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instances(
+                //                     op_ptrs);
            }
        }
 #endif

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d.hpp
@@ -31,6 +31,58 @@ void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_irregu
                                                  PassThrough,
                                                  PassThrough>>>& instances);

+void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv1_pf1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv1_pf2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv2_pf1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv2_pf2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1(
    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
                                                  Row,
@@ -130,6 +182,14 @@ struct DeviceOperationInstanceFactory<
 #if defined(CK_ENABLE_FP16)
                add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_irregular_instances(
                    op_ptrs);
+                add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv1_pf1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv1_pf2_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv2_pf1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv2_pf2_instances(
+                    op_ptrs);
 #endif
            }
        }

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once

 #include <cstdlib>


--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+#include "ck/utility/loop_scheduler.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Element types used by the instance table below: F16 for A/B/E data,
+// F32 for the accumulator and CShuffle data type columns.
+using F16 = ck::half_t;
+using F32 = float;
+
+// Tensor layout tags used in the A/B/E layout columns.
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+// Shorthand for a compile-time ck::Sequence of indices (cluster lengths, access orders).
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+// Empty tuple; fills the Ds layout and Ds data type columns (no D tensors).
+using Empty_Tuple = ck::Tuple<>;
+using ck::tensor_operation::device::GemmSpecialization;
+
+// Element-wise operation used for the A, B and C operation columns.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Tuple of "memory friendly" DeviceGroupedGemmMultipleDSplitKXdlCShuffle configurations
+// for F16 A/B/E data with Row A layout, Col B layout and Row E layout, F32 accumulation,
+// no D tensors and PassThrough element-wise operations.
+//
+// Template parameters are forwarded unchanged to every entry of the tuple:
+//   GemmSpec    - value for the "GEMM Specialization" column.
+//   NumPrefetch - value for the "NumGemmK Prefetch Stage" column.
+//   Pipeline    - value for the "PipelineVersion" column.
+//   Scheduler   - value for the "LoopScheduler" column (defaults to LoopScheduler::Default).
+//
+// NOTE(review): the alias name says mk_kn_mn, but every entry uses Col for B and this
+// header is named *_mk_nk_mn_instance.hpp; the name looks like a copy-paste leftover.
+// Renaming it would also require updating every user of the alias.
+template <GemmSpecialization GemmSpec,
+          ck::index_t NumPrefetch,
+          ck::PipelineVersion Pipeline,
+          LoopScheduler Scheduler = LoopScheduler::Default>
+using device_ggemm_md_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_memory_instances = std::tuple<
+    // clang-format off
+        //#########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|     PipelineVersion| LoopScheduler|
+        //#########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                    |              |
+        //#########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                    |              |
+        //#########################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                    |              |
+        // Memory friendly
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   256,   256,    32,    64,   8,   8,   32,   32,    2,    1,   S<8, 32, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 32, 1, 8>,               4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   256,   256,    16,    64,   8,   8,   16,   16,    4,    1,   S<8, 32, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 32, 1, 8>,               2,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,   128,    32,    64,   8,   8,   32,   32,    2,    1,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,   128,    16,    64,   8,   8,   16,   16,    4,    1,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               2,  Pipeline, Scheduler>,
+        
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,    64,    32,    64,   8,   8,   32,   32,    1,    1,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,    64,    16,    64,   8,   8,   16,   16,    2,    1,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               2,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,    32,    16,    64,   8,   8,   16,   16,    1,    1,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               2,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,    64,    16,    16,   128,   8,   8,   16,   16,    1,    1,   S<16, 4, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<16, 4, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 4>,               4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,    64,    16,    16,    64,   8,   8,   16,   16,    1,    1,   S<8,  8, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8,  8, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 4>,               4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,    16,    32,    64,   8,   8,   16,   16,    1,    1,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,    16,    64,    64,   8,   8,   16,   16,    1,    2,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,    32,    64,    64,   8,   8,   32,   32,    1,    1,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               8,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,    16,   128,    64,   8,   8,   16,   16,    1,    4,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   128,    32,   128,    64,   8,   8,   32,   32,    1,    2,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 8>,               8,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   256,    16,   256,    64,   8,   8,   16,   16,    1,    4,   S<8, 16, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 16>,              4,  Pipeline, Scheduler>,
+        DeviceGroupedGemmMultipleDSplitKXdlCShuffle<  Row,     Col,  Empty_Tuple,     Row,     F16,   F16,  F32,   F32,  Empty_Tuple,   F16,    PassThrough, PassThrough, PassThrough,  GemmSpec, NumPrefetch,   256,    32,   256,    64,   8,   8,   32,   32,    1,    2,   S<8, 32, 1>,  S<1, 0, 2>, S<1, 0, 2>,              2,              8,              8,         1,  S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 16, 1, 16>,              8,  Pipeline, Scheduler>
+        >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/CMakeLists.txt
 # ONLY XDL_KERNELS
 add_instance_library(device_grouped_gemm_multiple_d_instance
   device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+   device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv1_pf1_instance.cpp
+   device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv1_pf2_instance.cpp
+   device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv2_pf1_instance.cpp
+   device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv2_pf2_instance.cpp
   device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v1.cpp
   device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v1_interwave.cpp
   device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v2.cpp

--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv1_pf1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv1_pf1_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_instance.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Element type, layout and tuple shorthands used in the signature below.
+using F16         = ck::half_t;
+using Row         = ck::tensor_layout::gemm::RowMajor;
+using Col         = ck::tensor_layout::gemm::ColumnMajor;
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
+// Fixed configuration of this translation unit: the MNKPadding GEMM specialization and
+// a prefetch count of 1, both passed as template arguments to the instance list below.
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr ck::index_t NumPrefetchK = 1;
+
+// Registration entry point for the pv1/pf1 "memory" grouped GEMM split-K instances.
+//
+// Instantiates the shared instance list with GemmMNKPadding, NumPrefetchK and
+// PipelineVersion::v1 (LoopScheduler left at its default) and passes a
+// default-constructed object of that type, together with `instances`, to
+// add_device_operation_instances.
+void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv1_pf1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    // Name the concrete list type first so the call below stays on one line.
+    using InstanceList =
+        device_ggemm_md_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_memory_instances<
+            GemmMNKPadding,
+            NumPrefetchK,
+            ck::PipelineVersion::v1>;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv1_pf2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv1_pf2_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_instance.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Element type, layout and tuple shorthands used in the signature below.
+using F16         = ck::half_t;
+using Row         = ck::tensor_layout::gemm::RowMajor;
+using Col         = ck::tensor_layout::gemm::ColumnMajor;
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
+// Fixed configuration of this translation unit: the MNKPadding GEMM specialization and
+// a prefetch count of 2, both passed as template arguments to the instance list below.
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr ck::index_t NumPrefetchK = 2;
+
+// Registration entry point for the pv1/pf2 "memory" grouped GEMM split-K instances.
+//
+// Instantiates the shared instance list with GemmMNKPadding, NumPrefetchK and
+// PipelineVersion::v1 (LoopScheduler left at its default) and passes a
+// default-constructed object of that type, together with `instances`, to
+// add_device_operation_instances.
+void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv1_pf2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    // Name the concrete list type first so the call below stays on one line.
+    using InstanceList =
+        device_ggemm_md_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_memory_instances<
+            GemmMNKPadding,
+            NumPrefetchK,
+            ck::PipelineVersion::v1>;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv2_pf1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv2_pf1_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_instance.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Element type, layout and tuple shorthands used in the signature below.
+using F16         = ck::half_t;
+using Row         = ck::tensor_layout::gemm::RowMajor;
+using Col         = ck::tensor_layout::gemm::ColumnMajor;
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
+// Fixed configuration of this translation unit: the MNKPadding GEMM specialization and
+// a prefetch count of 1, both passed as template arguments to the instance list below.
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr ck::index_t NumPrefetchK = 1;
+
+// Registration entry point for the pv2/pf1 "memory" grouped GEMM split-K instances.
+//
+// Instantiates the shared instance list with GemmMNKPadding, NumPrefetchK and
+// PipelineVersion::v2 (LoopScheduler left at its default) and passes a
+// default-constructed object of that type, together with `instances`, to
+// add_device_operation_instances.
+void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv2_pf1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    // Name the concrete list type first so the call below stays on one line.
+    using InstanceList =
+        device_ggemm_md_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_memory_instances<
+            GemmMNKPadding,
+            NumPrefetchK,
+            ck::PipelineVersion::v2>;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv2_pf2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_memory_pv2_pf2_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_instance.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Element type, layout and tuple shorthands used in the signature below.
+using F16         = ck::half_t;
+using Row         = ck::tensor_layout::gemm::RowMajor;
+using Col         = ck::tensor_layout::gemm::ColumnMajor;
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
+// Fixed configuration of this translation unit: the MNKPadding GEMM specialization and
+// a prefetch count of 2, both passed as template arguments to the instance list below.
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr ck::index_t NumPrefetchK = 2;
+
+// Registration entry point for the pv2/pf2 "memory" grouped GEMM split-K instances.
+//
+// Instantiates the shared instance list with GemmMNKPadding, NumPrefetchK and
+// PipelineVersion::v2 (LoopScheduler left at its default) and passes a
+// default-constructed object of that type, together with `instances`, to
+// add_device_operation_instances.
+void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_mem_pv2_pf2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    // Name the concrete list type first so the call below stays on one line.
+    using InstanceList =
+        device_ggemm_md_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_memory_instances<
+            GemmMNKPadding,
+            NumPrefetchK,
+            ck::PipelineVersion::v2>;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck