Merge remote-tracking branch 'origin/develop' into bwroblew/direct_loads

261d3267 · Bartlomiej Wroblewski · 2d5b22fe · f2398f61 · 261d3267 · 261d3267
Commit 261d3267 authored Nov 14, 2023 by Bartlomiej Wroblewski
20 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include <memory>
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

@@ -20,866 +20,866 @@ namespace instance {
 #ifdef CK_ENABLE_BF16
 // grouped conv1d forward, GNWC/GKXC/GNWK
 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<1,
-                                                              GNWC,
-                                                              GKXC,
-                                                              Empty_Tuple,
-                                                              GNWK,
-                                                              BF16,
-                                                              BF16,
-                                                              Empty_Tuple,
-                                                              BF16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<1,
+                                                                GNWC,
+                                                                GKXC,
+                                                                Empty_Tuple,
+                                                                GNWK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<1,
-                                                              GNWC,
-                                                              GKXC,
-                                                              Empty_Tuple,
-                                                              GNWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<1,
+                                                                GNWC,
+                                                                GKXC,
+                                                                Empty_Tuple,
+                                                                GNWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<1,
-                                                              GNWC,
-                                                              GKXC,
-                                                              Empty_Tuple,
-                                                              GNWK,
-                                                              F32,
-                                                              F32,
-                                                              Empty_Tuple,
-                                                              F32,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<1,
+                                                                GNWC,
+                                                                GKXC,
+                                                                Empty_Tuple,
+                                                                GNWK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<1,
-                                                              GNWC,
-                                                              GKXC,
-                                                              Empty_Tuple,
-                                                              GNWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<1,
+                                                                GNWC,
+                                                                GKXC,
+                                                                Empty_Tuple,
+                                                                GNWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_BF16
 // grouped conv2d forward, GNHWC/GKYXC/GNHWK
 void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              BF16,
-                                                              BF16,
-                                                              Empty_Tuple,
-                                                              BF16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F32,
-                                                              F32,
-                                                              Empty_Tuple,
-                                                              F32,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 // grouped conv2d forward, NHWGC/GKYXC/NHWGK
 #ifdef CK_ENABLE_BF16
 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              BF16,
-                                                              BF16,
-                                                              Empty_Tuple,
-                                                              BF16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F32,
-                                                              F32,
-                                                              Empty_Tuple,
-                                                              F32,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              BF16,
-                                                              BF16,
-                                                              Empty_Tuple,
-                                                              BF16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              F32,
-                                                              F32,
-                                                              Empty_Tuple,
-                                                              F32,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              GNDHWC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              GNDHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                GNDHWC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                GNDHWK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              BF16,
-                                                              BF16,
-                                                              Empty_Tuple,
-                                                              BF16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP8
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_f8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              F8>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                F8>>>& instances);
 #endif

 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              F32,
-                                                              F32,
-                                                              Empty_Tuple,
-                                                              F32,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
 void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F32,
-                                                              F32,
-                                                              Empty_Tuple,
-                                                              F32,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);

 #endif

 #if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
 void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 #if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F32,
-                                                              F32,
-                                                              Empty_Tuple,
-                                                              F32,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                GNHWC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                GNHWK,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif

 template <ck::index_t NumDimSpatial,
@@ -890,7 +890,7 @@ template <ck::index_t NumDimSpatial,
          typename WeiDataType,
          typename OutDataType,
          typename ComputeType>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    NumDimSpatial,
    InLayout,
    WeiLayout,
@@ -905,19 +905,20 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    ck::tensor_operation::element_wise::PassThrough,
    ComputeType>>
 {
-    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
-                                                   InLayout,
-                                                   WeiLayout,
-                                                   Empty_Tuple,
-                                                   OutLayout,
-                                                   InDataType,
-                                                   WeiDataType,
-                                                   Empty_Tuple,
-                                                   OutDataType,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ComputeType>;
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        Empty_Tuple,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        Empty_Tuple,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ComputeType>;

    static auto GetInstances()
    {
@@ -928,28 +929,29 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
        {
 #ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float>)
+                         is_same_v<OutDataType, float> && is_same_v<ComputeType, float>)
            {
                add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
            {
                add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_BF16
            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> && is_same_v<ComputeType, bhalf_t>)
            {
                add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_INT8
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                         is_same_v<OutDataType, int8_t>)
+                         is_same_v<OutDataType, int8_t> && is_same_v<ComputeType, int8_t>)
            {
                add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances(op_ptrs);
            }
@@ -961,7 +963,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
        {
 #ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float>)
+                         is_same_v<OutDataType, float> && is_same_v<ComputeType, float>)
            {
                add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs);
            }
@@ -969,7 +971,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float>)
+                         is_same_v<OutDataType, float> && is_same_v<ComputeType, float>)
            {
                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs);
            }
@@ -977,7 +979,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
            {
                add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
@@ -989,7 +991,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
            {
                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
            }
@@ -997,7 +999,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #ifdef CK_ENABLE_BF16
            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> && is_same_v<ComputeType, bhalf_t>)
            {
                add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(op_ptrs);
            }
@@ -1005,7 +1008,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #ifdef CK_ENABLE_INT8
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                         is_same_v<OutDataType, int8_t>)
+                         is_same_v<OutDataType, int8_t> && is_same_v<ComputeType, int8_t>)
            {
                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances(op_ptrs);
                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instances(op_ptrs);
@@ -1021,7 +1024,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float>)
+                         is_same_v<OutDataType, float> && is_same_v<ComputeType, float>)
            {
                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
            }
@@ -1029,7 +1032,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float>)
+                         is_same_v<OutDataType, float> && is_same_v<ComputeType, float>)
            {
                add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
            }
@@ -1037,7 +1040,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
            {
                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
            }
@@ -1045,7 +1048,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
            {
                add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
            }
@@ -1053,14 +1056,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe

 #ifdef CK_ENABLE_BF16
            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> && is_same_v<ComputeType, bhalf_t>)
            {
                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_INT8
            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
+                              is_same_v<OutDataType, int8_t> && is_same_v<ComputeType, int8_t>)
            {
                add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instances(op_ptrs);
                add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instances(op_ptrs);
@@ -1075,14 +1079,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
        {
 #ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float>)
+                         is_same_v<OutDataType, float> && is_same_v<ComputeType, float>)
            {
                add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(op_ptrs);
                add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instances(op_ptrs);
@@ -1095,14 +1099,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_BF16
            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> && is_same_v<ComputeType, bhalf_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_INT8
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                         is_same_v<OutDataType, int8_t>)
+                         is_same_v<OutDataType, int8_t> && is_same_v<ComputeType, int8_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(op_ptrs);
                add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instances(op_ptrs);
@@ -1119,7 +1124,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
        {
 #ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
-                         is_same_v<OutDataType, float>)
+                         is_same_v<OutDataType, float> && is_same_v<ComputeType, float>)
            {
                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
            }
@@ -1148,14 +1153,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_BF16
            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> && is_same_v<ComputeType, bhalf_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_INT8
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                         is_same_v<OutDataType, int8_t>)
+                         is_same_v<OutDataType, int8_t> && is_same_v<ComputeType, int8_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(op_ptrs);
                add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances(op_ptrs);

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_ab.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_ab.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ScaleAdd    = ck::tensor_operation::element_wise::ScaleAdd;
+
+#ifdef CK_ENABLE_BF16
+// grouped conv3d forward multi AB scaleadd, NDHWGC/GKZYXC/NDHWGK
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<BF16, BF16>,
+                                                                ck::Tuple<BF16, BF16>,
+                                                                ck::Tuple<>,
+                                                                BF16,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<F16, F16>,
+                                                                ck::Tuple<F16, F16>,
+                                                                ck::Tuple<>,
+                                                                F16,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<F32, F32>,
+                                                                ck::Tuple<F32, F32>,
+                                                                ck::Tuple<>,
+                                                                F32,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<int8_t, int8_t>,
+                                                                ck::Tuple<int8_t, int8_t>,
+                                                                ck::Tuple<>,
+                                                                int8_t,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances);
+#endif
+
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename DLayouts,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename DDataTypes,
+          typename OutDataType,
+          typename ComputeType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
+    NumDimSpatial,
+    InLayout,
+    WeiLayout,
+    DLayouts,
+    OutLayout,
+    InDataType,
+    WeiDataType,
+    DDataTypes,
+    OutDataType,
+    ck::tensor_operation::element_wise::ScaleAdd,
+    ck::tensor_operation::element_wise::ScaleAdd,
+    ck::tensor_operation::element_wise::PassThrough,
+    ComputeType>>
+{
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        DLayouts,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        DDataTypes,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::ScaleAdd,
+                                        ck::tensor_operation::element_wise::ScaleAdd,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ComputeType>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
+                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, ck::Tuple<float, float>> &&
+                         is_same_v<WeiDataType, ck::Tuple<float, float>> &&
+                         is_same_v<OutDataType, float> && is_same_v<ComputeType, float>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, ck::Tuple<half_t, half_t>> &&
+                         is_same_v<WeiDataType, ck::Tuple<half_t, half_t>> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::Tuple<ck::bhalf_t, ck::bhalf_t>> &&
+                         is_same_v<WeiDataType, ck::Tuple<ck::bhalf_t, ck::bhalf_t>> &&
+                         is_same_v<OutDataType, ck::bhalf_t> && is_same_v<ComputeType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, ck::Tuple<int8_t, int8_t>> &&
+                         is_same_v<WeiDataType, ck::Tuple<int8_t, int8_t>> &&
+                         is_same_v<OutDataType, int8_t> && is_same_v<ComputeType, int8_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using PassThrough          = ck::tensor_operation::element_wise::PassThrough;
+using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
+
+#ifdef CK_ENABLE_BF16
+// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                ck::Tuple<BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                ScaleAddScaleAddRelu>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                ck::Tuple<F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                ScaleAddScaleAddRelu>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                ScaleAddScaleAddRelu>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                ck::Tuple<F32, F32>,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                ScaleAddScaleAddRelu>>>& instances);
+#endif
+
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename DLayouts,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename DDataTypes,
+          typename OutDataType,
+          typename ComputeType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
+    NumDimSpatial,
+    InLayout,
+    WeiLayout,
+    DLayouts,
+    OutLayout,
+    InDataType,
+    WeiDataType,
+    DDataTypes,
+    OutDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
+    ComputeType>>
+{
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        DLayouts,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        DDataTypes,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
+                                        ComputeType>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
+                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -120,6 +120,32 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
                                                  PassThrough,
                                                  PassThrough>>>& instances);

+void add_device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F8,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F8,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 template <typename ALayout,
          typename BLayout,
          typename ELayout,
@@ -184,6 +210,24 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
            }
        }
+        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
+                          is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instances(op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
+                          is_same_v<EDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
+            }
+        }
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp
@@ -7,7 +7,7 @@
 #include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
@@ -18,25 +18,31 @@ namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP16
 // FP16
-void add_device_normalization_rank_2_1_f16_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F16, F32, PassThrough, 2, 1>>>&);
+void add_device_normalization_fwd_rank_2_1_f16_instances(
+    std::vector<
+        std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F32, PassThrough, 2, 1>>>&);

-void add_device_normalization_rank_4_3_f16_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F16, F32, PassThrough, 4, 3>>>&);
+void add_device_normalization_fwd_rank_4_3_f16_instances(
+    std::vector<
+        std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F32, PassThrough, 4, 3>>>&);

-void add_device_normalization_rank_5_3_f16_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F16, F32, PassThrough, 5, 3>>>&);
+void add_device_normalization_fwd_rank_5_3_f16_instances(
+    std::vector<
+        std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F32, PassThrough, 5, 3>>>&);
 #endif
 #ifdef CK_ENABLE_FP32
 // FP32
-void add_device_normalization_rank_2_1_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);
+void add_device_normalization_fwd_rank_2_1_f32_instances(
+    std::vector<
+        std::unique_ptr<DeviceNormalizationFwd<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);

-void add_device_normalization_rank_4_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);
+void add_device_normalization_fwd_rank_4_3_f32_instances(
+    std::vector<
+        std::unique_ptr<DeviceNormalizationFwd<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);

-void add_device_normalization_rank_5_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);
+void add_device_normalization_fwd_rank_5_3_f32_instances(
+    std::vector<
+        std::unique_ptr<DeviceNormalizationFwd<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);
 #endif
 template <typename XDataType,
          typename GammaDataType,
@@ -45,7 +51,7 @@ template <typename XDataType,
          typename SaveMeanInvStdDataType,
          index_t Rank,
          index_t NumReduceDim>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormalization<
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormalizationFwd<
    XDataType,
    GammaDataType,
    BetaDataType,
@@ -55,14 +61,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormal
    Rank,
    NumReduceDim>>
 {
-    using DeviceOp = DeviceNormalization<XDataType,
-                                         GammaDataType,
-                                         BetaDataType,
-                                         YDataType,
-                                         SaveMeanInvStdDataType,
-                                         ck::tensor_operation::element_wise::PassThrough,
-                                         Rank,
-                                         NumReduceDim>;
+    using DeviceOp = DeviceNormalizationFwd<XDataType,
+                                            GammaDataType,
+                                            BetaDataType,
+                                            YDataType,
+                                            SaveMeanInvStdDataType,
+                                            ck::tensor_operation::element_wise::PassThrough,
+                                            Rank,
+                                            NumReduceDim>;

    static auto GetInstances()
    {
@@ -74,15 +80,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormal
        {
            if constexpr(Rank == 2 && NumReduceDim == 1)
            {
-                add_device_normalization_rank_2_1_f16_instances(op_ptrs);
+                add_device_normalization_fwd_rank_2_1_f16_instances(op_ptrs);
            }
            else if constexpr(Rank == 4 && NumReduceDim == 3)
            {
-                add_device_normalization_rank_4_3_f16_instances(op_ptrs);
+                add_device_normalization_fwd_rank_4_3_f16_instances(op_ptrs);
            }
            else if constexpr(Rank == 5 && NumReduceDim == 3)
            {
-                add_device_normalization_rank_5_3_f16_instances(op_ptrs);
+                add_device_normalization_fwd_rank_5_3_f16_instances(op_ptrs);
            }
        }
 #endif
@@ -93,15 +99,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormal
        {
            if constexpr(Rank == 2 && NumReduceDim == 1)
            {
-                add_device_normalization_rank_2_1_f32_instances(op_ptrs);
+                add_device_normalization_fwd_rank_2_1_f32_instances(op_ptrs);
            }
            else if constexpr(Rank == 4 && NumReduceDim == 3)
            {
-                add_device_normalization_rank_4_3_f32_instances(op_ptrs);
+                add_device_normalization_fwd_rank_4_3_f32_instances(op_ptrs);
            }
            else if constexpr(Rank == 5 && NumReduceDim == 3)
            {
-                add_device_normalization_rank_5_3_f32_instances(op_ptrs);
+                add_device_normalization_fwd_rank_5_3_f32_instances(op_ptrs);
            }
        }
 #endif

--- a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp
@@ -7,7 +7,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
@@ -18,16 +18,16 @@ namespace device {
 namespace instance {

 // FP16
-void add_device_normalization_rank_5_3_swish_f16_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F16, F32, Swish, 5, 3>>>&);
+void add_device_normalization_fwd_rank_5_3_swish_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F32, Swish, 5, 3>>>&);

 // FP32
-void add_device_normalization_rank_5_3_swish_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&);
+void add_device_normalization_fwd_rank_5_3_swish_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalizationFwd<F32, F32, F32, F32, F32, Swish, 5, 3>>>&);

 // [x, gamma, beta, y] = [f16, f32, f32, f16]
-void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F16, F32, Swish, 5, 3>>>&);
+void add_device_normalization_fwd_rank_5_3_swish_f16_f32_f32_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F32, F32, F16, F32, Swish, 5, 3>>>&);

 template <typename XDataType,
          typename GammaDataType,
@@ -37,23 +37,23 @@ template <typename XDataType,
          index_t Rank,
          index_t NumReduceDim>
 struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::DeviceNormalization<XDataType,
-                                                      GammaDataType,
-                                                      BetaDataType,
-                                                      YDataType,
-                                                      SaveMeanInvStdDataType,
-                                                      ck::tensor_operation::element_wise::Swish,
-                                                      Rank,
-                                                      NumReduceDim>>
+    ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
+                                                         GammaDataType,
+                                                         BetaDataType,
+                                                         YDataType,
+                                                         SaveMeanInvStdDataType,
+                                                         ck::tensor_operation::element_wise::Swish,
+                                                         Rank,
+                                                         NumReduceDim>>
 {
-    using DeviceOp = DeviceNormalization<XDataType,
-                                         GammaDataType,
-                                         BetaDataType,
-                                         YDataType,
-                                         SaveMeanInvStdDataType,
-                                         ck::tensor_operation::element_wise::Swish,
-                                         Rank,
-                                         NumReduceDim>;
+    using DeviceOp = DeviceNormalizationFwd<XDataType,
+                                            GammaDataType,
+                                            BetaDataType,
+                                            YDataType,
+                                            SaveMeanInvStdDataType,
+                                            ck::tensor_operation::element_wise::Swish,
+                                            Rank,
+                                            NumReduceDim>;

    static auto GetInstances()
    {
@@ -65,7 +65,7 @@ struct DeviceOperationInstanceFactory<
        {
            if constexpr(Rank == 5 && NumReduceDim == 3)
            {
-                add_device_normalization_rank_5_3_swish_f16_instances(op_ptrs);
+                add_device_normalization_fwd_rank_5_3_swish_f16_instances(op_ptrs);
            }
        }
        else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
@@ -74,7 +74,7 @@ struct DeviceOperationInstanceFactory<
        {
            if constexpr(Rank == 5 && NumReduceDim == 3)
            {
-                add_device_normalization_rank_5_3_swish_f32_instances(op_ptrs);
+                add_device_normalization_fwd_rank_5_3_swish_f32_instances(op_ptrs);
            }
        }
        else if constexpr(is_same_v<XDataType, F16> && is_same_v<GammaDataType, F32> &&
@@ -83,7 +83,7 @@ struct DeviceOperationInstanceFactory<
        {
            if constexpr(Rank == 5 && NumReduceDim == 3)
            {
-                add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(op_ptrs);
+                add_device_normalization_fwd_rank_5_3_swish_f16_f32_f32_f16_instances(op_ptrs);
            }
        }


--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
@@ -7,7 +7,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
@@ -20,96 +20,96 @@ namespace instance {
 // grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
    std::vector<
-        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      NHWGC,
-                                                      GKYXC,
-                                                      GK_GK_Tuple,
-                                                      NHWGK,
-                                                      int8_t,
-                                                      int8_t,
-                                                      I32_F32_Tuple,
-                                                      int8_t,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      Add_Activation_Mul2_Clamp<PassThrough>>>>&
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        I32_F32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Add_Activation_Mul2_Clamp<PassThrough>>>>&
        instances);

 void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              I32_F32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Add_Activation_Mul2_Clamp<Relu>>>>&
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                GK_GK_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                I32_F32_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Add_Activation_Mul2_Clamp<Relu>>>>&
        instances);

 void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
    std::vector<
-        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      NHWGC,
-                                                      GKYXC,
-                                                      GK_GK_Tuple,
-                                                      NHWGK,
-                                                      int8_t,
-                                                      int8_t,
-                                                      I32_F32_Tuple,
-                                                      int8_t,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      Add_Mul2_Activation_Mul_Clamp<TanH>>>>&
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        I32_F32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Add_Mul2_Activation_Mul_Clamp<TanH>>>>&
        instances);
 #endif
 void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
    std::vector<
-        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      NHWGC,
-                                                      GKYXC,
-                                                      GK_GK_Tuple,
-                                                      NHWGK,
-                                                      int8_t,
-                                                      int8_t,
-                                                      I32_F32_Tuple,
-                                                      int8_t,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      Add_Activation_Mul2_Clamp<PassThrough>>>>&
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        I32_F32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Add_Activation_Mul2_Clamp<PassThrough>>>>&
        instances);

 void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              I32_F32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Add_Activation_Mul2_Clamp<Relu>>>>&
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                GK_GK_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                I32_F32_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Add_Activation_Mul2_Clamp<Relu>>>>&
        instances);

 void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
    std::vector<
-        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      NHWGC,
-                                                      GKYXC,
-                                                      GK_GK_Tuple,
-                                                      NHWGK,
-                                                      int8_t,
-                                                      int8_t,
-                                                      I32_F32_Tuple,
-                                                      int8_t,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      Add_Mul2_Activation_Mul_Clamp<TanH>>>>&
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        I32_F32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Add_Mul2_Activation_Mul_Clamp<TanH>>>>&
        instances);

 // piecewise activation function
@@ -123,7 +123,7 @@ template <ck::index_t NumDimSpatial,
          typename DsDataType,
          typename OutDataType,
          typename Activation>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    NumDimSpatial,
    InLayout,
    WeiLayout,
@@ -137,18 +137,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    ck::tensor_operation::element_wise::PassThrough,
    Add_Activation_Mul2_Clamp<Activation>>>
 {
-    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
-                                                   InLayout,
-                                                   WeiLayout,
-                                                   DsLayout,
-                                                   OutLayout,
-                                                   InDataType,
-                                                   WeiDataType,
-                                                   DsDataType,
-                                                   OutDataType,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   Add_Activation_Mul2_Clamp<Activation>>;
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        DsLayout,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        DsDataType,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        Add_Activation_Mul2_Clamp<Activation>>;

    static auto GetInstances()
    {
@@ -193,7 +194,7 @@ template <ck::index_t NumDimSpatial,
          typename DsDataType,
          typename OutDataType,
          typename Activation>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    NumDimSpatial,
    InLayout,
    WeiLayout,
@@ -207,18 +208,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    ck::tensor_operation::element_wise::PassThrough,
    Add_Mul2_Activation_Mul_Clamp<Activation>>>
 {
-    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
-                                                   InLayout,
-                                                   WeiLayout,
-                                                   DsLayout,
-                                                   OutLayout,
-                                                   InDataType,
-                                                   WeiDataType,
-                                                   DsDataType,
-                                                   OutDataType,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   Add_Mul2_Activation_Mul_Clamp<Activation>>;
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        DsLayout,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        DsDataType,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        Add_Mul2_Activation_Mul_Clamp<Activation>>;

    static auto GetInstances()
    {

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
@@ -7,7 +7,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
@@ -20,94 +20,96 @@ namespace instance {
 // grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
    std::vector<
-        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      NHWGC,
-                                                      GKYXC,
-                                                      GK_Tuple,
-                                                      NHWGK,
-                                                      int8_t,
-                                                      int8_t,
-                                                      I32_Tuple,
-                                                      int8_t,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      Add_Activation_Mul_Clamp<PassThrough>>>>&
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        I32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Add_Activation_Mul_Clamp<PassThrough>>>>&
        instances);

 void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              I32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Add_Activation_Mul_Clamp<Relu>>>>&
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                GK_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                I32_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Add_Activation_Mul_Clamp<Relu>>>>&
        instances);

 void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              I32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Add_Mul_Activation_Mul_Clamp<TanH>>>>&
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        I32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Add_Mul_Activation_Mul_Clamp<TanH>>>>&
        instances);
 #endif
 void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
    std::vector<
-        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      NHWGC,
-                                                      GKYXC,
-                                                      GK_Tuple,
-                                                      NHWGK,
-                                                      int8_t,
-                                                      int8_t,
-                                                      I32_Tuple,
-                                                      int8_t,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      Add_Activation_Mul_Clamp<PassThrough>>>>&
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        I32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Add_Activation_Mul_Clamp<PassThrough>>>>&
        instances);

 void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              I32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Add_Activation_Mul_Clamp<Relu>>>>&
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                GK_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                I32_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Add_Activation_Mul_Clamp<Relu>>>>&
        instances);

 void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              I32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Add_Mul_Activation_Mul_Clamp<TanH>>>>&
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        I32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Add_Mul_Activation_Mul_Clamp<TanH>>>>&
        instances);

 // piecewise activation function
@@ -121,7 +123,7 @@ template <ck::index_t NumDimSpatial,
          typename DsDataType,
          typename OutDataType,
          typename Activation>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    NumDimSpatial,
    InLayout,
    WeiLayout,
@@ -135,18 +137,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    ck::tensor_operation::element_wise::PassThrough,
    Add_Activation_Mul_Clamp<Activation>>>
 {
-    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
-                                                   InLayout,
-                                                   WeiLayout,
-                                                   DsLayout,
-                                                   OutLayout,
-                                                   InDataType,
-                                                   WeiDataType,
-                                                   DsDataType,
-                                                   OutDataType,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   Add_Activation_Mul_Clamp<Activation>>;
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        DsLayout,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        DsDataType,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        Add_Activation_Mul_Clamp<Activation>>;

    static auto GetInstances()
    {
@@ -191,7 +194,7 @@ template <ck::index_t NumDimSpatial,
          typename DsDataType,
          typename OutDataType,
          typename Activation>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    NumDimSpatial,
    InLayout,
    WeiLayout,
@@ -205,18 +208,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    ck::tensor_operation::element_wise::PassThrough,
    Add_Mul_Activation_Mul_Clamp<Activation>>>
 {
-    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
-                                                   InLayout,
-                                                   WeiLayout,
-                                                   DsLayout,
-                                                   OutLayout,
-                                                   InDataType,
-                                                   WeiDataType,
-                                                   DsDataType,
-                                                   OutDataType,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   Add_Mul_Activation_Mul_Clamp<Activation>>;
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        DsLayout,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        DsDataType,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        Add_Mul_Activation_Mul_Clamp<Activation>>;

    static auto GetInstances()
    {

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
@@ -7,7 +7,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
@@ -19,63 +19,65 @@ namespace instance {
 #ifdef DL_KERNELS
 // grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              F32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Activation_Mul2_Clamp<PassThrough>>>>&
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        F32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Activation_Mul2_Clamp<PassThrough>>>>&
        instances);

 void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              F32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Activation_Mul2_Clamp<Relu>>>>&
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                GK_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                F32_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Activation_Mul2_Clamp<Relu>>>>&
        instances);
 #endif
 void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              F32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Activation_Mul2_Clamp<PassThrough>>>>&
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        GK_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        F32_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Activation_Mul2_Clamp<PassThrough>>>>&
        instances);

 void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              GK_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              F32_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Activation_Mul2_Clamp<Relu>>>>&
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                GK_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                F32_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Activation_Mul2_Clamp<Relu>>>>&
        instances);

 template <ck::index_t NumDimSpatial,
@@ -88,7 +90,7 @@ template <ck::index_t NumDimSpatial,
          typename DsDataType,
          typename OutDataType,
          typename Activation>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    NumDimSpatial,
    InLayout,
    WeiLayout,
@@ -102,18 +104,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    ck::tensor_operation::element_wise::PassThrough,
    Activation_Mul2_Clamp<Activation>>>
 {
-    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
-                                                   InLayout,
-                                                   WeiLayout,
-                                                   GK_Tuple,
-                                                   OutLayout,
-                                                   InDataType,
-                                                   WeiDataType,
-                                                   F32_Tuple,
-                                                   OutDataType,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   Activation_Mul2_Clamp<Activation>>;
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        GK_Tuple,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        F32_Tuple,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        Activation_Mul2_Clamp<Activation>>;

    static auto GetInstances()
    {

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
@@ -7,7 +7,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
@@ -19,63 +19,65 @@ namespace instance {
 #ifdef DL_KERNELS
 // grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Activation_Mul_Clamp<PassThrough>>>>&
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Empty_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Activation_Mul_Clamp<PassThrough>>>>&
        instances);

 void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Activation_Mul_Clamp<Relu>>>>&
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Activation_Mul_Clamp<Relu>>>>&
        instances);
 #endif
 void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Activation_Mul_Clamp<PassThrough>>>>&
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Empty_Tuple,
+                                                        NHWGK,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        Activation_Mul_Clamp<PassThrough>>>>&
        instances);

 void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              Activation_Mul_Clamp<Relu>>>>&
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                Activation_Mul_Clamp<Relu>>>>&
        instances);

 template <ck::index_t NumDimSpatial,
@@ -86,7 +88,7 @@ template <ck::index_t NumDimSpatial,
          typename WeiDataType,
          typename OutDataType,
          typename Activation>
-struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    NumDimSpatial,
    InLayout,
    WeiLayout,
@@ -100,18 +102,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    ck::tensor_operation::element_wise::PassThrough,
    Activation_Mul_Clamp<Activation>>>
 {
-    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
-                                                   InLayout,
-                                                   WeiLayout,
-                                                   Empty_Tuple,
-                                                   OutLayout,
-                                                   InDataType,
-                                                   WeiDataType,
-                                                   Empty_Tuple,
-                                                   OutDataType,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   Activation_Mul_Clamp<Activation>>;
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        Empty_Tuple,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        Empty_Tuple,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        Activation_Mul_Clamp<Activation>>;

    static auto GetInstances()
    {

--- a/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using device_transpose_f16_instances = std::tuple<
+    // FOR 16, 32, 16, 32, 16
+    // clang-format off
+    DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 1, 1, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 4, 4, ck::Sequence<1>, ck::Sequence<1>>
+    // clang-format on
+    >;
+
+using device_transpose_f32_instances = std::tuple<
+    // for 16, 8, 16, 32, 8 -> test with instances for fp16
+    // clang-format off
+    DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 4, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 8, ck::Sequence<1>, ck::Sequence<1>>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+void add_device_transpose_f16_instances(
+    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 5>>>&
+        instances);
+
+void add_device_transpose_f32_instances(
+    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 5>>>&
+        instances);
+
+template <typename InDataTypeTuple,
+          typename OutDataTypeTuple,
+          typename ElementwiseOperation,
+          index_t NumDim>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::
+        DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>
+{
+    using DeviceOp =
+        DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
+                     is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
+        {
+            add_device_transpose_f32_instances(op_ptrs);
+        }
+        else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
+                          is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
+        {
+            add_device_transpose_f16_instances(op_ptrs);
+        }
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
 set(DEVICE_CONTRACTION_BILINEAR_INSTANCES)
-#float
+
+# FP32
 list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
-                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
-                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
-                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp)
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp)
+
+list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp)
+
+list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp)

-#double
+# FP64
 list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
-                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
-                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
-                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp)
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp)
+
+list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp)
+
+# FP16
+list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp)
+
+# BF16
+list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
+                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp)

 add_instance_library(device_contraction_bilinear_instance ${DEVICE_CONTRACTION_BILINEAR_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// k/k/n/n are the fast changing dimension for A/B/D/E
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance =
+    device_contraction_kk_instance<BF16,
+                                   BF16,
+                                   F32,
+                                   BF16,
+                                   BF16_Tuple,
+                                   BF16,
+                                   F32,
+                                   PassThrough,
+                                   PassThrough,
+                                   Bilinear>;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// k/n/n/n are the fast changing dimension for A/B/D/E
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance =
+    device_contraction_kn_instance<BF16,
+                                   BF16,
+                                   F32,
+                                   BF16,
+                                   BF16_Tuple,
+                                   BF16,
+                                   F32,
+                                   PassThrough,
+                                   PassThrough,
+                                   Bilinear>;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// m/k/n/n are the fast changing dimension for A/B/D/E
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance =
+    device_contraction_mk_instance<BF16,
+                                   BF16,
+                                   F32,
+                                   BF16,
+                                   BF16_Tuple,
+                                   BF16,
+                                   F32,
+                                   PassThrough,
+                                   PassThrough,
+                                   Bilinear>;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// m/n/n/n are the fast changing dimension for A/B/D/E
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance =
+    device_contraction_mn_instance<BF16,
+                                   BF16,
+                                   F32,
+                                   BF16,
+                                   BF16_Tuple,
+                                   BF16,
+                                   F32,
+                                   PassThrough,
+                                   PassThrough,
+                                   Bilinear>;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// k/k/n/n are the fast changing dimension for A/B/D/E
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance =
+    device_contraction_kk_instance<F16,
+                                   F16,
+                                   F32,
+                                   F16,
+                                   F16_Tuple,
+                                   F16,
+                                   F32,
+                                   PassThrough,
+                                   PassThrough,
+                                   Bilinear>;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// k/n/n/n are the fast changing dimension for A/B/D/E
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance =
+    device_contraction_kn_instance<F16,
+                                   F16,
+                                   F32,
+                                   F16,
+                                   F16_Tuple,
+                                   F16,
+                                   F32,
+                                   PassThrough,
+                                   PassThrough,
+                                   Bilinear>;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+// m/k/n/n are the fast changing dimension for A/B/D/E
+using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance =
+    device_contraction_mk_instance<F16,
+                                   F16,
+                                   F32,
+                                   F16,
+                                   F16_Tuple,
+                                   F16,
+                                   F32,
+                                   PassThrough,
+                                   PassThrough,
+                                   Bilinear>;
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck