Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel...

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel into barkocot/fix-Filter1x1Pad0-check

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel...
Merge branch 'develop' of github.com:ROCmSoftwarePlatform/composable_kernel into barkocot/fix-Filter1x1Pad0-check
5641b889 · Bartlomiej Kocot · 4ecef37f · f2398f61 · 5641b889 · 5641b889
Commit 5641b889 authored Nov 14, 2023 by Bartlomiej Kocot
8 changed files
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
@@ -4,7 +4,7 @@
 #pragma once
 #include "conv2d_quantization_common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -26,19 +26,19 @@ using device_grouped_conv2d_xdl_int8_instances =
        //########################################|    Spatial|  Layout|  Layout|   Layout|  Layout|   Type|   Type|    Type| DataType|   DataType|   Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl|    ScalarPerVector|
        //########################################|           |        |        |         |        |       |       |        |         |           |       |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|      _NWaveNPerXdl|
        //########################################|           |        |        |         |        |       |       |        |         |           |       |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                   |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>
    >;
 // clang-format on

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
@@ -8,18 +8,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              F32_Tuple,
+                                                                F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Mul2_Clamp>>>& instances)
+                                                                Mul2_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
 }
 void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              GK_Tuple,
+                                                                GK_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              F32_Tuple,
+                                                                F32_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Relu_Mul2_Clamp>>>& instances)
+                                                                Relu_Mul2_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
@@ -8,18 +8,18 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Mul_Clamp>>>& instances)
+                                                                Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
 }
 void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                              NHWGC,
+                                                                NHWGC,
-                                                              GKYXC,
+                                                                GKYXC,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              NHWGK,
+                                                                NHWGK,
-                                                              int8_t,
+                                                                int8_t,
-                                                              int8_t,
+                                                                int8_t,
-                                                              Empty_Tuple,
+                                                                Empty_Tuple,
-                                                              int8_t,
+                                                                int8_t,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              PassThrough,
+                                                                PassThrough,
-                                                              Relu_Mul_Clamp>>>& instances)
+                                                                Relu_Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
                                   device_grouped_conv2d_xdl_int8_instances<NHWGC,

--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
@@ -76,7 +76,7 @@ int profile_gemm_impl(int do_verification,
        break;
    default:
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 0.1});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.05, 0.05});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.01, 0.01});
    }
    using AElementOp = ck::tensor_operation::element_wise::PassThrough;

--- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
@@ -198,18 +198,18 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        }
    };
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
-                                                                                 InLayout,
+                                                                                   InLayout,
-                                                                                 WeiLayout,
+                                                                                   WeiLayout,
-                                                                                 ck::Tuple<>,
+                                                                                   ck::Tuple<>,
-                                                                                 OutLayout,
+                                                                                   OutLayout,
-                                                                                 InDataType,
+                                                                                   InDataType,
-                                                                                 WeiDataType,
+                                                                                   WeiDataType,
-                                                                                 ck::Tuple<>,
+                                                                                   ck::Tuple<>,
-                                                                                 OutDataType,
+                                                                                   OutDataType,
-                                                                                 InElementOp,
+                                                                                   InElementOp,
-                                                                                 WeiElementOp,
+                                                                                   WeiElementOp,
-                                                                                 OutElementOp>;
+                                                                                   OutElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<

--- a/test/grouped_convnd_fwd/CMakeLists.txt
+++ b/test/grouped_convnd_fwd/CMakeLists.txt
 add_gtest_executable(test_grouped_convnd_fwd test_grouped_convnd_fwd.cpp)
 target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
+add_gtest_executable(test_grouped_convnd_fwd_multi_ab_interface test_grouped_convnd_fwd_multi_ab_interface.cpp)
+target_link_libraries(test_grouped_convnd_fwd_multi_ab_interface PRIVATE utility)
+add_gtest_executable(test_grouped_convnd_fwd_multi_d_interface_compatibility test_grouped_convnd_fwd_multi_d_interface_compatibility.cpp)
+target_link_libraries(test_grouped_convnd_fwd_multi_d_interface_compatibility PRIVATE utility device_grouped_conv3d_fwd_instance)
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_ab_interface.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_ab_interface.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include <gtest/gtest.h>
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using ScaleAdd    = ck::tensor_operation::element_wise::ScaleAdd;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+template <typename DataType,
+          typename InDataTypes,
+          typename WeiDataTypes,
+          typename InElementOp,
+          typename WeiElementOp>
+class TestGroupedConvndFwdMultiABInterfaceBase : public ::testing::Test
+{
+    protected:
+    static constexpr ck::index_t NDimSpatial = 3;
+    static constexpr ck::index_t NumAs       = 2;
+    static constexpr ck::index_t NumBs       = 2;
+    static constexpr auto ConvSpec =
+        ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+    using InLayout                 = ck::tensor_layout::convolution::GNDHWC;
+    using WeiLayout                = ck::tensor_layout::convolution::GKZYXC;
+    using OutLayout                = ck::tensor_layout::convolution::GNDHWK;
+    using OutElementOp             = PassThrough;
+    using DeviceGroupedConvNDMultiABFwdInstance =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+            NDimSpatial,
+            InLayout,
+            WeiLayout,
+            ck::Tuple<>,
+            OutLayout,
+            InDataTypes,
+            WeiDataTypes,
+            DataType,
+            DataType,
+            ck::Tuple<>,
+            DataType,
+            InElementOp,
+            WeiElementOp,
+            OutElementOp,
+            ConvSpec,    // ConvForwardSpecialization
+            GemmSpec,    // GemmSpecialization
+            1,           //
+            256,         // BlockSize
+            128,         // MPerBlock
+            256,         // NPerBlock
+            32,          // KPerBlock
+            8,           // AK1
+            8,           // BK1
+            32,          // MPerXdl
+            32,          // NPerXdl
+            2,           // MXdlPerWave
+            4,           // NXdlPerWave
+            S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+            S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+            S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+            2,           // ABlockTransferSrcVectorDim
+            8,           // ABlockTransferSrcScalarPerVector
+            8,           // ABlockTransferDstScalarPerVector_AK1
+            1,           // ABlockLdsExtraM
+            S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+            S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+            S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+            2,           // BBlockTransferSrcVectorDim
+            8,           // BBlockTransferSrcScalarPerVector
+            8,           // BBlockTransferDstScalarPerVector_BK1
+            1,           // BBlockLdsExtraN
+            1,
+            1,
+            S<1, 32, 1, 8>,
+            8>;
+    const ck::utils::conv::ConvParam conv_param{
+        3, 1, 16, 16, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+    void SetUp() override
+    {
+        if(!ck::is_xdl_supported())
+        {
+            GTEST_SKIP();
+        }
+    }
+    template <typename ADataType, typename BDataType>
+    bool Run(ADataType as, BDataType bs)
+    {
+        const auto in_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+        const auto wei_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                conv_param);
+        const auto out_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                conv_param);
+        std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+        std::array<ck::index_t, NDimSpatial> input_left_pads{};
+        std::array<ck::index_t, NDimSpatial> input_right_pads{};
+        auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+        copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+        copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+        copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+        copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+        copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+        copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+        copy(conv_param.conv_filter_strides_, conv_filter_strides);
+        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+        copy(conv_param.input_left_pads_, input_left_pads);
+        copy(conv_param.input_right_pads_, input_right_pads);
+        std::array<const void*, 0> ds{};
+        // do Conv
+        auto conv     = DeviceGroupedConvNDMultiABFwdInstance{};
+        auto invoker  = conv.MakeInvoker();
+        auto argument = conv.MakeArgument(as,
+                                          bs,
+                                          ds,
+                                          nullptr,
+                                          a_g_n_c_wis_lengths,
+                                          a_g_n_c_wis_strides,
+                                          b_g_k_c_xs_lengths,
+                                          b_g_k_c_xs_strides,
+                                          {},
+                                          {},
+                                          e_g_n_k_wos_lengths,
+                                          e_g_n_k_wos_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads,
+                                          InElementOp{},
+                                          WeiElementOp{},
+                                          OutElementOp{});
+        return conv.IsSupportedArgument(argument);
+    }
+};
+class TestGroupedConvndFwdMultiAInterface
+    : public TestGroupedConvndFwdMultiABInterfaceBase<float,
+                                                      ck::Tuple<float, float>,
+                                                      float,
+                                                      ScaleAdd,
+                                                      PassThrough>
+{
+};
+class TestGroupedConvndFwdMultiBInterface
+    : public TestGroupedConvndFwdMultiABInterfaceBase<float,
+                                                      float,
+                                                      ck::Tuple<float, float>,
+                                                      PassThrough,
+                                                      ScaleAdd>
+{
+};
+class TestGroupedConvndFwdMultiABInterface
+    : public TestGroupedConvndFwdMultiABInterfaceBase<float,
+                                                      ck::Tuple<float, float>,
+                                                      ck::Tuple<float, float>,
+                                                      ScaleAdd,
+                                                      ScaleAdd>
+{
+};
+class TestGroupedConvndFwdInterface
+    : public TestGroupedConvndFwdMultiABInterfaceBase<float, float, float, PassThrough, PassThrough>
+{
+};
+TEST_F(TestGroupedConvndFwdMultiAInterface, MultiA)
+{
+    std::array<const void*, NumAs> as{nullptr, nullptr};
+    const void* b = nullptr;
+    EXPECT_TRUE(this->template Run(as, b));
+}
+TEST_F(TestGroupedConvndFwdMultiBInterface, MultiB)
+{
+    const void* a = nullptr;
+    std::array<const void*, NumBs> bs{nullptr, nullptr};
+    EXPECT_TRUE(this->template Run(a, bs));
+}
+TEST_F(TestGroupedConvndFwdMultiABInterface, MultiAB)
+{
+    std::array<const void*, NumAs> as{nullptr, nullptr};
+    std::array<const void*, NumBs> bs{nullptr, nullptr};
+    EXPECT_TRUE(this->template Run(as, bs));
+}
+TEST_F(TestGroupedConvndFwdInterface, SingleAB)
+{
+    const void* a = nullptr;
+    const void* b = nullptr;
+    EXPECT_TRUE(this->template Run(a, b));
+}
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_d_interface_compatibility.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_multi_d_interface_compatibility.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
+#include <gtest/gtest.h>
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+class TestGroupedConvndFwdMultiDInterfaceCompatibility : public ::testing::Test
+{
+    protected:
+    static constexpr ck::index_t NDimSpatial = 3;
+    using InDataType  = float;
+    using WeiDataType = float;
+    using OutDataType = float;
+    using InLayout    = ck::tensor_layout::convolution::GNDHWC;
+    using WeiLayout   = ck::tensor_layout::convolution::GKZYXC;
+    using OutLayout   = ck::tensor_layout::convolution::GNDHWK;
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 ck::Tuple<>,
+                                                                                 OutLayout,
+                                                                                 InDataType,
+                                                                                 WeiDataType,
+                                                                                 ck::Tuple<>,
+                                                                                 OutDataType,
+                                                                                 PassThrough,
+                                                                                 PassThrough,
+                                                                                 PassThrough>;
+    bool Run()
+    {
+        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+        return op_ptrs.size() != 0;
+    }
+};
+TEST_F(TestGroupedConvndFwdMultiDInterfaceCompatibility, CompatibilityTest)
+{
+    EXPECT_TRUE(this->Run());
+}