Merge branch 'develop' of...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into e2e_kernellib

Merge branch 'develop' of...
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into e2e_kernellib
9fb64dae · aska-0096 · e330961d · fe96e8fb · 9fb64dae · 9fb64dae
Commit 9fb64dae authored Mar 23, 2023 by aska-0096
20 changed files
--- a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
@@ -7,7 +7,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_conv2d_bias_perchannel_quantization_int8_instances(
+void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
                                                              GNHWC,
                                                              GKYXC,
@@ -22,23 +22,26 @@ void add_device_conv2d_bias_perchannel_quantization_int8_instances(
                                                              Add_Mul2_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
-                                                                     I32_F32_Tuple,
-                                                                     Add_Mul2_Clamp,
-                                                                     ConvFwdDefault>{});
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Mul2_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            8>{});
    add_device_operation_instances(instances,
-                                   device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
-                                                                     I32_F32_Tuple,
-                                                                     Add_Mul2_Clamp,
-                                                                     ConvFwd1x1P0>{});
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Mul2_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            8>{});
    add_device_operation_instances(instances,
-                                   device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
-                                                                     I32_F32_Tuple,
-                                                                     Add_Mul2_Clamp,
-                                                                     ConvFwd1x1S1P0>{});
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Mul2_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            8>{});
 }

-void add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(
+void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
                                                              GNHWC,
                                                              GKYXC,
@@ -53,20 +56,23 @@ void add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(
                                                              Add_Relu_Mul2_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
-                                                                     I32_F32_Tuple,
-                                                                     Add_Relu_Mul2_Clamp,
-                                                                     ConvFwdDefault>{});
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Relu_Mul2_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            8>{});
    add_device_operation_instances(instances,
-                                   device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
-                                                                     I32_F32_Tuple,
-                                                                     Add_Relu_Mul2_Clamp,
-                                                                     ConvFwd1x1P0>{});
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Relu_Mul2_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            8>{});
    add_device_operation_instances(instances,
-                                   device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
-                                                                     I32_F32_Tuple,
-                                                                     Add_Relu_Mul2_Clamp,
-                                                                     ConvFwd1x1S1P0>{});
+                                   device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
+                                                                            I32_F32_Tuple,
+                                                                            Add_Relu_Mul2_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            8>{});
 }
 } // namespace instance
 } // namespace device

--- a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
@@ -7,7 +7,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_conv2d_bias_perlayer_quantization_int8_instances(
+void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
                                                              GNHWC,
                                                              GKYXC,
@@ -21,18 +21,27 @@ void add_device_conv2d_bias_perlayer_quantization_int8_instances(
                                                              PassThrough,
                                                              Add_Mul_Clamp>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwdDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwd1x1S1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Mul_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            8>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Mul_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            8>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Mul_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            8>{});
 }

-void add_device_conv2d_bias_relu_perlayer_quantization_int8_instances(
+void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
                                                              GNHWC,
                                                              GKYXC,
@@ -47,20 +56,25 @@ void add_device_conv2d_bias_relu_perlayer_quantization_int8_instances(
                                                              Add_Relu_Mul_Clamp>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   device_conv2d_int8_32Ds_instances<GK_Tuple,
-                                                                     I32_Tuple,
-                                                                     Add_Relu_Mul_Clamp,
-                                                                     ConvFwdDefault>{});
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Relu_Mul_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            8>{});

-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, I32_Tuple, Add_Relu_Mul_Clamp, ConvFwd1x1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Relu_Mul_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            8>{});

    add_device_operation_instances(instances,
-                                   device_conv2d_int8_32Ds_instances<GK_Tuple,
-                                                                     I32_Tuple,
-                                                                     Add_Relu_Mul_Clamp,
-                                                                     ConvFwd1x1S1P0>{});
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            I32_Tuple,
+                                                                            Add_Relu_Mul_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            8>{});
 }
 } // namespace instance
 } // namespace device

--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "conv2d_quantization_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// clang-format off
+template <typename DsLayout,
+          typename DsDatatype,
+          typename OutElementOp,
+          ConvolutionForwardSpecialization ConvSpec,
+          index_t DstScalarPerVector>
+using device_grouped_conv2d_xdl_int8_instances =
+    std::tuple <
+        //########################################|     NumDim|      A|      B|       Ds|      E|  AData|  BData| AccData| CShuffle|         Ds|  EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|     CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|   Layout| Layout|   Type|   Type|    Type| DataType|   DataType|   Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl|    ScalarPerVector|
+        //########################################|           |       |       |         |       |       |       |        |         |           |       |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|      _NWaveNPerXdl|
+        //########################################|           |       |       |         |       |       |       |        |         |           |       |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                   |
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>, DstScalarPerVector>
+    >;
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
@@ -7,7 +7,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_conv2d_perchannel_quantization_int8_instances(
+void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
                                                              GNHWC,
                                                              GKYXC,
@@ -21,18 +21,27 @@ void add_device_conv2d_perchannel_quantization_int8_instances(
                                                              PassThrough,
                                                              Mul2_Clamp>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwdDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwd1x1S1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            F32_Tuple,
+                                                                            Mul2_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            8>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            F32_Tuple,
+                                                                            Mul2_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            8>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            F32_Tuple,
+                                                                            Mul2_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            8>{});
 }

-void add_device_conv2d_relu_perchannel_quantization_int8_instances(
+void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
                                                              GNHWC,
                                                              GKYXC,
@@ -46,15 +55,24 @@ void add_device_conv2d_relu_perchannel_quantization_int8_instances(
                                                              PassThrough,
                                                              Relu_Mul2_Clamp>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwdDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwd1x1S1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            F32_Tuple,
+                                                                            Relu_Mul2_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            8>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            F32_Tuple,
+                                                                            Relu_Mul2_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            8>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
+                                                                            F32_Tuple,
+                                                                            Relu_Mul2_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            8>{});
 }
 } // namespace instance
 } // namespace device

--- a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
@@ -7,7 +7,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_conv2d_perlayer_quantization_int8_instances(
+void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
                                                              GNHWC,
                                                              GKYXC,
@@ -21,18 +21,27 @@ void add_device_conv2d_perlayer_quantization_int8_instances(
                                                              PassThrough,
                                                              Mul_Clamp>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Mul_Clamp, ConvFwdDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Mul_Clamp, ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Mul_Clamp, ConvFwd1x1S1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                                                            Empty_Tuple,
+                                                                            Mul_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            16>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                                                            Empty_Tuple,
+                                                                            Mul_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            16>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                                                            Empty_Tuple,
+                                                                            Mul_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            16>{});
 }

-void add_device_conv2d_relu_perlayer_quantization_int8_instances(
+void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
                                                              GNHWC,
                                                              GKYXC,
@@ -46,15 +55,24 @@ void add_device_conv2d_relu_perlayer_quantization_int8_instances(
                                                              PassThrough,
                                                              Relu_Mul_Clamp>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Relu_Mul_Clamp, ConvFwdDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Relu_Mul_Clamp, ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Relu_Mul_Clamp, ConvFwd1x1S1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                                                            Empty_Tuple,
+                                                                            Relu_Mul_Clamp,
+                                                                            ConvFwdDefault,
+                                                                            16>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                                                            Empty_Tuple,
+                                                                            Relu_Mul_Clamp,
+                                                                            ConvFwd1x1P0,
+                                                                            16>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
+                                                                            Empty_Tuple,
+                                                                            Relu_Mul_Clamp,
+                                                                            ConvFwd1x1S1P0,
+                                                                            16>{});
 }
 } // namespace instance
 } // namespace device

--- a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_int8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_int8_instance.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using Empty_Tuple = ck::Tuple<>;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC       = ck::tensor_layout::convolution::GNHWC;
-using GKYXC       = ck::tensor_layout::convolution::GKYXC;
-using GNHWK       = ck::tensor_layout::convolution::GNHWK;
-using GK          = ck::tensor_layout::convolution::G_K;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using Relu        = ck::tensor_operation::element_wise::Relu;
-
-using GK_Tuple      = ck::Tuple<GK>;
-using GK_GK_Tuple   = ck::Tuple<GK, GK>;
-using I32_Tuple     = ck::Tuple<int32_t>;
-using F32_Tuple     = ck::Tuple<float>;
-using I32_F32_Tuple = ck::Tuple<int32_t, float>;
-
-using Mul_Clamp      = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
-using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Relu>;
-
-using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
-using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;
-
-using Mul2_Clamp      = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<PassThrough>;
-using Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<Relu>;
-
-using Add_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<PassThrough>;
-using Add_Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<Relu>;
-
-static constexpr ck::index_t NDimSpatial = 2;
-static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-template <typename DsLayout,
-          typename DsDatatype,
-          typename OutElementOp,
-          ConvolutionForwardSpecialization ConvSpec>
-// clang-format off
-using device_conv2d_int8_instances =
-    std::tuple <
-        //########################################|  NumDim|      A|      B|       Ds|      E|  AData|  BData| AccData| CShuffle|         Ds|  EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|   Layout| Layout|   Type|   Type|    Type| DataType|   DataType|   Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |         |       |       |       |        |         |           |       |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |         |       |       |       |        |         |           |       |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>,               16>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>,               16>
-    >;
-// clang-format on
-
-// for conv + multiple of 32 bit Ds. bit of Ds will affect the ScalarPerVector of C
-template <typename DsLayout,
-          typename DsDatatype,
-          typename OutElementOp,
-          ConvolutionForwardSpecialization ConvSpec>
-// clang-format off
-using device_conv2d_int8_32Ds_instances =
-    std::tuple <
-        //########################################|  NumDim|      A|      B|       Ds|      E|  AData|  BData| AccData| CShuffle|         Ds|  EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|   Layout| Layout|   Type|   Type|    Type| DataType|   DataType|   Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |         |       |       |       |        |         |           |       |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |         |       |       |       |        |         |           |       |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, DsLayout,  GNHWK, int8_t, int8_t, int32_t,  int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp,       ConvSpec,       GemmSpec,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 2>,               8>
-    >;
-// clang-format on
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gemm_quantization_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <typename OutElementOp>
+using device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances = std::tuple<
+    // clang-format off
+        //####################|      A|      B|          Ds|      E|  AData|  BData| AccData|      DsData|   EData|           A|           B|           CDE|           GEMM| Block|  MPer|  NPer|  KPer| K1|   M1Per|  N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        //####################| Layout| Layout|      Layout| Layout|   Type|   Type|    Type|        Type|    Type| Elementwise| Elementwise|   Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM| Thread| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        //####################|       |       |            |       |       |       |        |            |        |   Operation|   Operation|     Operation|               |      |      |      |      |   |        |       |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+        //####################|       |       |            |       |       |       |        |            |        |            |            |              |               |      |      |      |      |   |        |       |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        DeviceGemmMultipleD_Dl<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,   256,   128,   128,    16,  4,       4,      4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 4>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+    // clang-format on
+    >;
+
+template <typename OutElementOp>
+using device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances = std::tuple<
+    // clang-format off
+        //####################|      A|      B|          Ds|      E|  AData|  BData| AccData|      DsData|   EData|           A|           B|           CDE|           GEMM| Block|  MPer|  NPer|  KPer| K1|   M1Per|  N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        //####################| Layout| Layout|      Layout| Layout|   Type|   Type|    Type|        Type|    Type| Elementwise| Elementwise|   Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM| Thread| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        //####################|       |       |            |       |       |       |        |            |        |   Operation|   Operation|     Operation|               |      |      |      |      |   |        |       |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+        //####################|       |       |            |       |       |       |        |            |        |            |            |              |               |      |      |      |      |   |        |       |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        DeviceGemmMultipleD_Dl<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,   256,   128,   128,    16,  4,      4,       4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+    // clang-format on
+    >;
+
+template <typename OutElementOp>
+using device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //####################|      A|      B|          Ds|      E|  AData|  BData| AccData|      DsData|   EData|           A|           B|           CDE|           GEMM| Block|  MPer|  NPer|  KPer| K1|   M1Per|  N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        //####################| Layout| Layout|      Layout| Layout|   Type|   Type|    Type|        Type|    Type| Elementwise| Elementwise|   Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM| Thread| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        //####################|       |       |            |       |       |       |        |            |        |   Operation|   Operation|     Operation|               |      |      |      |      |   |        |       |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+        //####################|       |       |            |       |       |       |        |            |        |            |            |              |               |      |      |      |      |   |        |       |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        DeviceGemmMultipleD_Dl<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,   256,   128,   128,    16,  4,       4,      4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<2, 1, 4, 4>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+    // clang-format on
+    >;
+
+template <typename OutElementOp>
+using device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //####################|      A|      B|          Ds|      E|  AData|  BData| AccData|      DsData|   EData|           A|           B|           CDE|           GEMM| Block|  MPer|  NPer|  KPer| K1|   M1Per|  N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+        //####################| Layout| Layout|      Layout| Layout|   Type|   Type|    Type|        Type|    Type| Elementwise| Elementwise|   Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM| Thread| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+        //####################|       |       |            |       |       |       |        |            |        |   Operation|   Operation|     Operation|               |      |      |      |      |   |        |       |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+        //####################|       |       |            |       |       |       |        |            |        |            |            |              |               |      |      |      |      |   |        |       |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        DeviceGemmMultipleD_Dl<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,   256,   128,   128,    16,  4,       4,      4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                  4>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Layout(A, B, C) = [Col, Row, Row]
+void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
+                                                    Row,
+                                                    Empty_Tuple,
+                                                    Row,
+                                                    int8_t,
+                                                    int8_t,
+                                                    Empty_Tuple,
+                                                    int8_t,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances<Mul_Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Layout(A, B, C) = [Col, Col, Row]
+void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
+                                                    Col,
+                                                    Empty_Tuple,
+                                                    Row,
+                                                    int8_t,
+                                                    int8_t,
+                                                    Empty_Tuple,
+                                                    int8_t,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances<Mul_Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Layout(A, B, C) = [Row, Row, Row]
+void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Row,
+                                                    Empty_Tuple,
+                                                    Row,
+                                                    int8_t,
+                                                    int8_t,
+                                                    Empty_Tuple,
+                                                    int8_t,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances<Mul_Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Layout(A, B, C) = [Row, Col, Row]
+void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Col,
+                                                    Empty_Tuple,
+                                                    Row,
+                                                    int8_t,
+                                                    int8_t,
+                                                    Empty_Tuple,
+                                                    int8_t,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances<Mul_Clamp>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gemm_quantization_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <typename OutElementOp, LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
+using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|      DsData|   EData|           A|           B|           CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|     LoopScheduler|     Pipeline|
+        //##############################| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|        Type|    Type| Elementwise| Elementwise|   Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                  |             |
+        //##############################|       |       |            |       |       |       |        |         |            |        |   Operation|   Operation|     Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                  |             |
+        //##############################|       |       |            |       |       |       |        |         |            |        |            |            |              |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                  |             |
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   256,   128,    64,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   256,    64,   4,   4,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,   128,    64,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   128,    64,   4,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,    64,    64,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,    64,   128,    64,   4,   4,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,    64,    64,   4,   4,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,    64,   128,    64,   4,   4,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>
+    // clang-format on
+    >;
+
+template <typename OutElementOp, LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
+using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|      DsData|   EData|           A|           B|           CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|     LoopScheduler|     Pipeline|
+        //##############################| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|        Type|    Type| Elementwise| Elementwise|   Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                  |             |
+        //##############################|       |       |            |       |       |       |        |         |            |        |   Operation|   Operation|     Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                  |             |
+        //##############################|       |       |            |       |       |       |        |         |            |        |            |            |              |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                  |             |
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   256,   128,    64,   4,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   256,    64,   4,  16,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,   128,    64,   4,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   128,    64,   4,  16,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,    64,    64,   4,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,    64,   128,    64,   4,  16,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,    64,    64,   4,  16,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,    64,   128,    64,   4,  16,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>
+    // clang-format on
+    >;
+
+template <typename OutElementOp, LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
+using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|      DsData|   EData|           A|           B|           CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|     LoopScheduler|     Pipeline|
+        //##############################| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|        Type|    Type| Elementwise| Elementwise|   Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                  |             |
+        //##############################|       |       |            |       |       |       |        |         |            |        |   Operation|   Operation|     Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                  |             |
+        //##############################|       |       |            |       |       |       |        |         |            |        |            |            |              |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                  |             |
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   256,   128,    64,  16,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   256,    64,  16,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,   128,    64,  16,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   128,    64,  16,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,    64,    64,  16,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,    64,   128,    64,  16,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,    64,    64,  16,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,    64,   128,    64,  16,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>
+    // clang-format on
+    >;
+
+template <typename OutElementOp, LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
+using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|      DsData|   EData|           A|           B|           CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|     LoopScheduler|     Pipeline|
+        //##############################| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|        Type|    Type| Elementwise| Elementwise|   Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                  |             |
+        //##############################|       |       |            |       |       |       |        |         |            |        |   Operation|   Operation|     Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                  |             |
+        //##############################|       |       |            |       |       |       |        |         |            |        |            |            |              |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                  |             |
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>,
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row, int8_t, int8_t, int32_t,  int32_t, Empty_Tuple,  int8_t, PassThrough, PassThrough,  OutElementOp,     MNKPadding,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,              16, GemmLoopScheduler, GemmPipeline>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Layout(A, B, C) = [Col, Row, Row]
+void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
+                                                    Row,
+                                                    Empty_Tuple,
+                                                    Row,
+                                                    int8_t,
+                                                    int8_t,
+                                                    Empty_Tuple,
+                                                    int8_t,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Default,
+                                                                           PipelineVersion::v1>{});
+#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Interwave,
+                                                                           PipelineVersion::v1>{});
+#endif
+#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Default,
+                                                                           PipelineVersion::v2>{});
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Layout(A, B, C) = [Col, Col, Row]
+void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
+                                                    Col,
+                                                    Empty_Tuple,
+                                                    Row,
+                                                    int8_t,
+                                                    int8_t,
+                                                    Empty_Tuple,
+                                                    int8_t,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Default,
+                                                                           PipelineVersion::v1>{});
+#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Interwave,
+                                                                           PipelineVersion::v1>{});
+#endif
+#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Default,
+                                                                           PipelineVersion::v2>{});
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Layout(A, B, C) = [Row, Row, Row]
+void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Row,
+                                                    Empty_Tuple,
+                                                    Row,
+                                                    int8_t,
+                                                    int8_t,
+                                                    Empty_Tuple,
+                                                    int8_t,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Default,
+                                                                           PipelineVersion::v1>{});
+#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Interwave,
+                                                                           PipelineVersion::v1>{});
+#endif
+#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Default,
+                                                                           PipelineVersion::v2>{});
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Layout(A, B, C) = [Row, Col, Row]
+void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Col,
+                                                    Empty_Tuple,
+                                                    Row,
+                                                    int8_t,
+                                                    int8_t,
+                                                    Empty_Tuple,
+                                                    int8_t,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Mul_Clamp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Default,
+                                                                           PipelineVersion::v1>{});
+#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Interwave,
+                                                                           PipelineVersion::v1>{});
+#endif
+#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
+    add_device_operation_instances(
+        instances,
+        device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances<Mul_Clamp,
+                                                                           LoopScheduler::Default,
+                                                                           PipelineVersion::v2>{});
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using Empty_Tuple   = ck::Tuple<>;
+using Row_Row_Tuple = ck::Tuple<Row, Row>;
+using Col_Col_Tuple = ck::Tuple<Col, Col>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Relu        = ck::tensor_operation::element_wise::Relu;
+
+using Mul_Clamp      = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
+using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Relu>;
+
+using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
+using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;
+
+static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fastgelu.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AccDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+bool profile_grouped_gemm_fastgelu_impl(int do_verification,
+                                        int init_method,
+                                        bool do_log,
+                                        bool time_kernel,
+                                        const std::vector<int>& Ms,
+                                        const std::vector<int>& Ns,
+                                        const std::vector<int>& Ks,
+                                        const std::vector<int>& StrideAs,
+                                        const std::vector<int>& StrideBs,
+                                        const std::vector<int>& StrideCs)
+{
+
+    bool pass = true;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    std::size_t group_count = Ms.size();
+
+    if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() &&
+         group_count == StrideBs.size() && group_count == StrideCs.size()))
+    {
+        throw std::runtime_error("wrong! inconsistent M/N/Ks, StrideA/B/Cs size\n");
+    }
+
+    std::vector<Tensor<ADataType>> a_m_k;
+    std::vector<Tensor<BDataType>> b_k_n;
+    std::vector<Tensor<CDataType>> c_m_n_device_results;
+
+    for(std::size_t i = 0; i < group_count; i++)
+    {
+        a_m_k.push_back(
+            Tensor<ADataType>(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{})));
+        b_k_n.push_back(
+            Tensor<BDataType>(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{})));
+
+        c_m_n_device_results.push_back(
+            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
+
+        std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
+                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
+                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+
+        switch(init_method)
+        {
+        case 0: break;
+        case 1:
+            ck::utils::FillUniformDistributionIntegerValue<ADataType>{}(a_m_k[i]);
+            ck::utils::FillUniformDistributionIntegerValue<BDataType>{}(b_k_n[i]);
+            break;
+        default:
+            ck::utils::FillUniformDistribution<ADataType>{0.0, 1.0}(a_m_k[i]);
+            ck::utils::FillUniformDistribution<BDataType>{-0.5, 0.5}(b_k_n[i]);
+        }
+
+        ck::utils::FillConstant<CDataType>{}(c_m_n_device_results[i]);
+    }
+
+    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using CElementOp = ck::tensor_operation::element_wise::FastGelu;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
+    std::vector<DeviceMemPtr> a_device_buf, b_device_buf, c_device_buf;
+
+    a_device_buf.reserve(group_count);
+    b_device_buf.reserve(group_count);
+    c_device_buf.reserve(group_count);
+
+    std::vector<const void*> p_a, p_b;
+    std::vector<void*> p_c;
+
+    p_a.reserve(group_count);
+    p_b.reserve(group_count);
+    p_c.reserve(group_count);
+
+    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
+
+    gemm_descs.reserve(group_count);
+
+    for(std::size_t i = 0; i < group_count; i++)
+    {
+        a_device_buf.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
+        b_device_buf.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
+        c_device_buf.emplace_back(std::make_unique<DeviceMem>(
+            sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
+
+        a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
+        b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
+        c_device_buf[i]->SetZero();
+
+        gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
+
+        p_a.push_back(a_device_buf[i]->GetDeviceBuffer());
+        p_b.push_back(b_device_buf[i]->GetDeviceBuffer());
+        p_c.push_back(c_device_buf[i]->GetDeviceBuffer());
+    }
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
+                                                                     BLayout,
+                                                                     ck::Tuple<>,
+                                                                     CLayout,
+                                                                     ADataType,
+                                                                     BDataType,
+                                                                     ck::Tuple<>,
+                                                                     CDataType,
+                                                                     AElementOp,
+                                                                     BElementOp,
+                                                                     CElementOp>;
+
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    if(op_ptrs.size() <= 0)
+    {
+        throw std::runtime_error("wrong! no device GEMM instance found");
+    }
+
+    std::string best_gemm_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    auto p_ds = std::vector<std::array<const void*, 0>>{};
+
+    // profile device GEMM instances
+    for(auto& gemm_ptr : op_ptrs)
+    {
+        auto argument_ptr = gemm_ptr->MakeArgumentPointer(
+            p_a, p_b, p_ds, p_c, gemm_descs, a_element_op, b_element_op, c_element_op);
+
+        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
+        DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
+        gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
+
+        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::string gemm_name = gemm_ptr->GetTypeString();
+
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            std::size_t flop = 0, num_btype = 0;
+            for(std::size_t i = 0; i < gemm_descs.size(); i++)
+            {
+                flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
+                num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
+                             sizeof(CDataType) * Ms[i] * Ns[i];
+            }
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << gemm_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_gemm_name  = gemm_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                for(std::size_t i = 0; i < gemm_descs.size(); i++)
+                {
+
+                    c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
+                    Tensor<CDataType> c_m_n_host_result(
+                        f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
+
+                    using ReferenceGemmInstance =
+                        ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                  BDataType,
+                                                                  CDataType,
+                                                                  AccDataType,
+                                                                  AElementOp,
+                                                                  BElementOp,
+                                                                  CElementOp>;
+
+                    auto ref_gemm     = ReferenceGemmInstance{};
+                    auto ref_invoker  = ref_gemm.MakeInvoker();
+                    auto ref_argument = ref_gemm.MakeArgument(a_m_k[i],
+                                                              b_k_n[i],
+                                                              c_m_n_host_result,
+                                                              a_element_op,
+                                                              b_element_op,
+                                                              c_element_op);
+
+                    ref_invoker.Run(ref_argument);
+
+                    bool group_pass =
+                        ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
+                    pass = pass && group_pass;
+
+                    std::cout << "group: " << i << " verification result: " << std::boolalpha
+                              << group_pass << std::endl;
+
+                    if(do_log)
+                    {
+                        LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",") << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                            << std::endl;
+                    }
+                }
+            }
+        }
+        else
+        {
+            std::cout << "does not support this GEMM problem" << std::endl;
+        }
+    }
+
+    if(do_verification)
+    {
+        std::cout << "Verification: " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -108,11 +108,6 @@ bool profile_grouped_gemm_impl(int do_verification,
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};

-    // if(do_verification)
-    // {
-
-    // }
-
    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
    std::vector<DeviceMemPtr> a_device_buf, b_device_buf, c_device_buf;

@@ -285,7 +280,7 @@ bool profile_grouped_gemm_impl(int do_verification,
              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;

    return pass;
-} // namespace profiler
+}

 } // namespace profiler
 } // namespace ck
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -29,6 +29,7 @@ set(PROFILER_SOURCES
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
    profile_batchnorm_infer.cpp
+    profile_grouped_gemm_fastgelu.cpp
 )

 set(PROFILER_EXECUTABLE ckProfiler)
@@ -68,4 +69,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instan
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)