Commit 000eefbf authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into fused-gemm

parents b64a2860 cac014f1
@@ -18,6 +18,89 @@ def runShell(String command){
return (output != "")
}
def getDockerImageName(){
def img = "${env.MIOPEN_IMAGE_URL}:composable_kernels_${params.COMPILER_VERSION}"
return img
}
def getDockerImage(Map conf=[:]){
env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
def gpu_arch = conf.get("gpu_arch", "gfx908") // prebuilt dockers should have all the architectures enabled so one image can be used for all stages
def no_cache = conf.get("no_cache", false)
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if(env.CCACHE_HOST)
{
def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
if(check_host == "+PONG")
{
echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
}
else
{
echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
}
if(no_cache)
{
dockerArgs = dockerArgs + " --no-cache "
}
echo "Docker Args: ${dockerArgs}"
def image = getDockerImageName()
//Check if image exists
def retimage
try
{
echo "Pulling down image: ${image}"
retimage = docker.image("${image}")
retimage.pull()
}
catch(Exception ex)
{
error "Unable to locate image: ${image}"
}
return [retimage, image]
}
def buildDocker(install_prefix){
show_node_info()
env.DOCKER_BUILDKIT=1
checkout scm
def image_name = getDockerImageName()
echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if(env.CCACHE_HOST)
{
def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
if(check_host == "+PONG")
{
echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
}
else
{
echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
}
echo "Build Args: ${dockerArgs}"
try{
echo "Checking for image: ${image_name}"
sh "docker manifest inspect --insecure ${image_name}"
echo "Image: ${image_name} found!! Skipping building image"
}
catch(Exception ex){
echo "Unable to locate image: ${image_name}. Building image now"
retimage = docker.build("${image_name}", dockerArgs + ' .')
retimage.push()
}
}
def cmake_build(Map conf=[:]){
def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
@@ -100,9 +183,10 @@ def buildHipClangJob(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
-dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
-def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+//def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if (params.COMPILER_VERSION != "release"){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
@@ -113,7 +197,8 @@ def buildHipClangJob(Map conf=[:]){
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
-retimage = docker.build("${image}", dockerArgs + '.')
+//retimage = docker.build("${image}", dockerArgs + '.')
+(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
@@ -190,9 +275,9 @@ def runCKProfiler(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
-dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
-def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if (params.COMPILER_VERSION != "release"){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
@@ -202,7 +287,8 @@ def runCKProfiler(Map conf=[:]){
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
-retimage = docker.build("${image}", dockerArgs + '.')
+//retimage = docker.build("${image}", dockerArgs + '.')
+(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
@@ -248,7 +334,7 @@ def runCKProfiler(Map conf=[:]){
archiveArtifacts "perf_batched_gemm_${gpu_arch}.log"
archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
archiveArtifacts "perf_conv_fwd_${gpu_arch}.log"
-archiveArtifacts "perf_conv_bwd_${gpu_arch}.log"
+archiveArtifacts "perf_conv_bwd_data_${gpu_arch}.log"
archiveArtifacts "perf_gemm_bilinear_${gpu_arch}.log"
archiveArtifacts "perf_reduction_${gpu_arch}.log"
// stash perf files to master
@@ -258,7 +344,7 @@ def runCKProfiler(Map conf=[:]){
stash name: "perf_batched_gemm_${gpu_arch}.log"
stash name: "perf_grouped_gemm_${gpu_arch}.log"
stash name: "perf_conv_fwd_${gpu_arch}.log"
-stash name: "perf_conv_bwd_${gpu_arch}.log"
+stash name: "perf_conv_bwd_data_${gpu_arch}.log"
stash name: "perf_gemm_bilinear_${gpu_arch}.log"
stash name: "perf_reduction_${gpu_arch}.log"
//we will process results on the master node
@@ -308,16 +394,17 @@ def process_results(Map conf=[:]){
// Jenkins is complaining about the render group
def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
-dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
-def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='release' "
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
-retimage = docker.build("${image}", dockerArgs + '.')
+//retimage = docker.build("${image}", dockerArgs + '.')
+(retimage, image) = getDockerImage(conf)
}
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
echo "The job was cancelled or aborted"
@@ -337,7 +424,7 @@ def process_results(Map conf=[:]){
unstash "perf_batched_gemm_${gpu_arch}.log"
unstash "perf_grouped_gemm_${gpu_arch}.log"
unstash "perf_conv_fwd_${gpu_arch}.log"
-unstash "perf_conv_bwd${gpu_arch}.log"
+unstash "perf_conv_bwd_data_${gpu_arch}.log"
unstash "perf_gemm_bilinear_${gpu_arch}.log"
unstash "perf_reduction_${gpu_arch}.log"
sh "./process_qa_data.sh ${gpu_arch}"
@@ -372,14 +459,22 @@ pipeline {
parallelsAlwaysFailFast()
}
parameters {
booleanParam(
name: "BUILD_DOCKER",
defaultValue: true,
description: "Force building docker image (default: true)")
string(
name: 'COMPILER_VERSION',
defaultValue: 'ck-9110',
-description: 'Specify which version of compiler to use: ck-9110 (default), release, or amd-mainline-open.')
+description: 'Specify which version of compiler to use: ck-9110 (default), release, or amd-stg-open.')
booleanParam(
name: "RUN_FULL_QA",
defaultValue: false,
description: "Select whether to run small set of performance tests (default) or full QA")
booleanParam(
name: "TEST_NODE_PERFORMANCE",
defaultValue: false,
description: "Test the node GPU performance (default: false)")
}
environment{
dbuser = "${dbuser}"
@@ -393,7 +488,24 @@ pipeline {
DOCKER_BUILDKIT = "1"
}
stages{
stage("Build Docker"){
when {
expression { params.BUILD_DOCKER.toBoolean() }
}
parallel{
stage('Docker /opt/rocm'){
agent{ label rocmnode("nogpu") }
steps{
buildDocker('/opt/rocm')
}
}
}
}
stage("Static checks") { stage("Static checks") {
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
parallel{ parallel{
// enable after we move from hipcc to hip-clang // enable after we move from hipcc to hip-clang
// stage('Tidy') { // stage('Tidy') {
...@@ -427,6 +539,10 @@ pipeline { ...@@ -427,6 +539,10 @@ pipeline {
} }
stage("Tests") stage("Tests")
{ {
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
parallel parallel
{ {
stage("Run Tests: gfx908") stage("Run Tests: gfx908")
...@@ -457,6 +573,10 @@ pipeline { ...@@ -457,6 +573,10 @@ pipeline {
} }
stage("Client App") stage("Client App")
{ {
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
parallel parallel
{ {
stage("Run Client App") stage("Run Client App")
...@@ -480,7 +600,7 @@ pipeline { ...@@ -480,7 +600,7 @@ pipeline {
{ {
when { when {
beforeAgent true beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() } expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
agent{ label rocmnode("gfx908")} agent{ label rocmnode("gfx908")}
environment{ environment{
...@@ -494,7 +614,7 @@ pipeline { ...@@ -494,7 +614,7 @@ pipeline {
{ {
when { when {
beforeAgent true beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() } expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
agent{ label rocmnode("gfx90a")} agent{ label rocmnode("gfx90a")}
environment{ environment{
...@@ -513,7 +633,7 @@ pipeline { ...@@ -513,7 +633,7 @@ pipeline {
stage("Process results for gfx908"){ stage("Process results for gfx908"){
when { when {
beforeAgent true beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() } expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
agent { label 'mici' } agent { label 'mici' }
steps{ steps{
...@@ -523,7 +643,7 @@ pipeline { ...@@ -523,7 +643,7 @@ pipeline {
stage("Process results for gfx90a"){ stage("Process results for gfx90a"){
when { when {
beforeAgent true beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() } expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
agent { label 'mici' } agent { label 'mici' }
steps{ steps{
...
add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
using ADataType = BF16;
using BDataType = BF16;
using AccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = BF16;
using D1DataType = BF16;
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
using EDataType = BF16;
using ALayout = Row;
using BLayout = Col;
using D0Layout = Row;
using D1Layout = Row;
using DsLayout = ck::Tuple<D0Layout, D1Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddAddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
#include "run_gemm_add_add_fastgelu_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstddef>
#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
+#include <stdexcept>
+#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -12,11 +12,12 @@
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -61,189 +62,6 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
-int main(int argc, char* argv[])
+#include "run_gemm_add_add_fastgelu_example.inc"
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideD0 = 0;
ck::index_t StrideD1 = 4096;
ck::index_t StrideE = 4096;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 12)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideD0 = std::stoi(argv[9]);
StrideD1 = std::stoi(argv[10]);
StrideE = std::stoi(argv[11]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
"StrideE\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_device_buf.ToDevice(d0_m_n.mData.data());
d1_device_buf.ToDevice(d1_m_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 2>{d0_device_buf.GetDeviceBuffer(),
d1_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 2>{StrideD0, StrideD1},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< device_op.GetTypeString() << std::endl;
if(do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor(
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
}
return 0;
}
int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
using ADataType = F32;
using BDataType = F32;
using AccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = F32;
using D1DataType = F32;
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
using EDataType = F32;
using ALayout = Row;
using BLayout = Col;
using D0Layout = Row;
using D1Layout = Row;
using DsLayout = ck::Tuple<D0Layout, D1Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddAddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_gemm_add_add_fastgelu_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using I8 = int8_t;
using I32 = int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using D0DataType = I8;
using D1DataType = I8;
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
using EDataType = I8;
using ALayout = Row;
using BLayout = Col;
using D0Layout = Row;
using D1Layout = Row;
using DsLayout = ck::Tuple<D0Layout, D1Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddAddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>;
// clang-format on
#include "run_gemm_add_add_fastgelu_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
#pragma once
struct ProblemSize final
{
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideD0 = 0;
ck::index_t StrideD1 = 4096;
ck::index_t StrideE = 4096;
};
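// Note on StrideD0 = 0: a zero row stride makes D0 a broadcast tensor, so a
// single N-element row acts as a per-column bias reused for every m. A tiny
// illustrative sketch of the addressing this implies (hypothetical helper,
// not from this file):
//
//   // row-major offset with row stride 0: every m maps to the same row
//   std::size_t d0_offset(std::size_t m, std::size_t n) { return m * 0 + n; }
//
// This is also why only sizeof(D0DataType) * N bytes are counted for D0 in
// the bandwidth estimate below.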
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
};
bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(config.init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_device_buf.ToDevice(d0_m_n.mData.data());
d1_device_buf.ToDevice(d1_m_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
{StrideD0, StrideD1},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< device_op.GetTypeString() << std::endl;
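// Sanity check of the reported metrics for the default ProblemSize (my
// arithmetic, not taken from the source): flop = 2 * M * N * K =
// 2 * 3840 * 4096 * 4096 ~= 1.288e11, so an ave_time of 1 ms would print
// roughly 128.8 TFlops. Likewise gb_per_sec divides bytes by 1e6 and by the
// time in ms, i.e. bytes / 1e9 per second, which is GB/s.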
if(config.do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
}
return true;
}
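// The verification loop above delegates the fused epilogue to CDEElementOp. A
// minimal host-side sketch of what AddAddFastGelu is expected to compute per
// element (the tanh-based GELU approximation is an assumption here; CK's
// exact formula lives in element_wise_operation.hpp):
//
//   #include <cmath>
//   inline float add_add_fast_gelu(float c, float d0, float d1)
//   {
//       const float x = c + d0 + d1; // the two adds
//       const float u = 0.7978845608f * (x + 0.044715f * x * x * x);
//       return 0.5f * x * (1.0f + std::tanh(u)); // fast GELU approximation
//   }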
bool run_gemm_add_add_fastgelu_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc == 12)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
problem_size.M = std::stoi(argv[4]);
problem_size.N = std::stoi(argv[5]);
problem_size.K = std::stoi(argv[6]);
problem_size.StrideA = std::stoi(argv[7]);
problem_size.StrideB = std::stoi(argv[8]);
problem_size.StrideD0 = std::stoi(argv[9]);
problem_size.StrideD1 = std::stoi(argv[10]);
problem_size.StrideE = std::stoi(argv[11]);
}
else
{
std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
"StrideE"
<< std::endl;
return true;
}
return run_gemm_add_add_fastgelu(problem_size, config);
}
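// Example invocations, assuming the executable names defined in the
// CMakeLists.txt hunk above (hypothetical command lines that match the
// argument parsing in this file):
//
//   ./example_gemm_add_add_fastgelu_xdl_fp16                 # defaults
//   ./example_gemm_add_add_fastgelu_xdl_fp16 1 1 1           # verify, integer init, time kernel
//   ./example_gemm_add_add_fastgelu_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 0 4096 4096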
-add_example_executable(example_gemm_bias_e_permute_m3n2_xdl_fp16 gemm_bias_e_permute_m3n2_xdl_fp16.cpp)
+add_example_executable(example_gemm_bias_e_permute_g1m3n2k1_xdl_fp16 gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp)
-add_example_executable(example_gemm_bias_e_permute_m2n3_xdl_fp16 gemm_bias_e_permute_m2n3_xdl_fp16.cpp)
+add_example_executable(example_gemm_bias_e_permute_g1m2n3k1_xdl_fp16 gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp)
@@ -16,6 +16,8 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -33,7 +35,7 @@ using DDataType = F16;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F16;
-static constexpr ck::index_t NumDimG = 0;
+static constexpr ck::index_t NumDimG = 1;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 3;
static constexpr ck::index_t NumDimK = 1;
@@ -69,30 +71,31 @@ template <ck::index_t NumDimM,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
-ck::enable_if_t<NumDimM == 2 && NumDimN == 3 && NumDimK == 1, bool> = false>
-struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::BaseOperator
+ck::enable_if_t<NumDimG == 1 && NumDimM == 2 && NumDimN == 3 && NumDimK == 1, bool> = false>
+struct ReferenceContraction_G1_M2_N3_K1 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
-Argument(const Tensor<ADataType>& a_ms_ks,
-         const Tensor<BDataType>& b_ns_ks,
-         Tensor<EDataType>& e_ms_ns,
+Argument(const Tensor<ADataType>& a_gs_ms_ks,
+         const Tensor<BDataType>& b_gs_ns_ks,
+         Tensor<EDataType>& e_gs_ms_ns,
         AElementwiseOperation a_element_op,
         BElementwiseOperation b_element_op,
         CDEElementwiseOperation cde_element_op)
-    : a_ms_ks_{a_ms_ks},
-      b_ns_ks_{b_ns_ks},
-      e_ms_ns_{e_ms_ns},
+    : a_gs_ms_ks_{a_gs_ms_ks},
+      b_gs_ns_ks_{b_gs_ns_ks},
+      e_gs_ms_ns_{e_gs_ms_ns},
      a_element_op_{a_element_op},
      b_element_op_{b_element_op},
      cde_element_op_{cde_element_op}
{
}
-const Tensor<ADataType>& a_ms_ks_;
-const Tensor<BDataType>& b_ns_ks_;
-Tensor<EDataType>& e_ms_ns_;
+const Tensor<ADataType>& a_gs_ms_ks_;
+const Tensor<BDataType>& b_gs_ns_ks_;
+Tensor<EDataType>& e_gs_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
@@ -102,12 +105,12 @@ struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::Base
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
-using Argument = ReferenceContraction_M2_N3_K1::Argument;
+using Argument = ReferenceContraction_G1_M2_N3_K1::Argument;
float Run(const Argument& arg)
{
-auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1, auto n2) {
-    const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
+auto f_gs_ms_ns = [&](auto g0, auto m0, auto m1, auto n0, auto n1, auto n2) {
+    const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
@@ -117,9 +120,10 @@ struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::Base
AccDataType v_b;
arg.a_element_op_(
-    v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0)));
+    v_a, ck::type_convert<const AccDataType>(arg.a_gs_ms_ks_(g0, m0, m1, k0)));
arg.b_element_op_(
-    v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, n2, k0)));
+    v_b,
+    ck::type_convert<const AccDataType>(arg.b_gs_ns_ks_(g0, n0, n1, n2, k0)));
v_acc += v_a * v_b;
}
@@ -128,15 +132,16 @@ struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::Base
arg.cde_element_op_(v_c, v_acc);
-arg.e_ms_ns_(m0, m1, n0, n1, n2) = v_c;
+arg.e_gs_ms_ns_(g0, m0, m1, n0, n1, n2) = v_c;
};
-make_ParallelTensorFunctor(f_ms_ns,
-                           arg.e_ms_ns_.mDesc.GetLengths()[0],
-                           arg.e_ms_ns_.mDesc.GetLengths()[1],
-                           arg.e_ms_ns_.mDesc.GetLengths()[2],
-                           arg.e_ms_ns_.mDesc.GetLengths()[3],
-                           arg.e_ms_ns_.mDesc.GetLengths()[4])(
+make_ParallelTensorFunctor(f_gs_ms_ns,
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[0],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[1],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[2],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[3],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[4],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[5])(
    std::thread::hardware_concurrency());
return 0;
@@ -160,14 +165,15 @@ struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::Base
return true;
}
-static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                         const Tensor<BDataType>& b_ns_ks,
-                         Tensor<EDataType>& e_ms_ns,
+static auto MakeArgument(const Tensor<ADataType>& a_gs_ms_ks,
+                         const Tensor<BDataType>& b_gs_ns_ks,
+                         Tensor<EDataType>& e_gs_ms_ns,
                         AElementwiseOperation a_element_op,
                         BElementwiseOperation b_element_op,
                         CDEElementwiseOperation cde_element_op)
{
-return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
+return Argument{
+    a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
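// In loop-nest form, the reference operator above computes (an illustrative
// sketch of the semantics, using the new G-dimension naming):
//
//   E[g0, m0, m1, n0, n1, n2] =
//       cde_op( sum over k0 of a_op(A[g0, m0, m1, k0]) * b_op(B[g0, n0, n1, n2, k0]) )
//
// i.e. a batched tensor contraction over the single K dimension, parallelized
// across all six output indices by make_ParallelTensorFunctor.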
@@ -196,28 +202,31 @@ int main(int argc, char* argv[])
int init_method = 1;
bool time_kernel = false;
+ck::index_t G0 = 1;
ck::index_t M0 = 4;
ck::index_t M1 = 256;
ck::index_t N0 = 4;
-ck::index_t N1 = 8;
-ck::index_t N2 = 128;
+ck::index_t N1 = 16;
+ck::index_t N2 = 32;
ck::index_t K0 = 256;
// A[M0, M1, M2, K0]
-std::vector<ck::index_t> a_ms_ks_lengths{M0, M1, K0};
-std::vector<ck::index_t> a_ms_ks_strides{M1 * K0, K0, 1};
+std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, M0, M1, K0};
+std::vector<ck::index_t> a_gs_ms_ks_strides{M0 * M1 * K0, M1 * K0, K0, 1};
// B[N0, N1, K0]
-std::vector<ck::index_t> b_ns_ks_lengths{N0, N1, N2, K0};
-std::vector<ck::index_t> b_ns_ks_strides{N1 * N2 * K0, N2 * K0, K0, 1};
+std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, N0, N1, N2, K0};
+std::vector<ck::index_t> b_gs_ns_ks_strides{N0 * N1 * N2 * K0, N1 * N2 * K0, N2 * K0, K0, 1};
// D[N0, M0, N1, M1, N2]
-std::vector<ck::index_t> d_ms_ns_lengths{M0, M1, N0, N1, N2};
-std::vector<ck::index_t> d_ms_ns_strides{0, 0, N1 * N2, N1, 1};
+std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2};
+std::vector<ck::index_t> d_gs_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1};
// E[N0, M0, N1, M1, N2]
-std::vector<ck::index_t> e_ms_ns_lengths{M0, M1, N0, N1, N2};
-std::vector<ck::index_t> e_ms_ns_strides{N1 * M1 * N2, N2, M0 * N1 * M1 * N2, M1 * N2, 1};
+std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2};
+std::vector<ck::index_t> e_gs_ms_ns_strides{
+    M0 * M1 * N0 * N1 * N2, N1 * M1 * N2, N2, M0 * N1 * M1 * N2, M1 * N2, 1};
if(argc == 1)
{
@@ -237,50 +246,51 @@ int main(int argc, char* argv[])
exit(0);
}
-Tensor<ADataType> a_ms_ks(
-    std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
-    std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
-Tensor<BDataType> b_ns_ks(
-    std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-    std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-Tensor<DDataType> d_ms_ns(
-    std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
-    std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
-Tensor<EDataType> e_ms_ns_host_result(
-    std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-    std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-Tensor<EDataType> e_ms_ns_device_result(
-    std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-    std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+Tensor<ADataType> a_gs_ms_ks(
+    std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+    std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+Tensor<BDataType> b_gs_ns_ks(
+    std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+    std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
+Tensor<DDataType> d_gs_ms_ns(
+    std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
+    std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
+Tensor<EDataType> e_gs_ms_ns_host_result(
+    std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+    std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+Tensor<EDataType> e_gs_ms_ns_device_result(
+    std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+    std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
-std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
-std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
-std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
+std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
+std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
-a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-d_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
-a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-d_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
}
-DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
-DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
-DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
+DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
+DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize());
+DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize());
+DeviceMem e_device_buf(sizeof(EDataType) *
+                       e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize());
-a_device_buf.ToDevice(a_ms_ks.mData.data());
-b_device_buf.ToDevice(b_ns_ks.mData.data());
-d_device_buf.ToDevice(d_ms_ns.mData.data());
+a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+b_device_buf.ToDevice(b_gs_ns_ks.mData.data());
+d_device_buf.ToDevice(d_gs_ms_ns.mData.data());
// set zero
e_device_buf.SetZero();
@@ -296,14 +306,14 @@ int main(int argc, char* argv[])
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
-a_ms_ks_lengths,
-a_ms_ks_strides,
-b_ns_ks_lengths,
-b_ns_ks_strides,
-std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-e_ms_ns_lengths,
-e_ms_ns_strides,
+a_gs_ms_ks_lengths,
+a_gs_ms_ks_strides,
+b_gs_ns_ks_lengths,
+b_gs_ns_ks_strides,
+std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
+std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
+e_gs_ms_ns_lengths,
+e_gs_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
@@ -317,18 +327,18 @@ int main(int argc, char* argv[])
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
-                                e_ms_ns_lengths.begin() + NumDimM,
-                                ck::index_t{1},
-                                std::multiplies<ck::index_t>{});
+std::size_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG,
+                                e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+                                ck::index_t{1},
+                                std::multiplies<ck::index_t>{});
-ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
-                                e_ms_ns_lengths.begin() + NumDimM + NumDimN,
-                                ck::index_t{1},
-                                std::multiplies<ck::index_t>{});
+std::size_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+                                e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN,
+                                ck::index_t{1},
+                                std::multiplies<ck::index_t>{});
-ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
-                                a_ms_ks_lengths.begin() + NumDimM + NumDimK,
-                                ck::index_t{1},
-                                std::multiplies<ck::index_t>{});
+std::size_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM,
+                                a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK,
+                                ck::index_t{1},
+                                std::multiplies<ck::index_t>{});
@@ -343,15 +353,15 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
          << op.GetTypeString() << std::endl;
-e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data());
if(do_verification)
{
-Tensor<CShuffleDataType> c_ms_ns_host_result(
-    std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-    std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
+    std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+    std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-using ReferenceOpInstance = ReferenceContraction_M2_N3_K1<NumDimM,
+using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1<NumDimM,
                                                          NumDimN,
                                                          NumDimK,
                                                          ADataType,
@@ -365,31 +375,41 @@ int main(int argc, char* argv[])
auto ref_gemm    = ReferenceOpInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
-auto ref_argument = ref_gemm.MakeArgument(
-    a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+auto ref_argument = ref_gemm.MakeArgument(a_gs_ms_ks,
+                                          b_gs_ns_ks,
+                                          c_gs_ms_ns_host_result,
+                                          a_element_op,
+                                          b_element_op,
+                                          PassThrough{});
ref_invoker.Run(ref_argument);
-for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
+for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0)
+{
+for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++m0)
{
-for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
+for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m1)
{
-for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
+for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++n0)
{
-for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
+for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n1)
{
-for(size_t n2 = 0; n2 < e_ms_ns_host_result.mDesc.GetLengths()[4]; ++n2)
+for(size_t n2 = 0; n2 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5]; ++n2)
{
-cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1, n2),
-               c_ms_ns_host_result(m0, m1, n0, n1, n2),
-               d_ms_ns(m0, m1, n0, n1, n2));
+cde_element_op(e_gs_ms_ns_host_result(g0, m0, m1, n0, n1, n2),
+               c_gs_ms_ns_host_result(g0, m0, m1, n0, n1, n2),
+               d_gs_ms_ns(g0, m0, m1, n0, n1, n2));
}
}
}
}
+}
}
-return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+    ? 0
+    : 1;
}
return 0;
......
@@ -33,7 +33,7 @@ using DDataType = F16;
 using DsDataType = ck::Tuple<DDataType>;
 using EDataType  = F16;

-static constexpr ck::index_t NumDimG = 0;
+static constexpr ck::index_t NumDimG = 1;
 static constexpr ck::index_t NumDimM = 3;
 static constexpr ck::index_t NumDimN = 2;
 static constexpr ck::index_t NumDimK = 1;

@@ -53,13 +53,13 @@ using DeviceOpInstanceKKNN = ck::tensor_operation::device::
 //############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
 //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
 //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>;
+DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>;
 // clang-format on

 using DeviceOpInstance = DeviceOpInstanceKKNN;

-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
+template <ck::index_t NumDimG,
+          ck::index_t NumDimM,
           ck::index_t NumDimN,
           ck::index_t NumDimK,
           typename ADataType,

@@ -69,30 +69,31 @@ template <ck::index_t NumDimM,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 3 && NumDimN == 2 && NumDimK == 1, bool> = false>
-struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::BaseOperator
+          ck::enable_if_t<NumDimG == 1 && NumDimM == 3 && NumDimN == 2 && NumDimK == 1, bool> =
+              false>
+struct ReferenceContraction_G1_M3_N2_K1 : public ck::tensor_operation::device::BaseOperator
 {
     // Argument
     struct Argument : public ck::tensor_operation::device::BaseArgument
     {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
+        Argument(const Tensor<ADataType>& a_gs_ms_ks,
+                 const Tensor<BDataType>& b_gs_ns_ks,
+                 Tensor<EDataType>& e_gs_ms_ns,
                  AElementwiseOperation a_element_op,
                  BElementwiseOperation b_element_op,
                  CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
+            : a_gs_ms_ks_{a_gs_ms_ks},
+              b_gs_ns_ks_{b_gs_ns_ks},
+              e_gs_ms_ns_{e_gs_ms_ns},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              cde_element_op_{cde_element_op}
        {
        }

-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
+        const Tensor<ADataType>& a_gs_ms_ks_;
+        const Tensor<BDataType>& b_gs_ns_ks_;
+        Tensor<EDataType>& e_gs_ms_ns_;

         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;

@@ -102,12 +103,12 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
     // Invoker
     struct Invoker : public ck::tensor_operation::device::BaseInvoker
     {
-        using Argument = ReferenceContraction_M3_N2_K1::Argument;
+        using Argument = ReferenceContraction_G1_M3_N2_K1::Argument;

         float Run(const Argument& arg)
         {
-            auto f_ms_ns = [&](auto m0, auto m1, auto m2, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[3];
+            auto f_gs_ms_ns = [&](auto g0, auto m0, auto m1, auto m2, auto n0, auto n1) {
+                const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4];

                 AccDataType v_acc = 0;

@@ -117,9 +118,10 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
                     AccDataType v_b;

                     arg.a_element_op_(
-                        v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, m2, k0)));
+                        v_a,
+                        ck::type_convert<const AccDataType>(arg.a_gs_ms_ks_(g0, m0, m1, m2, k0)));
                     arg.b_element_op_(
-                        v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0)));
+                        v_b, ck::type_convert<const AccDataType>(arg.b_gs_ns_ks_(g0, n0, n1, k0)));

                     v_acc += v_a * v_b;
                 }

@@ -128,15 +130,16 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
                 arg.cde_element_op_(v_c, v_acc);

-                arg.e_ms_ns_(m0, m1, m2, n0, n1) = v_c;
+                arg.e_gs_ms_ns_(g0, m0, m1, m2, n0, n1) = v_c;
             };

-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[4])(
+            make_ParallelTensorFunctor(f_gs_ms_ns,
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[3],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[4],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[5])(
                 std::thread::hardware_concurrency());

             return 0;

@@ -160,14 +163,15 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
         return true;
     }

-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
+    static auto MakeArgument(const Tensor<ADataType>& a_gs_ms_ks,
+                             const Tensor<BDataType>& b_gs_ns_ks,
+                             Tensor<EDataType>& e_gs_ms_ns,
                              AElementwiseOperation a_element_op,
                              BElementwiseOperation b_element_op,
                              CDEElementwiseOperation cde_element_op)
     {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
+        return Argument{
+            a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op};
     }

     static auto MakeInvoker() { return Invoker{}; }

@@ -182,7 +186,7 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
         auto str = std::stringstream();
         // clang-format off
-        str << "ReferenceContraction_M3_N2_K1"
+        str << "ReferenceContraction_G1_M3_N2_K1"
            << std::endl;
         // clang-format on

@@ -196,36 +200,33 @@ int main(int argc, char* argv[])
     int init_method      = 1;
     bool time_kernel     = false;

+    ck::index_t G0 = 1;
     ck::index_t M0 = 4;
-    ck::index_t M1 = 32;
-    ck::index_t M2 = 128;
+    ck::index_t M1 = 8;
+    ck::index_t M2 = 256;

-    ck::index_t N0 = 16;
-    ck::index_t N1 = 256;
+    ck::index_t N0 = 32;
+    ck::index_t N1 = 128;

-    ck::index_t K0 = 256;
+    ck::index_t K0 = 1024;

     // A[M0, M1, M2, K0]
-    std::vector<ck::index_t> a_ms_ks_lengths{M0, M1, M2, K0};
-    std::vector<ck::index_t> a_ms_ks_strides{M1 * M2 * K0, M2 * K0, K0, 1};
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, M0, M1, M2, K0};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{M0 * M1 * M2 * K0, M1 * M2 * K0, M2 * K0, K0, 1};
     // B[N0, N1, K0]
-    std::vector<ck::index_t> b_ns_ks_lengths{N0, N1, K0};
-    std::vector<ck::index_t> b_ns_ks_strides{N1 * K0, K0, 1};
+    std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, N0, N1, K0};
+    std::vector<ck::index_t> b_gs_ns_ks_strides{N0 * N1 * K0, N1 * K0, K0, 1};

-#if 1
-    // D[M0, N0, M1, N1, M2]
-    std::vector<ck::index_t> d_ms_ns_lengths{M0, M1, M2, N0, N1};
-    std::vector<ck::index_t> d_ms_ns_strides{0, 0, 0, N1, 1};
-    // E[M0, N0, M1, N1, M2]
-    std::vector<ck::index_t> e_ms_ns_lengths{M0, M1, M2, N0, N1};
-    std::vector<ck::index_t> e_ms_ns_strides{N0 * M1 * N1 * M2, N1 * M2, 1, M1 * N1 * M2, M2};
-#else
-    // D[M0, N0, M1, N1, M2]
-    std::vector<ck::index_t> d_ms_ns_lengths{M0, M1, M2, N0, N1};
-    std::vector<ck::index_t> d_ms_ns_strides{0, 0, 0, N1, 1};
-    // E[M0, N0, M1, N1, M2]
-    std::vector<ck::index_t> e_ms_ns_lengths{M0, M1, M2, N0, N1};
-    std::vector<ck::index_t> e_ms_ns_strides{M1 * M2 * N0 * N1, M2 * N0 * N1, N0 * N1, N1, 1};
-#endif
+    // D[M0, N0, M1, N1, M2]
+    std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, M0, M1, M2, N0, N1};
+    std::vector<ck::index_t> d_gs_ms_ns_strides{N0 * N1, 0, 0, 0, N1, 1};
+
+    // E[M1, M0, N0, M1, N1]
+    std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, M0, M1, M2, N0, N1};
+    std::vector<ck::index_t> e_gs_ms_ns_strides{
+        M0 * M1 * M2 * N1 * N0, N0 * M1 * N1, N1, M0 * N0 * M1 * N1, M1 * N1, 1};

     if(argc == 1)
     {

@@ -245,50 +246,51 @@ int main(int argc, char* argv[])
         exit(0);
     }

-    Tensor<ADataType> a_ms_ks(
-        std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
-    Tensor<BDataType> b_ns_ks(
-        std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-    Tensor<DDataType> d_ms_ns(
-        std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_host_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_device_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+    Tensor<ADataType> a_gs_ms_ks(
+        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+    Tensor<BDataType> b_gs_ns_ks(
+        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
+    Tensor<DDataType> d_gs_ms_ns(
+        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
+    Tensor<EDataType> e_gs_ms_ns_host_result(
+        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+    Tensor<EDataType> e_gs_ms_ns_device_result(
+        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));

-    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
-    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
-    std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
-    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
+    std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
+    std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl;

     switch(init_method)
     {
     case 0: break;
     case 1:
-        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        d_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
         break;
     default:
-        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        d_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        break;
     }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) *
+                           e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize());

-    a_device_buf.ToDevice(a_ms_ks.mData.data());
-    b_device_buf.ToDevice(b_ns_ks.mData.data());
-    d_device_buf.ToDevice(d_ms_ns.mData.data());
+    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_gs_ns_ks.mData.data());
+    d_device_buf.ToDevice(d_gs_ms_ns.mData.data());

     // set zero
     e_device_buf.SetZero();

@@ -304,14 +306,14 @@ int main(int argc, char* argv[])
         b_device_buf.GetDeviceBuffer(),
         std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
         e_device_buf.GetDeviceBuffer(),
-        a_ms_ks_lengths,
-        a_ms_ks_strides,
-        b_ns_ks_lengths,
-        b_ns_ks_strides,
-        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-        e_ms_ns_lengths,
-        e_ms_ns_strides,
+        a_gs_ms_ks_lengths,
+        a_gs_ms_ks_strides,
+        b_gs_ns_ks_lengths,
+        b_gs_ns_ks_strides,
+        std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
+        std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
+        e_gs_ms_ns_lengths,
+        e_gs_ms_ns_strides,
         a_element_op,
         b_element_op,
         cde_element_op);

@@ -325,18 +327,18 @@ int main(int argc, char* argv[])
     float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
-                                    e_ms_ns_lengths.begin() + NumDimM,
+    ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin(),
+                                    e_gs_ms_ns_lengths.begin() + NumDimM,
                                     ck::index_t{1},
                                     std::multiplies<ck::index_t>{});

-    ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
-                                    e_ms_ns_lengths.begin() + NumDimM + NumDimN,
+    ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimM,
+                                    e_gs_ms_ns_lengths.begin() + NumDimM + NumDimN,
                                     ck::index_t{1},
                                     std::multiplies<ck::index_t>{});

-    ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
-                                    a_ms_ks_lengths.begin() + NumDimM + NumDimK,
+    ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimM,
+                                    a_gs_ms_ks_lengths.begin() + NumDimM + NumDimK,
                                     ck::index_t{1},
                                     std::multiplies<ck::index_t>{});

@@ -351,15 +353,16 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << op.GetTypeString() << std::endl;

-    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+    e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data());

     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(
-            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
+            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));

-        using ReferenceOpInstance = ReferenceContraction_M3_N2_K1<NumDimM,
+        using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1<NumDimG,
+                                                                     NumDimM,
                                                                      NumDimN,
                                                                      NumDimK,
                                                                      ADataType,

@@ -373,31 +376,41 @@ int main(int argc, char* argv[])
         auto ref_gemm    = ReferenceOpInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();

-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        auto ref_argument = ref_gemm.MakeArgument(a_gs_ms_ks,
+                                                  b_gs_ns_ks,
+                                                  c_gs_ms_ns_host_result,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  PassThrough{});

         ref_invoker.Run(ref_argument);

-        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
-        {
-            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
-            {
-                for(size_t m2 = 0; m2 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++m2)
-                {
-                    for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n0)
-                    {
-                        for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[4]; ++n1)
-                        {
-                            cde_element_op(e_ms_ns_host_result(m0, m1, m2, n0, n1),
-                                           c_ms_ns_host_result(m0, m1, m2, n0, n1),
-                                           d_ms_ns(m0, m1, m2, n0, n1));
-                        }
-                    }
-                }
-            }
-        }
+        for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0)
+        {
+            for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++m0)
+            {
+                for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m1)
+                {
+                    for(size_t m2 = 0; m2 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++m2)
+                    {
+                        for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n0)
+                        {
+                            for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5]; ++n1)
+                            {
+                                cde_element_op(e_gs_ms_ns_host_result(g0, m0, m1, m2, n0, n1),
+                                               c_gs_ms_ns_host_result(g0, m0, m1, m2, n0, n1),
+                                               d_gs_ms_ns(g0, m0, m1, m2, n0, n1));
+                            }
+                        }
+                    }
+                }
+            }
+        }

-        return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+                   ? 0
+                   : 1;
     }

     return 0;

...
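For reporting purposes the examples flatten the multi-index contraction dimensions into plain GEMM sizes. A standalone sketch of that flattening for the lengths configured above (editor's addition; the variable names are illustrative, and the leading G dimension is skipped via the NumDimG offset as in the batched variant):

```
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // E lengths as configured above: [G0, M0, M1, M2, N0, N1] = [1, 4, 8, 256, 32, 128]
    const int NumDimG = 1, NumDimM = 3, NumDimN = 2;
    const std::vector<std::size_t> e_lengths{1, 4, 8, 256, 32, 128};

    // Product of a [first, first + count) slice of the length vector,
    // mirroring the std::accumulate calls used for M, N and K.
    auto prod = [&](int first, int count) {
        return std::accumulate(e_lengths.begin() + first,
                               e_lengths.begin() + first + count,
                               std::size_t{1},
                               std::multiplies<std::size_t>{});
    };

    std::cout << "M = " << prod(NumDimG, NumDimM)             // 4 * 8 * 256 = 8192
              << ", N = " << prod(NumDimG + NumDimM, NumDimN) // 32 * 128   = 4096
              << std::endl;
    return 0;
}
```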
@@ -46,7 +46,7 @@ using DeviceInstance = ck::tensor_operation::device::DeviceLayernorm<XDataType,
                        8, // SrcScalarPerVector
                        8, // GammaScalarPerVector
                        8, // BetaScalarPerVector
-                       1>; // OutScalarPerVector
+                       8>; // OutScalarPerVector

 int main()
 {

...
add_example_executable(example_grouped_convnd_fwd_bias_relu_xdl_fp16 grouped_convnd_fwd_bias_relu_xdl_fp16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_xdl_fp16 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 PRIVATE utility)
\ No newline at end of file
@@ -12,17 +12,23 @@
 # <left padding>, (ie LeftPy, LeftPx for 2D)
 # <right padding>, (ie RightPy, RightPx for 2D)
-bin/example_grouped_convnd_fwd_bias_relu_xdl_fp16 1 1 1
+bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1
 ```

 Result (MI100)
 ```
-in: dim 5, lengths {1, 128, 192, 71, 71}, strides {6912, 967872, 1, 13632, 192}
-wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {192, 1728, 1, 576, 192}
-bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
-out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+in: dim 5, lengths {2, 128, 192, 71, 71}, strides {192, 1935744, 1, 27264, 384}
+wei: dim 5, lengths {2, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
+bias: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+residual: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+out: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 663552, 1, 18432, 512}
+A[M, K]: {165888, 1728}
+B[N, K]: {256, 1728}
+Ds[M, N]: {165888, 256}
+Ds[M, N]: {165888, 256}
+E[M, N]: {165888, 256}
+launch_and_time_kernel: grid_dim {2592, 1, 1}, block_dim {256, 1, 1}
 Warm up 1 time
 Start running 10 times...
-Perf: 1.19215 ms, 123.112 TFlops, 279.827 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
+Perf: 2.48075 ms, 118.325 TFlops, 268.946 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
 ```
\ No newline at end of file
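The TFlops figure in the log is just the forward-convolution flop count divided by the kernel time. A minimal sketch of that arithmetic for the run above (editor's addition; it assumes the usual 2-flops-per-MAC convention, with all shape constants taken from the log):

```
#include <cstdint>
#include <iostream>

int main()
{
    // Shapes from the MI100 log: G=2, N=128, K=256, C=192, 3x3 filter, 36x36 output
    const std::int64_t G = 2, N = 128, K = 256, C = 192;
    const std::int64_t Y = 3, X = 3, Ho = 36, Wo = 36;

    const double flop    = 2.0 * G * N * K * Ho * Wo * C * Y * X; // 2 flops per MAC
    const double time_ms = 2.48075;                               // reported kernel time

    std::cout << flop / (time_ms * 1e-3) / 1e12 << " TFlops" << std::endl; // ~118.3
    return 0;
}
```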
@@ -28,18 +28,20 @@ void print_helper_msg()
 template <ck::index_t NDimSpatial,
           typename InDataType,
           typename WeiDataType,
+          typename CShuffleDataType,
           typename OutDataType,
           typename InElementOp,
           typename WeiElementOp,
           typename OutElementOp,
           typename DeviceConvNDFwdInstance>
-int run_grouped_conv_fwd_bias(bool do_verification,
+int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
           int init_method,
           bool time_kernel,
           const ck::utils::conv::ConvParam& conv_param,
           const HostTensorDescriptor& in_g_n_c_wis_desc,
           const HostTensorDescriptor& wei_g_k_c_xs_desc,
           const HostTensorDescriptor& bias_g_n_k_wos_desc,
+          const HostTensorDescriptor& residual_g_n_k_wos_desc,
           const HostTensorDescriptor& out_g_n_k_wos_desc,
           const InElementOp& in_element_op,
           const WeiElementOp& wei_element_op,

@@ -48,12 +50,14 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     Tensor<InDataType> in(in_g_n_c_wis_desc);
     Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
     Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
+    Tensor<OutDataType> residual(residual_g_n_k_wos_desc);
     Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
     Tensor<OutDataType> out_device(out_g_n_k_wos_desc);

     std::cout << "in: " << in.mDesc << std::endl;
     std::cout << "wei: " << wei.mDesc << std::endl;
     std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "residual: " << residual.mDesc << std::endl;
     std::cout << "out: " << out_host.mDesc << std::endl;

     switch(init_method)

@@ -73,18 +77,22 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
     DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
     DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem residual_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpaceSize());
     DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());

     in_device_buf.ToDevice(in.mData.data());
     wei_device_buf.ToDevice(wei.mData.data());
     bias_device_buf.ToDevice(bias.mData.data());
+    residual_device_buf.ToDevice(residual.mData.data());

     std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
     std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d1_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d1_g_n_k_wos_strides{};
     std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
     std::array<ck::index_t, NDimSpatial> conv_filter_strides{};

@@ -98,8 +106,10 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
     copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
     copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
-    copy(bias_g_n_k_wos_desc.GetLengths(), d_g_n_k_wos_lengths);
-    copy(bias_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
+    copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths);
+    copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides);
+    copy(residual_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
+    copy(residual_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
     copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
     copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
     copy(conv_param.conv_filter_strides_, conv_filter_strides);

@@ -110,17 +120,20 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     // do Conv
     auto conv    = DeviceConvNDFwdInstance{};
     auto invoker = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(
-        in_device_buf.GetDeviceBuffer(),
-        wei_device_buf.GetDeviceBuffer(),
-        std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
-        out_device_buf.GetDeviceBuffer(),
-        a_g_n_c_wis_lengths,
-        a_g_n_c_wis_strides,
-        b_g_k_c_xs_lengths,
-        b_g_k_c_xs_strides,
-        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_lengths}},
-        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_strides}},
-        e_g_n_k_wos_lengths,
-        e_g_n_k_wos_strides,
-        conv_filter_strides,
+    auto argument =
+        conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                          wei_device_buf.GetDeviceBuffer(),
+                          std::array<const void*, 2>{bias_device_buf.GetDeviceBuffer(),
+                                                     residual_device_buf.GetDeviceBuffer()},
+                          out_device_buf.GetDeviceBuffer(),
+                          a_g_n_c_wis_lengths,
+                          a_g_n_c_wis_strides,
+                          b_g_k_c_xs_lengths,
+                          b_g_k_c_xs_strides,
+                          std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{
+                              {d0_g_n_k_wos_lengths, d1_g_n_k_wos_lengths}},
+                          std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{
+                              {d0_g_n_k_wos_strides, d1_g_n_k_wos_strides}},
+                          e_g_n_k_wos_lengths,
+                          e_g_n_k_wos_strides,
+                          conv_filter_strides,

@@ -152,12 +165,12 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     {
         using PassThrough = ck::tensor_operation::element_wise::PassThrough;

-        Tensor<OutDataType> c_host(out_g_n_k_wos_desc);
+        Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);

         auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                      InDataType,
                                                                      WeiDataType,
-                                                                     OutDataType,
+                                                                     CShuffleDataType,
                                                                      InElementOp,
                                                                      WeiElementOp,
                                                                      PassThrough>();

@@ -177,8 +190,9 @@ int run_grouped_conv_fwd_bias(bool do_verification,
         ref_invoker.Run(ref_argument);

         // TODO: implement elementwise operation for host
-        out_host.ForEach(
-            [&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });
+        out_host.ForEach([&](auto&, auto idx) {
+            out_element_op(out_host(idx), c_host(idx), bias(idx), residual(idx));
+        });

         out_device_buf.FromDevice(out_device.mData.data());

...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = float;
using BiasDataType = ck::bhalf_t;
using ResidualDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
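// Editor's note: AddReluAdd presumably fuses both extra D tensors into the
// output as out = relu(conv + bias) + residual; the operand order matches the
// host-side check in the common header, which calls
// out_element_op(out_host(idx), c_host(idx), bias(idx), residual(idx)).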
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 32, 1, 8>,
8>;
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
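// Editor's worked check of the mapping above (not part of the original
// example): the conventional layout splits its channels across the G groups,
// so per-group counts become C = 384 / 2 = 192 and K = 512 / 2 = 256, which
// is exactly the ConvParam{ndim=2, G=2, N=128, K=256, C=192, ...} constructed
// here.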
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_NW_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
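// Editor's note: the bias and residual descriptors below use stride 0 on the
// n and spatial dimensions, so a single [G, K] slice is broadcast across
// every output position instead of materializing a full [G, N, K, Wo] tensor.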
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // k
1, // c
0 // x
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // k
1, // c
0 // x
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_NHW_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_NDHW_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // z
0, // y
0 // x
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // z
0, // y
0 // x
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "grouped_convnd_fwd_bias_common.hpp"
+#include "grouped_convnd_fwd_bias_relu_add_common.hpp"

 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"

@@ -12,6 +12,7 @@ using WeiDataType = ck::half_t;
 using AccDataType      = float;
 using CShuffleDataType = ck::half_t;
 using BiasDataType     = ck::half_t;
+using ResidualDataType = ck::half_t;
 using OutDataType      = ck::half_t;

 template <ck::index_t... Is>

@@ -19,31 +20,31 @@ using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;

 static constexpr auto ConvSpec =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

-#if 1
 template <ck::index_t NDimSpatial,
           typename InLayout,
           typename WeiLayout,
           typename BiasLayout,
+          typename ResidualLayout,
           typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
     ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
         NDimSpatial,
         InLayout,
         WeiLayout,
-        ck::Tuple<BiasLayout>,
+        ck::Tuple<BiasLayout, ResidualLayout>,
         OutLayout,
         InDataType,
         WeiDataType,
         AccDataType,
         CShuffleDataType,
-        ck::Tuple<BiasDataType>,
+        ck::Tuple<BiasDataType, ResidualDataType>,
         OutDataType,
         InElementOp,
         WeiElementOp,

@@ -79,60 +80,6 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         S<1, 32, 1, 8>,
         8>;
-#else
-template <ck::index_t NDimSpatial,
-          typename InLayout,
-          typename WeiLayout,
-          typename BiasLayout,
-          typename OutLayout>
-using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
-        NDimSpatial,
-        InLayout,
-        WeiLayout,
-        ck::Tuple<BiasLayout>,
-        OutLayout,
-        InDataType,
-        WeiDataType,
-        AccDataType,
-        CShuffleDataType,
-        ck::Tuple<BiasDataType>,
-        OutDataType,
-        InElementOp,
-        WeiElementOp,
-        OutElementOp,
-        ConvSpec,    // ConvForwardSpecialization
-        GemmSpec,    // GemmSpecialization
-        1,           //
-        256,         // BlockSize
-        256,         // MPerBlock
-        16,          // NPerBlock
-        32,          // KPerBlock
-        8,           // AK1
-        8,           // BK1
-        16,          // MPerXdl
-        16,          // NPerXdl
-        4,           // MXdlPerWave
-        1,           // NXdlPerWave
-        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
-        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
-        2,           // ABlockTransferSrcVectorDim
-        8,           // ABlockTransferSrcScalarPerVector
-        8,           // ABlockTransferDstScalarPerVector_AK1
-        1,           // ABlockLdsExtraM
-        S<4, 16, 4>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
-        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
-        2,           // BBlockTransferSrcVectorDim
-        2,           // BBlockTransferSrcScalarPerVector
-        2,           // BBlockTransferDstScalarPerVector_BK1
-        1,           // BBlockLdsExtraN
-        4,           // CShuffleMXdlPerWavePerShuffle
-        1,           // CShuffleNXdlPerWavePerShuffle
-        S<1, 256, 1, 1>,
-        1>;
-#endif

 int main(int argc, char* argv[])
 {

@@ -185,6 +132,7 @@ int main(int argc, char* argv[])
         using InLayout   = ctc::G_NW_C;
         using WeiLayout  = ctc::G_K_X_C;
         using BiasLayout = ctc::G_NW_K;
+        using ResidualLayout = ctc::G_NW_K;
         using OutLayout  = ctc::G_NW_K;

         const auto in_g_n_c_wis_desc = HostTensorDescriptor(

@@ -214,6 +162,15 @@ int main(int argc, char* argv[])
                 0 // x
             });

+        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // k
+                1,             // c
+                0              // x
+            });
+
         const auto out_g_n_k_wos_desc = HostTensorDescriptor(
             {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
             {

@@ -223,15 +180,20 @@ int main(int argc, char* argv[])
                 conv_param.G_ * conv_param.K_ // wo
             });

-        return run_grouped_conv_fwd_bias<
-            1,
-            InDataType,
-            WeiDataType,
-            OutDataType,
-            InElementOp,
-            WeiElementOp,
-            OutElementOp,
-            DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+        return run_grouped_conv_fwd_bias_relu_add<1,
+                                                  InDataType,
+                                                  WeiDataType,
+                                                  CShuffleDataType,
+                                                  OutDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  DeviceGroupedConvNDFwdInstance<1,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
             do_verification,
             init_method,
             time_kernel,

@@ -239,6 +201,7 @@ int main(int argc, char* argv[])
             in_g_n_c_wis_desc,
             wei_g_k_c_xs_desc,
             bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
             out_g_n_k_wos_desc,
             in_element_op,
             wei_element_op,

@@ -249,6 +212,7 @@ int main(int argc, char* argv[])
         using InLayout   = ctc::G_NHW_C;
         using WeiLayout  = ctc::G_K_YX_C;
         using BiasLayout = ctc::G_NHW_K;
+        using ResidualLayout = ctc::G_NHW_K;
         using OutLayout  = ctc::G_NHW_K;

         const auto in_g_n_c_wis_desc = HostTensorDescriptor(

@@ -296,6 +260,20 @@ int main(int argc, char* argv[])
                 0 // wo
             });

+        const auto residual_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // ho
+                                     0              // wo
+                                 });
+
         const auto out_g_n_k_wos_desc = HostTensorDescriptor(
             {conv_param.G_,
              conv_param.N_,

@@ -311,15 +289,20 @@ int main(int argc, char* argv[])
                 conv_param.G_ * conv_param.K_ // wo
             });

-        return run_grouped_conv_fwd_bias<
-            2,
-            InDataType,
-            WeiDataType,
-            OutDataType,
-            InElementOp,
-            WeiElementOp,
-            OutElementOp,
-            DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+        return run_grouped_conv_fwd_bias_relu_add<2,
+                                                  InDataType,
+                                                  WeiDataType,
+                                                  CShuffleDataType,
+                                                  OutDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  DeviceGroupedConvNDFwdInstance<2,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
             do_verification,
             init_method,
             time_kernel,

@@ -327,6 +310,7 @@ int main(int argc, char* argv[])
             in_g_n_c_wis_desc,
             wei_g_k_c_xs_desc,
             bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
             out_g_n_k_wos_desc,
             in_element_op,
             wei_element_op,

@@ -337,6 +321,7 @@ int main(int argc, char* argv[])
         using InLayout   = ctc::G_NDHW_C;
         using WeiLayout  = ctc::G_K_ZYX_C;
         using BiasLayout = ctc::G_NDHW_K;
+        using ResidualLayout = ctc::G_NDHW_K;
         using OutLayout  = ctc::G_NDHW_K;

         const auto in_g_n_c_wis_desc = HostTensorDescriptor(

@@ -393,6 +378,22 @@ int main(int argc, char* argv[])
                 0 // x
             });

+        const auto residual_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1],
+                                  conv_param.output_spatial_lengths_[2]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // z
+                                     0,             // y
+                                     0              // x
+                                 });
+
         const auto out_g_n_k_wos_desc = HostTensorDescriptor(
             {conv_param.G_,
              conv_param.N_,

@@ -411,15 +412,20 @@ int main(int argc, char* argv[])
                 conv_param.G_ * conv_param.K_ // wo
             });

-        return run_grouped_conv_fwd_bias<
-            3,
-            InDataType,
-            WeiDataType,
-            OutDataType,
-            InElementOp,
-            WeiElementOp,
-            OutElementOp,
-            DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+        return run_grouped_conv_fwd_bias_relu_add<3,
+                                                  InDataType,
+                                                  WeiDataType,
+                                                  CShuffleDataType,
+                                                  OutDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  DeviceGroupedConvNDFwdInstance<3,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
             do_verification,
             init_method,
             time_kernel,

@@ -427,6 +433,7 @@ int main(int argc, char* argv[])
             in_g_n_c_wis_desc,
             wei_g_k_c_xs_desc,
             bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
             out_g_n_k_wos_desc,
             in_element_op,
             wei_element_op,

...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = float;
using WeiDataType = float;
using AccDataType = float;
using CShuffleDataType = float;
using BiasDataType = float;
using ResidualDataType = float;
using OutDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
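// AddReluAdd implements the fused epilogue out = ReLU(conv + bias) + residual,
// applied elementwise to every convolution output value.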
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 16, 1, 16>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
4>; // CDEBlockTransferScalarPerVector_NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
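// The numbers above follow from the group split and the usual output-size
// formula: per-group C = 384 / G = 192, per-group K = 512 / G = 256, and each
// output spatial length is
//   Wo = (Wi + pad_left + pad_right - dilation * (X - 1) - 1) / stride + 1
//      = (71 + 1 + 1 - 1 * (3 - 1) - 1) / 2 + 1 = 36.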
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_NW_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
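// Note on the descriptors below: the input and output buffers are physically
// packed as N x Wi x G x C and N x Wo x G x K (channel fastest); the strides
// express that memory through the [G, N, C, Wi] / [G, N, K, Wo] views that the
// G_NW_C / G_NW_K layout tags expect.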
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
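// bias and residual hold one value per (g, k) channel; the zero strides
// along n and wo broadcast that value across the batch and output width.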
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_NHW_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_NDHW_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using AccDataType = int32_t;
using CShuffleDataType = int8_t;
using BiasDataType = int8_t;
using ResidualDataType = int8_t;
using OutDataType = int8_t;
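// int8 inputs and weights accumulate in int32 (AccDataType); the shuffle
// stage and the fused outputs stay in int8.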
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 64, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
16>; // CDEBlockTransferScalarPerVector_NPerBlock
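// Compared with the fp32 variant, this instance keeps vector loads at
// 16 bytes (16 x int8 vs 4 x float per access) and raises KPerBlock from
// 16 to 64, so each block tile still covers the same number of bytes along K.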
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
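// Same default problem as the fp32 example above; only the data types and
// the block-tile/vector parameters differ.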
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_NW_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_NHW_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_NDHW_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
# TODO: add example batched_gemm_gemm_xdl_fp16
add_example_executable(example_batched_gemm_softmax_gemm_xdl_fp16 batched_gemm_softmax_gemm_xdl_fp16.cpp)