merge develop

1dbdab56 · Jing Zhang · d2e49b23 · bac7df8f · 1dbdab56 · 1dbdab56
Commit 1dbdab56 authored Aug 18, 2022 by Jing Zhang
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 enable_testing()
 set(ROCM_SYMLINK_LIBS OFF)
-find_package(ROCM 0.8 REQUIRED PATHS /opt/rocm)
+find_package(ROCM REQUIRED PATHS /opt/rocm)
 include(ROCMInstallTargets)
 include(ROCMPackageConfigHelpers)

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -18,6 +18,89 @@ def runShell(String command){
    return (output != "")
 }
+/**
+ * Compose the docker image reference shared by the CI stages:
+ * "<env.MIOPEN_IMAGE_URL>:composable_kernels_<params.COMPILER_VERSION>".
+ */
+def getDockerImageName(){
+    return "${env.MIOPEN_IMAGE_URL}:composable_kernels_${params.COMPILER_VERSION}"
+}
+/**
+ * Pull the prebuilt CI docker image and return it together with its name.
+ *
+ * Recognised conf keys (defaults): "prefixpath" ("/opt/rocm"),
+ * "gpu_arch" ("gfx908"), "no_cache" (false).
+ *
+ * Side effects: sets env.DOCKER_BUILDKIT and, when env.CCACHE_HOST is set,
+ * env.CCACHE_DIR and env.CCACHE_SECONDARY_STORAGE.
+ *
+ * @return two-element list: [image object, image name]
+ * Aborts the build through error() when pulling the image throws.
+ */
+def getDockerImage(Map conf=[:]){
+    env.DOCKER_BUILDKIT=1
+    // NOTE(review): Groovy's Map.get(key, default) stores the default into conf when the
+    // key is missing, so these lookups are kept even where the local is not used below.
+    def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
+    def gpu_arch = conf.get("gpu_arch", "gfx908") // prebuilt dockers should have all the architectures enabled so one image can be used for all stages
+    def no_cache = conf.get("no_cache", false)
+    // The image is pulled, not built, in this function; dockerArgs is only logged here.
+    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
+    if(env.CCACHE_HOST)
+    {
+        // Probe the redis server behind ccache. The backslashes are doubled so that the
+        // shell's printf receives the \r\n escapes (same form as buildDocker) instead of
+        // Groovy embedding raw CR/LF characters into the script text.
+        def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
+        if(check_host == "+PONG")
+        {
+            echo "FOUND CCACHE SERVER: ${env.CCACHE_HOST}"
+        }
+        else
+        {
+            echo "CCACHE SERVER: ${env.CCACHE_HOST} NOT FOUND, got ${check_host} response"
+        }
+        // The ccache settings are applied regardless of the probe result.
+        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
+        env.CCACHE_DIR = """/tmp/ccache_store"""
+        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
+    }
+    if(no_cache)
+    {
+        dockerArgs = dockerArgs + " --no-cache "
+    }
+    echo "Docker Args: ${dockerArgs}"
+    def image = getDockerImageName()
+    // Check that the image exists by pulling it
+    def retimage
+    try
+    {
+        echo "Pulling down image: ${image}"
+        retimage = docker.image("${image}")
+        retimage.pull()
+    }
+    catch(Exception ex)
+    {
+        error "Unable to locate image: ${image}"
+    }
+    return [retimage, image]
+}
+/**
+ * Build and push the CI docker image unless it is already present in the registry.
+ *
+ * @param install_prefix value passed to the docker build as --build-arg PREFIX
+ *
+ * Side effects: checks out the SCM, sets env.DOCKER_BUILDKIT and, when
+ * env.CCACHE_HOST is set, env.CCACHE_DIR and env.CCACHE_SECONDARY_STORAGE.
+ */
+def buildDocker(install_prefix){
+    show_node_info()
+    env.DOCKER_BUILDKIT=1
+    checkout scm
+    def image_name = getDockerImageName()
+    echo "Building Docker for ${image_name}"
+    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' "
+    if(env.CCACHE_HOST)
+    {
+        // Probe the redis server behind ccache; the result is only logged.
+        def check_host = sh(script:"""(printf "PING\\r\\n";) | nc  -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
+        if(check_host == "+PONG")
+        {
+            echo "FOUND CCACHE SERVER: ${env.CCACHE_HOST}"
+        }
+        else
+        {
+            echo "CCACHE SERVER: ${env.CCACHE_HOST} NOT FOUND, got ${check_host} response"
+        }
+        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
+        env.CCACHE_DIR = """/tmp/ccache_store"""
+        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
+    }
+    echo "Build Args: ${dockerArgs}"
+    try{
+        echo "Checking for image: ${image_name}"
+        // A failing manifest lookup throws and routes to the build path below.
+        sh "docker manifest inspect --insecure ${image_name}"
+        echo "Image: ${image_name} found!! Skipping building image"
+    }
+    catch(Exception ex){
+        echo "Unable to locate image: ${image_name}. Building image now"
+        // Declared locally so the handle does not leak into the script-level binding.
+        def retimage = docker.build("${image_name}", dockerArgs + ' .')
+        retimage.push()
+    }
+}
 def cmake_build(Map conf=[:]){
    def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
@@ -100,9 +183,10 @@ def buildHipClangJob(Map conf=[:]){
        // def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
        if (conf.get("enforce_xnack_on", false)) {
-            dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+            dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+        //def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
        if (params.COMPILER_VERSION != "release"){
            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
        }
@@ -113,7 +197,8 @@ def buildHipClangJob(Map conf=[:]){
        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
            try {
-                retimage = docker.build("${image}", dockerArgs + '.')
+                //retimage = docker.build("${image}", dockerArgs + '.')
+                (retimage, image) = getDockerImage(conf)
                withDockerContainer(image: image, args: dockerOpts) {
                    timeout(time: 5, unit: 'MINUTES'){
                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
@@ -190,9 +275,9 @@ def runCKProfiler(Map conf=[:]){
        // def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
        if (conf.get("enforce_xnack_on", false)) {
-            dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+            dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
        if (params.COMPILER_VERSION != "release"){
            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
        }
@@ -202,7 +287,8 @@ def runCKProfiler(Map conf=[:]){
        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
            try {
-                retimage = docker.build("${image}", dockerArgs + '.')
+                //retimage = docker.build("${image}", dockerArgs + '.')
+                (retimage, image) = getDockerImage(conf)
                withDockerContainer(image: image, args: dockerOpts) {
                    timeout(time: 5, unit: 'MINUTES'){
                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
@@ -248,7 +334,7 @@ def runCKProfiler(Map conf=[:]){
                            archiveArtifacts "perf_batched_gemm_${gpu_arch}.log"
                            archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
                            archiveArtifacts "perf_conv_fwd_${gpu_arch}.log"
-                            archiveArtifacts "perf_conv_bwd_${gpu_arch}.log"
+                            archiveArtifacts "perf_conv_bwd_data_${gpu_arch}.log"
                            archiveArtifacts "perf_gemm_bilinear_${gpu_arch}.log"
                            archiveArtifacts "perf_reduction_${gpu_arch}.log"
                           // stash perf files to master
@@ -258,7 +344,7 @@ def runCKProfiler(Map conf=[:]){
                            stash name: "perf_batched_gemm_${gpu_arch}.log"
                            stash name: "perf_grouped_gemm_${gpu_arch}.log"
                            stash name: "perf_conv_fwd_${gpu_arch}.log"
-                            stash name: "perf_conv_bwd_${gpu_arch}.log"
+                            stash name: "perf_conv_bwd_data_${gpu_arch}.log"
                            stash name: "perf_gemm_bilinear_${gpu_arch}.log"
                            stash name: "perf_reduction_${gpu_arch}.log"
                            //we will process results on the master node
@@ -308,16 +394,17 @@ def process_results(Map conf=[:]){
    // Jenkins is complaining about the render group 
    def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
    if (conf.get("enforce_xnack_on", false)) {
-        dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+        dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
    }
-    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='release' "
    def variant = env.STAGE_NAME
    def retimage
    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
        try {
-            retimage = docker.build("${image}", dockerArgs + '.')
+            //retimage = docker.build("${image}", dockerArgs + '.')
+            (retimage, image) = getDockerImage(conf)
        }
        catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
            echo "The job was cancelled or aborted"
@@ -337,7 +424,7 @@ def process_results(Map conf=[:]){
                        unstash "perf_batched_gemm_${gpu_arch}.log"
                        unstash "perf_grouped_gemm_${gpu_arch}.log"
                        unstash "perf_conv_fwd_${gpu_arch}.log"
-                        unstash "perf_conv_bwd${gpu_arch}.log"
+                        unstash "perf_conv_bwd_data_${gpu_arch}.log"
                        unstash "perf_gemm_bilinear_${gpu_arch}.log"
                        unstash "perf_reduction_${gpu_arch}.log"
                        sh "./process_qa_data.sh ${gpu_arch}"
@@ -372,14 +459,22 @@ pipeline {
        parallelsAlwaysFailFast()
    }
    parameters {
+        booleanParam(
+            name: "BUILD_DOCKER",
+            defaultValue: true,
+            description: "Force building docker image (default: true)")
        string(
            name: 'COMPILER_VERSION', 
            defaultValue: 'ck-9110', 
-            description: 'Specify which version of compiler to use: ck-9110 (default), release, or amd-mainline-open.')
+            description: 'Specify which version of compiler to use: ck-9110 (default), release, or amd-stg-open.')
        booleanParam(
            name: "RUN_FULL_QA",
            defaultValue: false,
            description: "Select whether to run small set of performance tests (default) or full QA")
+        booleanParam(
+            name: "TEST_NODE_PERFORMANCE",
+            defaultValue: false,
+            description: "Test the node GPU performance (default: false)")
    }
    environment{
        dbuser = "${dbuser}"
@@ -393,7 +488,24 @@ pipeline {
        DOCKER_BUILDKIT = "1"
    }
    stages{
+        stage("Build Docker"){
+            when {
+                expression { params.BUILD_DOCKER.toBoolean() }
+            }
+            parallel{
+                stage('Docker /opt/rocm'){
+                    agent{ label rocmnode("nogpu") }
+                    steps{
+                        buildDocker('/opt/rocm')
+                    }
+                }
+            }
+        }
        stage("Static checks") {
+            when {
+                beforeAgent true
+                expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
+            }
            parallel{
                // enable after we move from hipcc to hip-clang
                // stage('Tidy') {
@@ -427,6 +539,10 @@ pipeline {
        }
 		stage("Tests")
        {
+            when {
+                beforeAgent true
+                expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
+            }
            parallel
            {
                stage("Run Tests: gfx908")
@@ -457,6 +573,10 @@ pipeline {
        }
        stage("Client App")
        {
+            when {
+                beforeAgent true
+                expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
+            }
            parallel
            {
                stage("Run Client App")
@@ -480,7 +600,7 @@ pipeline {
                {
                    when {
                        beforeAgent true
-                        expression { !params.RUN_FULL_QA.toBoolean() }
+                        expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
                    }
                    agent{ label rocmnode("gfx908")}
                    environment{
@@ -494,7 +614,7 @@ pipeline {
                {
                    when {
                        beforeAgent true
-                        expression { params.RUN_FULL_QA.toBoolean() }
+                        expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
                    }
                    agent{ label rocmnode("gfx90a")}
                    environment{
@@ -513,7 +633,7 @@ pipeline {
                stage("Process results for gfx908"){
                    when {
                        beforeAgent true
-                        expression { !params.RUN_FULL_QA.toBoolean() }
+                        expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
                    }
                    agent { label 'mici' }
                    steps{
@@ -523,7 +643,7 @@ pipeline {
                stage("Process results for gfx90a"){
                    when {
                        beforeAgent true
-                        expression { params.RUN_FULL_QA.toBoolean() }
+                        expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
                    }
                    agent { label 'mici' }
                    steps{

--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
@@ -128,11 +128,14 @@ bool RunDeviceNormalize2D(normalize_op_ptr& p_op,
    std::array<void*, 1> output      = {p_y};
    auto normalize_functor           = ck::tensor_operation::element_wise::Normalize{};
-    auto argument_ptr = p_op->MakeArgumentPointer(input,
+    std::array<ck::index_t, 2> xyLengths = {M, N};
+    std::array<ck::index_t, 2> xyStrides = {StrideX, 1};
+    auto argument_ptr = p_op->MakeArgumentPointer(xyLengths,
+                                                  {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
+                                                  {xyStrides},
+                                                  input,
                                                  output,
-                                                  {M, N},
-                                                  {{StrideX, 1}, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
-                                                  {{StrideX, 1}},
                                                  ck::tensor_operation::element_wise::Normalize{});
    if(p_op->IsSupportedArgument(argument_ptr.get()))

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -4,5 +4,6 @@ add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
 add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
 add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
 add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
+add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
 # FIXME: re-enable this example as test when SWDEV-335738 is fixed
 add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
--- a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
 add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
+add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
+add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = BF16;
+using D1DataType       = BF16;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = BF16;
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+// clang-format on
+#include "run_gemm_add_add_fastgelu_example.inc"
+int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); } // exit status is 0 when the example runner returns a truthy value, 1 otherwise
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstddef>
 #include <iostream>
-#include <numeric>
+#include <stdexcept>
-#include <initializer_list>
+#include <string>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -12,11 +12,12 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/check_err.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -61,189 +62,6 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 // clang-format on
-int main(int argc, char* argv[])
+#include "run_gemm_add_add_fastgelu_example.inc"
-{
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
-    // GEMM shape
-    ck::index_t M = 3840;
-    ck::index_t N = 4096;
-    ck::index_t K = 4096;
-    ck::index_t StrideA  = 4096;
-    ck::index_t StrideB  = 4096;
-    ck::index_t StrideD0 = 0;
-    ck::index_t StrideD1 = 4096;
-    ck::index_t StrideE  = 4096;
-    if(argc == 1)
-    {
-        // use default case
-    }
-    else if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 12)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-        M = std::stoi(argv[4]);
-        N = std::stoi(argv[5]);
-        K = std::stoi(argv[6]);
-        StrideA  = std::stoi(argv[7]);
-        StrideB  = std::stoi(argv[8]);
-        StrideD0 = std::stoi(argv[9]);
-        StrideD1 = std::stoi(argv[10]);
-        StrideE  = std::stoi(argv[11]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
-               "StrideE\n");
-        exit(0);
-    }
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
-            }
-            else
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
-            }
-        };
-    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
-    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
-    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
-    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
-    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
-    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
-    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
-        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
-        break;
-    default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
-        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
-    }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    d0_device_buf.ToDevice(d0_m_n.mData.data());
-    d1_device_buf.ToDevice(d1_m_n.mData.data());
-    auto a_element_op   = AElementOp{};
-    auto b_element_op   = BElementOp{};
-    auto cde_element_op = CDEElementOp{};
-    // do GEMM
-    auto device_op = DeviceOpInstance{};
-    auto invoker   = device_op.MakeInvoker();
-    auto argument =
-        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
-                               b_device_buf.GetDeviceBuffer(),
-                               std::array<const void*, 2>{d0_device_buf.GetDeviceBuffer(),
-                                                          d1_device_buf.GetDeviceBuffer()},
-                               e_device_buf.GetDeviceBuffer(),
-                               M,
-                               N,
-                               K,
-                               StrideA,
-                               StrideB,
-                               std::array<ck::index_t, 2>{StrideD0, StrideD1},
-                               StrideE,
-                               a_element_op,
-                               b_element_op,
-                               cde_element_op);
-    if(!device_op.IsSupportedArgument(argument))
-    {
-        throw std::runtime_error("wrong! this device_op instance does not support this problem");
-    }
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    std::size_t flop      = std::size_t(2) * M * N * K;
-    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                            sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
-                            sizeof(EDataType) * M * N;
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << device_op.GetTypeString() << std::endl;
-    if(do_verification)
-    {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor(
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
-        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                                                BDataType,
-                                                                                AccDataType,
-                                                                                AccDataType,
-                                                                                AElementOp,
-                                                                                BElementOp,
-                                                                                PassThrough>;
-        auto ref_gemm    = ReferenceGemmInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
-        auto ref_argument =
-            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
-        ref_invoker.Run(ref_argument);
-        for(int m = 0; m < M; ++m)
-        {
-            for(int n = 0; n < N; ++n)
-            {
-                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
-            }
-        }
-        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
-    }
-    return 0;
+int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
-}
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+template <ck::index_t... Is> // S<...>: shorthand for a compile-time ck::Sequence of indices
+using S = ck::Sequence<Is...>;
+using F16 = ck::half_t; // NOTE(review): not referenced anywhere in this fp32 configuration - confirm it is needed
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+using ADataType        = F32; // fp32 variant: inputs, accumulator, shuffle buffer and output are all float
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>; // the two extra "D" inputs, in (D0, D1) order
+using EDataType        = F32;
+using ALayout  = Row; // A is row-major, B is column-major; D0, D1 and E are row-major
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>; // same order as DsDataType
+using ELayout  = Row;
+using AElementOp   = PassThrough; // A and B are passed through unchanged
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAddFastGelu; // element-wise op applied to (gemm result, D0, D1) to produce E
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle // legend rows below name each positional template argument; the value row follows
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+// clang-format on
+#include "run_gemm_add_add_fastgelu_example.inc"
+// Process entry point for the fp32 example. The shared driver reports success as
+// `true`, so the result is mapped onto the conventional exit status (0 on success).
+int main(int argc, char* argv[])
+{
+    const bool passed = run_gemm_add_add_fastgelu_example(argc, argv);
+    return passed ? 0 : 1;
+}
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+template <ck::index_t... Is> // S<...>: shorthand for a compile-time ck::Sequence of indices
+using S = ck::Sequence<Is...>;
+using I8  = int8_t;
+using I32 = int32_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+using ADataType        = I8; // int8 variant: int8 inputs and output, int32 accumulator and shuffle buffer
+using BDataType        = I8;
+using AccDataType      = I32;
+using CShuffleDataType = I32;
+using D0DataType       = I8;
+using D1DataType       = I8;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>; // the two extra "D" inputs, in (D0, D1) order
+using EDataType        = I8;
+using ALayout  = Row; // A is row-major, B is column-major; D0, D1 and E are row-major
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>; // same order as DsDataType
+using ELayout  = Row;
+using AElementOp   = PassThrough; // A and B are passed through unchanged
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAddFastGelu; // element-wise op applied to (gemm result, D0, D1) to produce E
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle // legend rows below name each positional template argument; the value row follows
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+// clang-format on
+#include "run_gemm_add_add_fastgelu_example.inc"
+// Process entry point for the int8 example. The shared driver reports success as
+// `true`, so the result is mapped onto the conventional exit status (0 on success).
+int main(int argc, char* argv[])
+{
+    const bool passed = run_gemm_add_add_fastgelu_example(argc, argv);
+    return passed ? 0 : 1;
+}
--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+#pragma once
+// Default GEMM problem: the M/N/K extents followed by one stride per tensor.
+// Member order is significant: run_gemm_add_add_fastgelu() unpacks this struct
+// positionally with a structured binding.
+struct ProblemSize final
+{
+    ck::index_t M{3840};
+    ck::index_t N{4096};
+    ck::index_t K{4096};
+    ck::index_t StrideA{4096};
+    ck::index_t StrideB{4096};
+    ck::index_t StrideD0{0}; // D0 defaults to a zero stride
+    ck::index_t StrideD1{4096};
+    ck::index_t StrideE{4096};
+};
+// Run-time switches of the example, overridable from the command line.
+struct ExecutionConfig final
+{
+    bool do_verification{true}; // compare the device result against the host reference
+    int init_method{1};         // 0 = no init, 1 = integer values, anything else = decimal values
+    bool time_kernel{false};    // forwarded to StreamConfig when the kernel is launched
+};
+/// Runs one GEMM + Add + Add + FastGelu problem on the device and optionally verifies it.
+///
+/// Builds host tensors for A, B, D0, D1 and E, uploads the inputs, launches
+/// `DeviceOpInstance`, prints the measured performance and - when
+/// `config.do_verification` is set - recomputes the result on the host with
+/// `ReferenceGemm` followed by an element-wise application of `CDEElementOp`.
+///
+/// @param problem_size  GEMM extents (M, N, K) and the stride of every tensor.
+/// @param config        verification / initialization / timing switches.
+/// @return `true` when verification is disabled or `ck::utils::check_err` accepts the
+///         device result; `false` otherwise.
+/// @throws std::runtime_error if `DeviceOpInstance` does not support the argument.
+bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+    // Positional unpacking: relies on the member order of ProblemSize.
+    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
+    // Builds a 2-D host descriptor whose unit stride is on the column index for
+    // row-major layouts and on the row index otherwise.
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+    // Fill the inputs: 0 leaves them untouched, 1 uses GeneratorTensor_2 with (-5, 5),
+    // any other value uses GeneratorTensor_3 with the decimal bounds below.
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+    }
+    // Device buffers are sized from each descriptor's element space.
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               {StrideD0, StrideD1},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+    std::size_t flop = 2_uz * M * N * K;
+    // With the default zero stride D0 contributes only N distinct elements; once a
+    // non-zero StrideD0 is supplied all M * N elements are read, so count them all.
+    // NOTE(review): the zero-stride case assumes a row-major D0, as in the original
+    // accounting - confirm if a column-major D0 is ever configured.
+    const std::size_t d0_elements =
+        (StrideD0 == 0) ? static_cast<std::size_t>(N) : static_cast<std::size_t>(M) * N;
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                            sizeof(D0DataType) * d0_elements + sizeof(D1DataType) * M * N +
+                            sizeof(EDataType) * M * N;
+    // ave_time is reported in milliseconds (see the "ms" label below), hence the
+    // 1e9 / 1e6 divisors for TFlops and GB/s.
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << device_op.GetTypeString() << std::endl;
+    if(config.do_verification)
+    {
+        // Host reference: plain GEMM into an AccDataType buffer, then the fused
+        // element-wise op applied per output element.
+        Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+        ref_invoker.Run(ref_argument);
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+            }
+        }
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+    }
+    return true;
+}
+/// Parses the command line and runs the example.
+///
+/// Accepted forms:
+///   - no arguments: default problem size and configuration;
+///   - 3 arguments:  verification, initialization method, time-kernel flag;
+///   - 11 arguments: the three above followed by M, N, K, StrideA, StrideB,
+///                   StrideD0, StrideD1, StrideE.
+/// Any other argument count prints the usage text to std::cerr.
+///
+/// @return the result of run_gemm_add_add_fastgelu(); `true` when only the usage
+///         text was printed.
+/// @throws std::invalid_argument / std::out_of_range from std::stoi on malformed numbers.
+bool run_gemm_add_add_fastgelu_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+    if(argc == 4 || argc == 12)
+    {
+        // The first three arguments are shared by both explicit forms.
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        if(argc == 12)
+        {
+            problem_size.M        = std::stoi(argv[4]);
+            problem_size.N        = std::stoi(argv[5]);
+            problem_size.K        = std::stoi(argv[6]);
+            problem_size.StrideA  = std::stoi(argv[7]);
+            problem_size.StrideB  = std::stoi(argv[8]);
+            problem_size.StrideD0 = std::stoi(argv[9]);
+            problem_size.StrideD1 = std::stoi(argv[10]);
+            problem_size.StrideE  = std::stoi(argv[11]);
+        }
+    }
+    else if(argc != 1)
+    {
+        // Eight problem-size values occupy argv[4]..argv[11].
+        std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
+                  << std::endl
+                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
+                     "StrideE"
+                  << std::endl;
+        // NOTE(review): a usage error is reported as success (the callers negate this
+        // into exit status 0); kept as-is for backward compatibility - confirm intent.
+        return true;
+    }
+    return run_gemm_add_add_fastgelu(problem_size, config);
+}
--- a/example/12_reduce/CMakeLists.txt
+++ b/example/12_reduce/CMakeLists.txt
 add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
+add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp)
 add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -2,20 +2,41 @@
 ## Run ```example_reduce_blockwise```
 ```bash
-# -D <xxx> : input 4-d tensor lengths
+# -D <xxx> : input 3d/4d/5d tensor lengths
+# -R <xxx> : reduce dimension ids
 # -v <x> :   verification (0=no, 1=yes)
-#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64, 7: int4)
-#arg2: time kernel (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
-./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
+#arg3: time kernel (0=no, 1=yes)
+./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1
 ```
 Result
 ```
-./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
+./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1
-launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
+launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} 
 Warm up 1 time
 Start running 10 times...
-Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
+Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
+```
+## Run ```example_reduce_multiblock_atomic_add```
+```bash
+# -D <xxx> : input 3d/4d/5d tensor lengths
+# -R <xxx> : reduce dimension ids
+# -v <x> :   verification (0=no, 1=yes)
+#arg1: data type (0: fp32, 1: fp64)
+#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0
+```
+Result
+```
+./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0
+Perf: 0 ms, inf GB/s, DeviceReduceMultiBlock<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
+echo $?
+0
 ```
 # Instructions for ```example_reduce_blockwise_two_call```

--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -2,64 +2,17 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
-#include <numeric>
 #include <initializer_list>
 #include <cstdlib>
 #include <getopt.h>
-#include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "reduce_blockwise_impl.hpp"
-#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "reduce_example_common.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_reduction.hpp"
 using namespace ck;
 using namespace ck::tensor_operation::device;
-using InDataType  = ck::half_t;
-using OutDataType = ck::half_t;
-using AccDataType = float;
-constexpr int Rank         = 4;
-constexpr int NumReduceDim = 3;
-constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
-constexpr bool PropagateNan         = true;
-constexpr bool OutputIndex          = false;
-using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
-using InElementwiseOperation =
-    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
-using AccElementwiseOperation =
-    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
-using DeviceReduceInstance = DeviceReduceMultiBlock<InDataType,
-                                                    AccDataType,
-                                                    OutDataType,
-                                                    Rank,
-                                                    NumReduceDim,
-                                                    ReduceOperation,
-                                                    InElementwiseOperation,
-                                                    AccElementwiseOperation,
-                                                    InMemoryDataOperationEnum::Set,
-                                                    PropagateNan,
-                                                    OutputIndex,
-                                                    false, // HaveIndexInputIfOutputIndex
-                                                    256,
-                                                    4,
-                                                    64,
-                                                    1,
-                                                    1,
-                                                    0,
-                                                    1,
-                                                    1>;
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"verify", required_argument, nullptr, 'v'},
                                       {"help", no_argument, nullptr, '?'},
@@ -72,10 +25,12 @@ class SimpleAppArgs
    public:
    std::vector<size_t> inLengths = {16, 64, 32, 960};
+    std::vector<int> reduceDims   = {0, 1, 2};
    std::vector<float> scales     = {1.0f, 0.0f};
    bool do_verification = true;
-    int init_method      = 1;
+    int data_type        = 1;
+    int init_method      = 2;
    bool time_kernel     = true;
    public:
@@ -84,13 +39,17 @@ class SimpleAppArgs
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
                  << std::endl;
+        std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
+                  << std::endl;
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                     "comparing with the host-based reduction"
                  << std::endl;
-        std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
+        std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64, 7: int4)"
+                  << std::endl;
+        std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
                     "value, 3=decimal value)"
                  << std::endl;
-        std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
+        std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl;
    };
    int processArgs(int argc, char* argv[])
@@ -101,7 +60,7 @@ class SimpleAppArgs
        while(1)
        {
-            ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index);
+            ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
@@ -112,6 +71,12 @@ class SimpleAppArgs
                inLengths = getTypeValuesFromString<size_t>(optarg);
                break;
+            case 'R':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                reduceDims = getTypeValuesFromString<int>(optarg);
+                break;
            case 'v':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
@@ -129,9 +94,12 @@ class SimpleAppArgs
            };
        };
-        if(optind + 2 > argc)
+        if(optind + 3 > argc)
+        {
            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
+        };
+        data_type   = std::atoi(argv[optind++]);
        init_method = std::atoi(argv[optind++]);
        time_kernel = static_cast<bool>(std::atoi(argv[optind]));
@@ -145,198 +113,152 @@ class SimpleAppArgs
    };
 };
-int main(int argc, char* argv[])
+template <typename InOutDataType,
+          typename AccDataType,
+          ReduceTensorOp ReduceOpId,
+          index_t PropagateNan,
+          index_t OutputIndex>
+bool reduce_blockwise_test(bool do_verification,
+                           int init_method,
+                           bool time_kernel,
+                           const std::vector<size_t>& inLengths,
+                           const std::vector<int>& reduceDims,
+                           float alpha,
+                           float beta)
 {
-    const std::vector<int> reduceDims{0, 1, 2};
+    bool matched = false;
-    const std::vector<int> invariantDims{3};
+    int result   = 0;
-    SimpleAppArgs args;
+    const auto tuple_object = reduce_shape_instances{};
-    if(argc > 1)
+    static_for<0, std::tuple_size<reduce_shape_instances>::value, 1>{}([&](auto i) {
-    {
+        if(matched)
-        if(args.processArgs(argc, argv) < 0)
+            return;
-            return (-1);
-    };
-    constexpr bool op_support_indices =
-        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
-         ReduceOpId == ReduceTensorOp::AMAX);
-    // if input is half type, no reason to use float for indiced reduction operation and must use
-    // float for non-indiced reduction operation for accuracy
-    constexpr bool invalid_reduce_1 =
-        std::is_same<InDataType, ck::half_t>::value &&
-        ((!op_support_indices && !std::is_same<AccDataType, float>::value) ||
-         (op_support_indices && !std::is_same<AccDataType, ck::half_t>::value));
-    // if input is float type, no reason to use double for indiced reduction operation
-    constexpr bool invalid_reduce_2 =
-        std::is_same<InDataType, float>::value &&
-        (op_support_indices && !std::is_same<AccDataType, float>::value);
-    // indices option can only be used when it is really needed
-    constexpr bool invalid_reduce_3 = (!op_support_indices && OutputIndex);
-    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
+        using ShapeType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;
-    if constexpr(invalid_reduce)
+        if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
-        std::cout << "Reduction setting is not supported, exiting!" << std::endl;
+            return;
-    Tensor<InDataType> in(args.inLengths);
+        result = reduce_blockwise_impl<InOutDataType,
+                                       AccDataType,
+                                       ReduceOpId,
+                                       ShapeType::Rank_,
+                                       ShapeType::NumReduceDim_,
+                                       PropagateNan,
+                                       OutputIndex>(
+            do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta);
-    std::vector<size_t> outLengths;
+        matched = true;
+    });
-    if(invariantDims.empty())
+    return (result == 0) ? true : false;
-        outLengths.push_back(1);
+};
-    else
-        for(auto dim : invariantDims)
-            outLengths.push_back(args.inLengths[dim]);
-    Tensor<OutDataType> out_ref(outLengths);
-    Tensor<OutDataType> out(outLengths);
-    Tensor<int> out_indices_ref(outLengths);
-    Tensor<int> out_indices(outLengths);
-    auto inStrides  = in.mDesc.GetStrides();
+constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
-    auto outStrides = out.mDesc.GetStrides();
+constexpr bool PropagateNan         = true;
+constexpr bool OutputIndex          = false;
-    size_t invariant_total_length = out.mDesc.GetElementSize();
+int main(int argc, char* argv[])
-    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
+{
+    bool pass = true;
-    float alpha = args.scales[0];
+    if(argc > 1)
-    float beta  = args.scales[1];
+    {
+        SimpleAppArgs arg;
-    std::size_t num_thread = 1;
+        if(arg.processArgs(argc, argv) < 0)
+            return (-1);
-    if(args.do_verification)
+        if(arg.data_type == 0)
-    {
-        switch(args.init_method)
        {
-        case 0: break;
+            pass = reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
-        case 1:
+                arg.do_verification,
-            in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
+                arg.init_method,
-            if(beta != 0.0f)
+                arg.time_kernel,
-                out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
+                arg.inLengths,
-            break;
+                arg.reduceDims,
-        case 2:
+                arg.scales[0],
-            in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
+                arg.scales[1]);
-            if(beta != 0.0f)
-                out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-            break;
-        default:
-            in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-            if(beta != 0.0f)
-                out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
        }
+        else if(arg.data_type == 1)
-        if(beta != 0.0f)
-            for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
-                out.mData[i] = out_ref.mData[i];
-    };
-    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
-    in_dev.ToDevice(in.mData.data());
-    if(beta != 0.0f)
-        out_dev.ToDevice(out.mData.data());
-    size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
-    DeviceMem out_index_dev(indicesSizeInBytes);
-    InElementwiseOperation in_elementwise_op;
-    AccElementwiseOperation acc_elementwise_op;
-    std::tie(in_elementwise_op, acc_elementwise_op) =
-        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
-            static_cast<int32_t>(reduce_total_length));
-    if(args.do_verification)
-    {
-        ReductionHost<InDataType,
-                      AccDataType,
-                      OutDataType,
-                      ReduceOperation,
-                      InElementwiseOperation,
-                      AccElementwiseOperation,
-                      Rank,
-                      NumReduceDim,
-                      PropagateNan,
-                      OutputIndex>
-            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-        hostReduce.Run(alpha,
-                       in.mData.data(),
-                       beta,
-                       out_ref.mData.data(),
-                       out_indices_ref.mData.data(),
-                       in_elementwise_op,
-                       acc_elementwise_op);
-    };
-    std::vector<ck::index_t> i_inLengths;
-    std::vector<ck::index_t> i_inStrides;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
-    i_inLengths.assign(args.inLengths.begin(), args.inLengths.end());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
-    auto reduce = DeviceReduceInstance{};
-    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
-                                                   i_inStrides,
-                                                   i_outLengths,
-                                                   i_outStrides,
-                                                   reduceDims,
-                                                   alpha,
-                                                   beta,
-                                                   in_dev.GetDeviceBuffer(),
-                                                   nullptr,
-                                                   out_dev.GetDeviceBuffer(),
-                                                   out_index_dev.GetDeviceBuffer(),
-                                                   in_elementwise_op,
-                                                   acc_elementwise_op);
-    if(!reduce.IsSupportedArgument(argument_ptr.get()))
-    {
-        std::cout
-            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
-            << std::endl;
-    };
-    std::string reduce_name = reduce.GetTypeString();
-    auto invoker_ptr = reduce.MakeInvokerPointer();
-    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});
-    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
-                            invariant_total_length * sizeof(OutDataType);
-    float gb_per_sec = num_bytes / 1.E6 / avg_time;
-    std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
-              << std::endl;
-    bool pass = true;
-    if(args.do_verification)
-    {
-        out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
-        if(OutputIndex)
        {
-            out_index_dev.FromDevice(out_indices.mData.data());
+            pass = reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
-            pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+                arg.do_verification,
-        };
+                arg.init_method,
+                arg.time_kernel,
+                arg.inLengths,
+                arg.reduceDims,
+                arg.scales[0],
+                arg.scales[1]);
+        }
+        else if(arg.data_type == 3)
+        {
+            pass = reduce_blockwise_test<int8_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                arg.do_verification,
+                arg.init_method,
+                arg.time_kernel,
+                arg.inLengths,
+                arg.reduceDims,
+                arg.scales[0],
+                arg.scales[1]);
+        }
+        else if(arg.data_type == 5)
+        {
+            pass = reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                arg.do_verification,
+                arg.init_method,
+                arg.time_kernel,
+                arg.inLengths,
+                arg.reduceDims,
+                arg.scales[0],
+                arg.scales[1]);
+        }
+        else if(arg.data_type == 6)
+        {
+            pass = reduce_blockwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
+                arg.do_verification,
+                arg.init_method,
+                arg.time_kernel,
+                arg.inLengths,
+                arg.reduceDims,
+                arg.scales[0],
+                arg.scales[1]);
+        }
+    }
+    else
+    {
+        // for testing half_t
+        pass =
+            pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+        // for testing float
+        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+        // for testing double
+        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+        // for testing bhalf_t
+        pass = pass &&
+               reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                   true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+        // for testing int8_t
+        pass =
+            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+        // for testing 3D input
+        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                           true, 2, true, {16, 64, 960}, {0, 1}, 1.0f, 0.0f);
+        // for testing 5D input
+        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                           true, 2, true, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f);
    };
    return (pass ? 0 : 1);
-}
+};
--- a/example/12_reduce/reduce_blockwise_impl.hpp
+++ b/example/12_reduce/reduce_blockwise_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_reduction.hpp"
+#include "reduce_example_common.hpp"
+// Runs one reduction of an input tensor on the device through DeviceReduceMultiBlock (configured
+// with InMemoryDataOperationEnum::Set) and, when requested, compares the device output with the
+// result of the host-side ReductionHost.
+//
+// Template parameters:
+//   InOutDataType - element type of both the input and the output tensor
+//   AccDataType   - accumulation type handed to the device and host reductions
+//   ReduceOpId    - reduction operation
+//   Rank          - number of dimensions of the input tensor
+//   NumReduceDim  - number of dimensions being reduced
+//   PropagateNan  - forwarded to the device and host reduction templates
+//   OutputIndex   - when true, an index buffer is also allocated, read back and verified
+//
+// Parameters:
+//   do_verification - generate tensor values, run the host reference and compare the results
+//   init_method     - 0: no value generation, 1: GeneratorTensor_1{1},
+//                     2: GeneratorTensor_2{-5, 5}, anything else: GeneratorTensor_3{-5.0, 5.0}
+//   time_kernel     - forwarded to StreamConfig
+//   inLengths       - lengths of the input tensor dimensions
+//   reduceDims      - indices of the dimensions to reduce
+//   alpha, beta     - scaling values passed to both reductions; the output tensor is only
+//                     pre-filled and uploaded when beta != 0
+//
+// Returns 0 when nothing failed, 1 when the verification fails, -1 when the template arguments
+// form an invalid reduction setting and -2 when the device instance rejects the runtime arguments.
+template <typename InOutDataType,
+          typename AccDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          ck::index_t Rank,
+          ck::index_t NumReduceDim,
+          bool PropagateNan,
+          bool OutputIndex>
+int reduce_blockwise_impl(bool do_verification,
+                          int init_method,
+                          bool time_kernel,
+                          const std::vector<size_t>& inLengths,
+                          const std::vector<int>& reduceDims,
+                          float alpha,
+                          float beta)
+{
+    using namespace ck;
+    using namespace ck::tensor_operation::device;
+    // MIN, MAX and AMAX are the operations treated as "indexable" by the checks below
+    constexpr bool op_support_indices =
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    // Requesting indices from an operation that is not indexable is invalid
+    constexpr bool invalid_reduce_1 = OutputIndex && !op_support_indices;
+    // 1) If InOutDataType is half_t, must use half_t as AccDataType for indexable reduction
+    //    operations
+    // 2) If InOutDataType is half_t, must use float as AccDataType for non-indexable
+    //    reduction operations
+    constexpr bool invalid_reduce_2 =
+        std::is_same<InOutDataType, half_t>::value &&
+        ((!op_support_indices && !std::is_same<AccDataType, float>::value) ||
+         (op_support_indices && !std::is_same<AccDataType, half_t>::value));
+    // 1) If InOutDataType is float, must use float as AccDataType for indexable reduction
+    // operations
+    constexpr bool invalid_reduce_3 =
+        std::is_same<InOutDataType, float>::value &&
+        (op_support_indices && !std::is_same<AccDataType, float>::value);
+    // 1) If InOutDataType is int8_t, must use int8_t as AccDataType for indexable reduction
+    //    operations
+    // 2) If InOutDataType is int8_t, must use int32_t as AccDataType for non-indexable
+    //    reduction operations
+    constexpr bool invalid_reduce_4 =
+        std::is_same<InOutDataType, int8_t>::value &&
+        ((!op_support_indices && !std::is_same<AccDataType, int32_t>::value) ||
+         (op_support_indices && !std::is_same<AccDataType, int8_t>::value));
+    // 1) If InOutDataType is int8_t, the supported operation must be either indexable operations or
+    // ADD/AVG
+    constexpr bool invalid_reduce_5 = std::is_same<InOutDataType, int8_t>::value &&
+                                      (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD &&
+                                       ReduceOpId != ReduceTensorOp::AVG);
+    // 1) If InOutDataType is bhalf_t, must use float as AccDataType for all reduction operations
+    constexpr bool invalid_reduce_6 =
+        std::is_same<InOutDataType, bhalf_t>::value && !std::is_same<AccDataType, float>::value;
+    // The setting is rejected as soon as any single rule above is violated
+    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
+                                     invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);
+    if(invalid_reduce)
+    {
+        std::cerr << "The reduction setting is invalid, exiting!" << std::endl;
+        return (-1);
+    };
+    // Binary reduce operator and the element-wise operators associated with ReduceOpId
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
+    using InElementwiseOperation =
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
+    using AccElementwiseOperation =
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
+    using DeviceReduceInstance =
+        ck::tensor_operation::device::DeviceReduceMultiBlock<InOutDataType,
+                                                             AccDataType,
+                                                             InOutDataType,
+                                                             Rank,
+                                                             NumReduceDim,
+                                                             ReduceOperation,
+                                                             InElementwiseOperation,
+                                                             AccElementwiseOperation,
+                                                             InMemoryDataOperationEnum::Set,
+                                                             PropagateNan,
+                                                             OutputIndex,
+                                                             false, // HaveIndexInputIfOutputIndex
+                                                             256,   // BlockSize
+                                                             4,     // MThreadClusterSize
+                                                             64,    // KThreadClusterSize
+                                                             1,     // MThreadSliceSize
+                                                             1,     // KThreadSliceSize
+                                                             0,     // InSrcVectorDim
+                                                             1,     // InSrcVectorSize
+                                                             1>;    // OutDstVectorSize
+    Tensor<InOutDataType> in(inLengths);
+    // The output keeps the lengths of the non-reduced dimensions; when every dimension is
+    // reduced it is a single element
+    std::vector<size_t> outLengths;
+    std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
+    if(invariantDims.empty())
+        outLengths.push_back(1);
+    else
+        for(auto dim : invariantDims)
+            outLengths.push_back(inLengths[dim]);
+    // "*_ref" tensors receive the host results, the others receive what is read back from the
+    // device
+    Tensor<InOutDataType> out_ref(outLengths);
+    Tensor<InOutDataType> out(outLengths);
+    Tensor<int> out_indices_ref(outLengths);
+    Tensor<int> out_indices(outLengths);
+    auto inStrides  = in.mDesc.GetStrides();
+    auto outStrides = out.mDesc.GetStrides();
+    size_t invariant_total_length = out.mDesc.GetElementSize();
+    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
+    std::size_t num_thread = 1;
+    // Tensor values are only generated when the result is going to be verified
+    if(do_verification)
+    {
+        switch(init_method)
+        {
+        case 0: break;
+        case 1:
+            in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
+            break;
+        case 2:
+            in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
+            break;
+        default:
+            in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
+                                            num_thread);
+        }
+        // With a non-zero beta the prior output takes part in the result, so the device-side
+        // tensor must start from the same values as the host reference
+        if(beta != 0.0f)
+            for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
+                out.mData[i] = out_ref.mData[i];
+    };
+    // these buffers are usually provided by the user application
+    DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
+    in_dev.ToDevice(in.mData.data());
+    if(beta != 0.0f)
+        out_dev.ToDevice(out.mData.data());
+    // The index buffer is sized to zero bytes unless indices were requested
+    size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
+    DeviceMem out_index_dev(indicesSizeInBytes);
+    InElementwiseOperation in_elementwise_op;
+    AccElementwiseOperation acc_elementwise_op;
+    // The element-wise operators are built from the number of elements reduced per output element
+    std::tie(in_elementwise_op, acc_elementwise_op) =
+        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
+            static_cast<int32_t>(reduce_total_length));
+    // Host reference: fills out_ref and out_indices_ref
+    if(do_verification)
+    {
+        ReductionHost<InOutDataType,
+                      AccDataType,
+                      InOutDataType,
+                      ReduceOperation,
+                      InElementwiseOperation,
+                      AccElementwiseOperation,
+                      Rank,
+                      NumReduceDim,
+                      PropagateNan,
+                      OutputIndex>
+            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
+        hostReduce.Run(alpha,
+                       in.mData.data(),
+                       beta,
+                       out_ref.mData.data(),
+                       out_indices_ref.mData.data(),
+                       in_elementwise_op,
+                       acc_elementwise_op);
+    };
+    // The device interface takes ck::index_t lengths/strides, so the size_t values are copied
+    // into vectors of that element type
+    std::vector<ck::index_t> i_inLengths;
+    std::vector<ck::index_t> i_inStrides;
+    std::vector<ck::index_t> i_outLengths;
+    std::vector<ck::index_t> i_outStrides;
+    i_inLengths.assign(inLengths.begin(), inLengths.end());
+    i_inStrides.assign(inStrides.begin(), inStrides.end());
+    i_outLengths.assign(outLengths.begin(), outLengths.end());
+    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    auto reduce = DeviceReduceInstance{};
+    // NOTE(review): the nullptr argument presumably stands for the optional input-index buffer
+    // (HaveIndexInputIfOutputIndex is false above) - confirm against MakeArgumentPointer
+    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
+                                                   i_inStrides,
+                                                   i_outLengths,
+                                                   i_outStrides,
+                                                   reduceDims,
+                                                   alpha,
+                                                   beta,
+                                                   in_dev.GetDeviceBuffer(),
+                                                   nullptr,
+                                                   out_dev.GetDeviceBuffer(),
+                                                   out_index_dev.GetDeviceBuffer(),
+                                                   in_elementwise_op,
+                                                   acc_elementwise_op);
+    if(!reduce.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cerr
+            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
+            << std::endl;
+        return (-2);
+    };
+    std::string reduce_name = reduce.GetTypeString();
+    auto invoker_ptr = reduce.MakeInvokerPointer();
+    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    // Bytes counted for the bandwidth figure: the whole input plus the whole output
+    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
+                            invariant_total_length * sizeof(InOutDataType);
+    // avg_time is printed as milliseconds, so bytes / 1e6 / ms gives GB/s
+    float gb_per_sec = num_bytes / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
+              << std::endl;
+    bool pass = true;
+    // Read the device results back and compare them with the host reference
+    if(do_verification)
+    {
+        out_dev.FromDevice(out.mData.data());
+        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        if(OutputIndex)
+        {
+            out_index_dev.FromDevice(out_indices.mData.data());
+            pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+        };
+    };
+    return (pass ? 0 : 1);
+}
--- a/example/12_reduce/reduce_example_common.hpp
+++ b/example/12_reduce/reduce_example_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cassert>
+#include <stdexcept>
+#include <tuple>
+#include <vector>
+#include "ck/ck.hpp"
+/// @brief Returns the indices of the dimensions that are not reduced, in ascending order.
+///
+/// @tparam Rank          number of dimensions of the tensor
+/// @tparam NumReduceDim  number of dimensions being reduced
+/// @param  reduceDims    indices of the to-reduce dimensions; must hold exactly NumReduceDim
+///                       distinct values, each in [0, Rank)
+/// @return the Rank - NumReduceDim dimension indices that do not appear in reduceDims
+/// @throws std::invalid_argument when reduceDims has the wrong size or contains an
+///         out-of-range or repeated index
+template <ck::index_t Rank, ck::index_t NumReduceDim>
+std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+{
+    // checked unconditionally: an assert() would vanish in NDEBUG builds and let the loop below
+    // work on a list of the wrong length
+    if(reduceDims.size() != static_cast<std::size_t>(NumReduceDim))
+        throw std::invalid_argument(
+            "get_invariant_dims: the number of reduce dimensions does not match NumReduceDim!");
+    // one flag per tensor dimension; unlike a shifted bitmask this has no undefined behaviour
+    // for unexpected index values
+    std::vector<bool> isReduced(static_cast<std::size_t>(Rank), false);
+    for(const int dim : reduceDims)
+    {
+        if(dim < 0 || dim >= Rank)
+            throw std::invalid_argument(
+                "get_invariant_dims: a reduce dimension is outside of [0, Rank)!");
+        if(isReduced[dim])
+            throw std::invalid_argument(
+                "get_invariant_dims: a reduce dimension is specified more than once!");
+        isReduced[dim] = true;
+    }
+    std::vector<int> invariantDims;
+    // the checks above guarantee NumReduceDim distinct in-range indices, hence NumReduceDim <= Rank
+    invariantDims.reserve(static_cast<std::size_t>(Rank - NumReduceDim));
+    // collect invariant dimensions
+    for(int i = 0; i < Rank; i++)
+        if(!isReduced[i])
+            invariantDims.push_back(i);
+    return invariantDims;
+}
+// Compile-time tag that pairs a tensor rank with a number of reduced dimensions. Both template
+// arguments are re-exported as static members so that code which only has the type can read
+// them back.
+template <ck::index_t Rank, ck::index_t NumReduceDim>
+struct ReduceShape
+{
+    static constexpr ck::index_t Rank_         = Rank;
+    static constexpr ck::index_t NumReduceDim_ = NumReduceDim;
+};
+// The (Rank, NumReduceDim) combinations listed for the reduce examples: ranks 3 to 5, each with
+// 1 up to Rank - 1 reduced dimensions.
+using reduce_shape_instances = std::tuple<ReduceShape<3, 1>,
+                                          ReduceShape<3, 2>,
+                                          ReduceShape<4, 1>,
+                                          ReduceShape<4, 2>,
+                                          ReduceShape<4, 3>,
+                                          ReduceShape<5, 1>,
+                                          ReduceShape<5, 2>,
+                                          ReduceShape<5, 3>,
+                                          ReduceShape<5, 4>>;
--- a/example/12_reduce/reduce_multiblock_atomic_add.cpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+#include "ck/utility/reduction_enums.hpp"
+#include "reduce_multiblock_atomic_add_impl.hpp"
+#include "reduce_example_common.hpp"
+using namespace ck;
+using namespace ck::tensor_operation::device;
+// Long-option table handed to getopt_long(): every entry maps a "--name" to the short option
+// character handled in SimpleAppArgs::processArgs(). "--reduceDims" is listed because the usage
+// text advertises it next to "-R". The all-zero entry terminates the table.
+static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
+                                       {"reduceDims", required_argument, nullptr, 'R'},
+                                       {"verify", required_argument, nullptr, 'v'},
+                                       {"help", no_argument, nullptr, '?'},
+                                       {nullptr, 0, nullptr, 0}};
+// Command-line settings of this example. Every member starts with a default value and is
+// overwritten by processArgs() when the corresponding option/argument is given.
+class SimpleAppArgs
+{
+    public:
+    std::vector<size_t> inLengths = {16, 64, 32, 960};
+    std::vector<int> reduceDims   = {0, 1, 2};
+    // scales[0] and scales[1] are passed on by main() as alpha and beta
+    std::vector<float> scales     = {1.0f, 0.0f};
+    bool do_verification = true;
+    int data_type        = 1;
+    int init_method      = 2;
+    bool time_kernel     = true;
+    // Prints the accepted options and positional arguments to stdout.
+    void show_usage(const char* cmd)
+    {
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
+                  << std::endl;
+        std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
+                  << std::endl;
+        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
+                     "comparing with the host-based reduction"
+                  << std::endl;
+        std::cout << "Arg1: data type (0: fp32, 1: fp64)" << std::endl;
+        std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
+                     "value, 3=decimal value)"
+                  << std::endl;
+        std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl;
+    };
+    // Parses the options followed by the three positional arguments (data type, init method,
+    // time kernel).
+    //
+    // Returns 0 on success and -1 after printing the usage text (for --help as well as for any
+    // option that is not handled). Throws std::runtime_error when an option argument is missing
+    // or when fewer than three positional arguments remain.
+    int processArgs(int argc, char* argv[])
+    {
+        using ck::host_common::getTypeValuesFromString;
+        int ch;
+        while(1)
+        {
+            // the index of the matched long option is not needed, hence the nullptr
+            ch = getopt_long(argc, argv, "D:R:v:l:", long_options, nullptr);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                inLengths = getTypeValuesFromString<size_t>(optarg);
+                break;
+            case 'R':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                reduceDims = getTypeValuesFromString<int>(optarg);
+                break;
+            case 'v':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                do_verification = static_cast<bool>(std::atoi(optarg));
+                break;
+            // '?' is what long_options maps --help to and also what getopt_long() returns for
+            // an unrecognized option or a missing option argument. In the latter cases the
+            // long-option index is not updated, so it must not be used to tell them apart;
+            // showing the usage text is the right answer for all of them.
+            case '?':
+            default: show_usage(argv[0]); return (-1);
+            };
+        };
+        if(optind + 3 > argc)
+        {
+            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
+        };
+        data_type   = std::atoi(argv[optind++]);
+        init_method = std::atoi(argv[optind++]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));
+        // fall back to alpha = 1, beta = 0 if the scales were cleared
+        if(scales.empty())
+        {
+            scales.push_back(1.0f);
+            scales.push_back(0.0f);
+        };
+        return (0);
+    };
+};
+// Selects the ReduceShape from reduce_shape_instances whose rank and number of reduced
+// dimensions match the runtime arguments and runs reduce_multiblock_atomic_add_impl for it.
+//
+// All function parameters are forwarded unchanged to the implementation.
+//
+// Returns true when the implementation returned 0. Returns false when the implementation
+// reported a failure or when no listed shape matches the given inLengths/reduceDims sizes
+// (in which case nothing was run).
+template <typename InOutDataType,
+          typename AccDataType,
+          ReduceTensorOp ReduceOpId,
+          bool PropagateNan>
+bool reduce_multiblock_atomic_add_test(bool do_verification,
+                                       int init_method,
+                                       bool time_kernel,
+                                       const std::vector<size_t>& inLengths,
+                                       const std::vector<int>& reduceDims,
+                                       float alpha,
+                                       float beta)
+{
+    bool matched = false;
+    int result   = 0;
+    const auto tuple_object = reduce_shape_instances{};
+    static_for<0, std::tuple_size<reduce_shape_instances>::value, 1>{}([&](auto i) {
+        // only the first matching shape is executed
+        if(matched)
+            return;
+        using ShapeType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;
+        if(static_cast<size_t>(ShapeType::Rank_) != inLengths.size() ||
+           static_cast<size_t>(ShapeType::NumReduceDim_) != reduceDims.size())
+            return;
+        result = reduce_multiblock_atomic_add_impl<InOutDataType,
+                                                   AccDataType,
+                                                   ReduceOpId,
+                                                   ShapeType::Rank_,
+                                                   ShapeType::NumReduceDim_,
+                                                   PropagateNan>(
+            do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta);
+        matched = true;
+    });
+    // without this check an unsupported shape would leave result at 0 and be reported as a pass
+    if(!matched)
+    {
+        std::cerr << "The number of input dimensions (" << inLengths.size()
+                  << ") and of reduce dimensions (" << reduceDims.size()
+                  << ") is not supported by this example!" << std::endl;
+        return false;
+    }
+    return result == 0;
+}
+// Reduction operation exercised by this example
+constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
+// Value passed as the PropagateNan template argument of every test below
+constexpr bool PropagateNan         = true;
+// Entry point. With command-line arguments a single reduction is run as described by
+// SimpleAppArgs; without arguments a fixed set of float/double cases is run.
+//
+// Returns 0 when everything passed, 1 when a reduction failed and -1 when the command line
+// could not be used.
+int main(int argc, char* argv[])
+{
+    bool pass = true;
+    if(argc > 1)
+    {
+        SimpleAppArgs arg;
+        if(arg.processArgs(argc, argv) < 0)
+            return (-1);
+        if(arg.data_type == 0)
+        {
+            pass = reduce_multiblock_atomic_add_test<float, float, ReduceOpId, PropagateNan>(
+                arg.do_verification,
+                arg.init_method,
+                arg.time_kernel,
+                arg.inLengths,
+                arg.reduceDims,
+                arg.scales[0],
+                arg.scales[1]);
+        }
+        else if(arg.data_type == 1)
+        {
+            pass = reduce_multiblock_atomic_add_test<double, double, ReduceOpId, PropagateNan>(
+                arg.do_verification,
+                arg.init_method,
+                arg.time_kernel,
+                arg.inLengths,
+                arg.reduceDims,
+                arg.scales[0],
+                arg.scales[1]);
+        }
+        else
+        {
+            // any other value used to run nothing and still exit with 0
+            std::cerr << "Invalid data type " << arg.data_type << "!" << std::endl;
+            arg.show_usage(argv[0]);
+            return (-1);
+        }
+    }
+    else
+    {
+        // for testing float
+        pass = pass && reduce_multiblock_atomic_add_test<float, float, ReduceOpId, PropagateNan>(
+                           true, 2, false, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+        // for testing double
+        pass = pass && reduce_multiblock_atomic_add_test<double, double, ReduceOpId, PropagateNan>(
+                           true, 2, false, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+        // for testing 3D input
+        pass = pass && reduce_multiblock_atomic_add_test<float, float, ReduceOpId, PropagateNan>(
+                           true, 2, false, {16, 64, 960}, {0, 1}, 1.0f, 0.0f);
+        // for testing 5D input
+        pass = pass && reduce_multiblock_atomic_add_test<float, float, ReduceOpId, PropagateNan>(
+                           true, 2, false, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f);
+    }
+    return (pass ? 0 : 1);
+}
--- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_reduction.hpp"
+#include "reduce_example_common.hpp"
+template <typename InOutDataType,
+          typename AccDataType,
+          ck::ReduceTensorOp ReduceOpId,
+          ck::index_t Rank,
+          ck::index_t NumReduceDim,
+          bool PropagateNan>
+int reduce_multiblock_atomic_add_impl(bool do_verification,
+                                      int init_method,
+                                      bool time_kernel,
+                                      const std::vector<size_t>& inLengths,
+                                      const std::vector<int>& reduceDims,
+                                      float alpha,
+                                      float beta)
+{
+    using namespace ck;
+    using namespace ck::tensor_operation::device;
+    constexpr bool op_support_atomic_add =
+        (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG);
+    constexpr bool invalid_reduce_1 = !op_support_atomic_add;
+    constexpr bool invalid_reduce_2 =
+        !(std::is_same<InOutDataType, float>::value || std::is_same<InOutDataType, double>::value);
+    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2);
+    if(invalid_reduce)
+    {
+        std::cerr << "The reduction setting is invalid, exiting!" << std::endl;
+        return (-1);
+    };
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
+    using InElementwiseOperation =
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
+    using AccElementwiseOperation =
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
+    using DeviceReduceInstance =
+        ck::tensor_operation::device::DeviceReduceMultiBlock<InOutDataType,
+                                                             AccDataType,
+                                                             InOutDataType,
+                                                             Rank,
+                                                             NumReduceDim,
+                                                             ReduceOperation,
+                                                             InElementwiseOperation,
+                                                             AccElementwiseOperation,
+                                                             InMemoryDataOperationEnum::AtomicAdd,
+                                                             PropagateNan,
+                                                             false,
+                                                             false, // HaveIndexInputIfOutputIndex
+                                                             256,
+                                                             4,
+                                                             64,
+                                                             1,
+                                                             1,
+                                                             0,
+                                                             1,
+                                                             1>;
+    Tensor<InOutDataType> in(inLengths);
+    std::vector<size_t> outLengths;
+    std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
+    if(invariantDims.empty())
+        outLengths.push_back(1);
+    else
+        for(auto dim : invariantDims)
+            outLengths.push_back(inLengths[dim]);
+    Tensor<InOutDataType> out_ref(outLengths);
+    Tensor<InOutDataType> out(outLengths);
+    auto inStrides  = in.mDesc.GetStrides();
+    auto outStrides = out.mDesc.GetStrides();
+    size_t invariant_total_length = out.mDesc.GetElementSize();
+    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
+    std::size_t num_thread = 1;
+    if(do_verification)
+    {
+        switch(init_method)
+        {
+        case 0: break;
+        case 1:
+            in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
+            break;
+        case 2:
+            in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
+            break;
+        default:
+            in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
+                                            num_thread);
+        }
+        if(beta != 0.0f)
+            for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
+                out.mData[i] = out_ref.mData[i];
+    };
+    // these buffers are usually provided by the user application
+    DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
+    in_dev.ToDevice(in.mData.data());
+    if(beta != 0.0f)
+        out_dev.ToDevice(out.mData.data());
+    InElementwiseOperation in_elementwise_op;
+    AccElementwiseOperation acc_elementwise_op;
+    std::tie(in_elementwise_op, acc_elementwise_op) =
+        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
+            static_cast<int32_t>(reduce_total_length));
+    if(do_verification)
+    {
+        ReductionHost<InOutDataType,
+                      AccDataType,
+                      InOutDataType,
+                      ReduceOperation,
+                      InElementwiseOperation,
+                      AccElementwiseOperation,
+                      Rank,
+                      NumReduceDim,
+                      PropagateNan,
+                      false>
+            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
+        hostReduce.Run(alpha,
+                       in.mData.data(),
+                       beta,
+                       out_ref.mData.data(),
+                       nullptr,
+                       in_elementwise_op,
+                       acc_elementwise_op);
+    };
+    std::vector<ck::index_t> i_inLengths;
+    std::vector<ck::index_t> i_inStrides;
+    std::vector<ck::index_t> i_outLengths;
+    std::vector<ck::index_t> i_outStrides;
+    i_inLengths.assign(inLengths.begin(), inLengths.end());
+    i_inStrides.assign(inStrides.begin(), inStrides.end());
+    i_outLengths.assign(outLengths.begin(), outLengths.end());
+    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    auto reduce = DeviceReduceInstance{};
+    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
+                                                   i_inStrides,
+                                                   i_outLengths,
+                                                   i_outStrides,
+                                                   reduceDims,
+                                                   alpha,
+                                                   beta,
+                                                   in_dev.GetDeviceBuffer(),
+                                                   nullptr,
+                                                   out_dev.GetDeviceBuffer(),
+                                                   nullptr,
+                                                   in_elementwise_op,
+                                                   acc_elementwise_op);
+    if(!reduce.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cerr
+            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
+            << std::endl;
+        return (-2);
+    };
+    std::string reduce_name = reduce.GetTypeString();
+    auto invoker_ptr = reduce.MakeInvokerPointer();
+    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
+                            invariant_total_length * sizeof(InOutDataType);
+    float gb_per_sec = num_bytes / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
+              << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        out_dev.FromDevice(out.mData.data());
+        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+    };
+    return (pass ? 0 : 1);
+}
--- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+# GEMM + multiple-D + multiple-reduce examples (fp16, XDL).
+add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)
+add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
+# Exclude the GEMM+max example from testing, since it fails randomly on gfx908:
+# https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/358
+# TODO: fix the failure and re-enable this test
+add_example_executable_no_testing(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+// Shorthand for a compile-time integer sequence, used in the tuning table of the
+// device instance below.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+// DataType
+// A/B are the GEMM operands, D0/D1 the additional inputs consumed by CDEElementOp,
+// E the element-wise output and R0/R1 the two reduction outputs.
+using ADataType         = F16;
+using BDataType         = F16;
+using GemmAccDataType   = F32;
+using CShuffleDataType  = F32;
+using D0DataType        = F16;
+using D1DataType        = F16;
+using DsDataType        = ck::Tuple<D0DataType, D1DataType>;
+using EDataType         = F16;
+using ReduceAccDataType = F32;
+using R0DataType        = F32;
+using R1DataType        = F32;
+using RsDataType        = ck::Tuple<R0DataType, R1DataType>;
+// Layout
+// E shares the layout of D1; no layout alias is declared for D0 here.
+using ALayout  = Row;
+using BLayout  = Col;
+using D1Layout = Row;
+using ELayout  = D1Layout;
+// Elementwise op
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using AddAdd       = ck::tensor_operation::element_wise::AddAdd;
+using Square       = ck::tensor_operation::element_wise::UnarySquare;
+using Div          = ck::tensor_operation::element_wise::UnaryDivide;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAdd;
+// Qs: per-reduction op applied to each element before it is accumulated
+// (identity for R0, square for R1).
+using QsElementOp  = ck::Tuple<PassThrough, Square>;
+// Rs: per-reduction op applied to each accumulated value (a divide for both).
+using RsElementOp  = ck::Tuple<Div, Div>;
+// ReduceOp
+using R0ThreadReduceOp = ck::reduce::Add;
+using R1ThreadReduceOp = ck::reduce::Add;
+using RsThreadReduceOp = ck::Tuple<R0ThreadReduceOp, R1ThreadReduceOp>;
+// Both reduction outputs are written to memory with AtomicAdd.
+static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
+static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
+using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence<R0GlobalReduceOp, R1GlobalReduceOp>;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// Device operation instantiated by this example. The comment rows below are column
+// headers: read each column top to bottom to get the name of the template argument
+// that sits underneath it in the final row.
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
+//######| ALayout| BLayout| ELayout|     AData|     BData|     GemmAccData|         CShuffle|     DsData|     EData|     ReduceAccData|     RsData|           A|           B|          CDE|          Qs|          Rs|           Thread|           Global|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|    CDRThreadTransfer|                  CDE|    RThreadTransfer|
+//######|        |        |        |      Type|      Type|            Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
+//######|        |        |        |          |          |                |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
+//######|        |        |        |          |          |                |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
+        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+// clang-format on
+// Host reference GEMM used for verification. It is instantiated with PassThrough as
+// the output op; main() applies CDEElementOp to its result afterwards.
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        EDataType,
+                                                                        GemmAccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+// Prints the measured time and the throughput figures derived from it.
+//
+// The type parameters are used only through sizeof() to size the memory traffic.
+// ave_time is printed with an "ms" unit; M, N and K are the GEMM problem sizes.
+// Operation count: 2*M*N*K plus 2*M*N.
+// Byte count: A (M*K), B (K*N), D0, D1 and E (M*N each; note D0 is counted as a full
+// M*N tensor here), and R0, R1 (M each).
+template <typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename D1DataType,
+          typename EDataType,
+          typename R0DataType,
+          typename R1DataType>
+void DumpPerf(float ave_time, int M, int N, int K)
+{
+    const std::size_t mac_flop   = std::size_t(2) * M * N * K;
+    const std::size_t extra_flop = std::size_t(2) * M * N;
+    const std::size_t flop       = mac_flop + extra_flop;
+
+    const std::size_t ab_bytes = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N;
+    const std::size_t d_bytes  = sizeof(D0DataType) * M * N + sizeof(D1DataType) * M * N;
+    const std::size_t e_bytes  = sizeof(EDataType) * M * N;
+    const std::size_t r_bytes  = sizeof(R0DataType) * M + sizeof(R1DataType) * M;
+    const std::size_t gemm_num_byte = ab_bytes + d_bytes + e_bytes + r_bytes;
+
+    float tflops          = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, ";
+    std::cout << tflops << " TFlops, ";
+    std::cout << gemm_gb_per_sec << " GB/s, " << std::endl;
+}
+// Builds a one-dimensional host tensor descriptor with the given length and stride.
+auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
+    const std::vector<std::size_t> lengths{len};
+    const std::vector<std::size_t> strides{stride};
+    return HostTensorDescriptor(lengths, strides);
+};
+// Builds a row x col host tensor descriptor. For a RowMajor layout tag the strides
+// are {stride, 1}; for any other tag they are {1, stride}. Only the type of `layout`
+// is inspected, never its value.
+auto f_host_tensor_descriptor2d =
+    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+        using LayoutTag = decltype(layout);
+        constexpr bool is_row_major =
+            std::is_same<LayoutTag, ck::tensor_layout::gemm::RowMajor>::value;
+
+        const std::vector<std::size_t> lengths{row, col};
+        const std::vector<std::size_t> strides = is_row_major
+                                                     ? std::vector<std::size_t>{stride, 1}
+                                                     : std::vector<std::size_t>{1, stride};
+        return HostTensorDescriptor(lengths, strides);
+    };
+// Example driver: runs the fused GEMM + AddAdd + two-reduction device op on a fixed
+// 1024x1024x1024 problem, verifies E, R0 and R1 against a host reference, then times
+// the kernel and prints performance figures.
+//
+// Returns 0 when verification passes and 1 otherwise. Throws std::runtime_error when
+// the device op reports the argument as unsupported.
+int main()
+{
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+    ck::index_t StrideA  = 1024;
+    ck::index_t StrideB  = 1024;
+    // D0 is a length-N tensor on the host; presumably a stride of 0 makes the device op
+    // reuse it for every row -- TODO confirm against the device op.
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = 1024;
+    ck::index_t StrideE  = 1024;
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
+    Tensor<D0DataType> d0_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor2d(M, N, StrideD1, D1Layout{}));
+    Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
+    Tensor<R0DataType> r0_m(f_host_tensor_descriptor1d(M, 1));
+    Tensor<R1DataType> r1_m(f_host_tensor_descriptor1d(M, 1));
+    // Only the inputs are initialised; E, R0 and R1 are produced by the device op.
+    a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1, 1});
+    b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-1, 1});
+    d0_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-1, 1});
+    d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{-1, 1});
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize());
+    DeviceMem r1_device_buf(sizeof(R1DataType) * r1_m.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+    auto qs_element_op  = QsElementOp{};
+    // Both Div ops are constructed with N, the number of elements reduced per row.
+    auto rs_element_op  = RsElementOp{N, N};
+    // Prepare GEMM, mean, mean_square
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               {r0_device_buf.GetDeviceBuffer(), r1_device_buf.GetDeviceBuffer()},
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               {StrideD0, StrideD1},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op,
+                               qs_element_op,
+                               rs_element_op);
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
+    }
+    // Initialise the reduction buffers to 0 before the verified run; RsGlobalReduceOp
+    // is AtomicAdd for both outputs, so presumably they accumulate into existing values.
+    r0_device_buf.SetZero();
+    r1_device_buf.SetZero();
+    invoker.Run(argument, StreamConfig{nullptr, false});
+    bool do_verification = true;
+    bool pass            = true;
+    if(do_verification)
+    {
+        auto I0 = ck::Number<0>{};
+        auto I1 = ck::Number<1>{};
+        Tensor<EDataType> e_m_n_host(e_m_n.mDesc);
+        Tensor<R0DataType> r0_m_host(r0_m.mDesc);
+        Tensor<R1DataType> r1_m_host(r1_m.mDesc);
+        // Step 1: plain GEMM on the host; the epilogue is applied in the loop below.
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, PassThrough{});
+        ref_invoker.Run(ref_argument);
+        auto reduce0_op = R0ThreadReduceOp{};
+        auto reduce1_op = R1ThreadReduceOp{};
+        // Step 2: per row, apply CDEElementOp to every element and accumulate the value
+        // (R0) and its square (R1) over n, then apply the Rs ops to the accumulators.
+        for(int m = 0; m < M; ++m)
+        {
+            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
+            for(int n = 0; n < N; ++n)
+            {
+                ReduceAccDataType square_e_val;
+                auto e_val  = ck::type_convert<GemmAccDataType>(e_m_n_host(m, n));
+                auto d0_val = ck::type_convert<GemmAccDataType>(d0_n(n));
+                auto d1_val = ck::type_convert<GemmAccDataType>(d1_m_n(m, n));
+                cde_element_op(e_val, e_val, d0_val, d1_val);
+                e_m_n_host(m, n) = ck::type_convert<EDataType>(e_val);
+                // The reductions consume the GemmAccDataType value, not the value
+                // rounded to EDataType that was just stored.
+                auto e_val_reduce = ck::type_convert<ReduceAccDataType>(e_val);
+                qs_element_op[I1](square_e_val, e_val_reduce);
+                reduce0_op(reduce0_acc, e_val_reduce);
+                reduce1_op(reduce1_acc, square_e_val);
+            }
+            rs_element_op[I0](reduce0_acc, reduce0_acc);
+            rs_element_op[I1](reduce1_acc, reduce1_acc);
+            r0_m_host(m) = ck::type_convert<R0DataType>(reduce0_acc);
+            r1_m_host(m) = ck::type_convert<R1DataType>(reduce1_acc);
+        }
+        e_device_buf.FromDevice(e_m_n.mData.data());
+        r0_device_buf.FromDevice(r0_m.mData.data());
+        r1_device_buf.FromDevice(r1_m.mData.data());
+        // The messages name the tensor actually compared (E, R0, R1); D0/D1 are inputs.
+        pass = ck::utils::check_err(
+            e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results e", 1e-2, 1e-2);
+        pass &= ck::utils::check_err(
+            r0_m.mData, r0_m_host.mData, "Error: Incorrect results r0", 1e-2, 1e-2);
+        pass &= ck::utils::check_err(
+            r1_m.mData, r1_m_host.mData, "Error: Incorrect results r1", 1e-2, 1e-2);
+    }
+    // The timed run happens after the results were copied back above; its outputs are
+    // not verified and the reduction buffers are not re-zeroed for it.
+    bool time_kernel = true;
+    if(time_kernel)
+    {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+        DumpPerf<ADataType, BDataType, D0DataType, D1DataType, EDataType, R0DataType, R1DataType>(
+            ave_time, M, N, K);
+    }
+    return pass ? 0 : 1;
+}