Commit 8a370fbb authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into group_conv

parents d8fdd226 85978e02
...@@ -11,6 +11,12 @@ def show_node_info() {
    """
}
// run a shell command and return true if it produced any output (stdout or stderr)
def runShell(String command){
    def responseCode = sh returnStatus: true, script: "${command} > tmp.txt 2>&1"
    def output = readFile(file: "tmp.txt")
    return (output != "")
}
def cmake_build(Map conf=[:]){
    def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
...@@ -60,7 +66,7 @@ def cmake_build(Map conf=[:]){
    """
    def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
    // reduce parallelism when compiling, clang uses too much memory
-   def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}")
+   def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
    def execute_cmd = conf.get("execute_cmd", "")
    def cmd = conf.get("cmd", """
...@@ -113,7 +119,14 @@ def buildHipClangJob(Map conf=[:]){
        retimage = docker.build("${image}", dockerArgs + '.')
        withDockerContainer(image: image, args: dockerOpts) {
            timeout(time: 5, unit: 'MINUTES'){
-               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
                    // clinfo reported zero devices on this node
                    error "GPU not found"
                }
                else{
                    echo "GPU is OK"
                }
            }
        }
    }
...@@ -125,7 +138,14 @@ def buildHipClangJob(Map conf=[:]){
        retimage = docker.build("${image}", dockerArgs + " --no-cache .")
        withDockerContainer(image: image, args: dockerOpts) {
            timeout(time: 5, unit: 'MINUTES'){
-               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
                    // clinfo reported zero devices on this node
                    error "GPU not found"
                }
                else{
                    echo "GPU is OK"
                }
            }
        }
    }
...@@ -133,7 +153,14 @@ def buildHipClangJob(Map conf=[:]){
        withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
            timeout(time: 5, unit: 'HOURS')
            {
-               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
                    // clinfo reported zero devices on this node
                    error "GPU not found"
                }
                else{
                    echo "GPU is OK"
                }
                cmake_build(conf)
            }
        }
...@@ -145,7 +172,6 @@ def reboot(){
    build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
}
def buildHipClangJobAndReboot(Map conf=[:]){
    try{
        buildHipClangJob(conf)
...@@ -162,7 +188,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
    }
}
def runCKProfiler(Map conf=[:]){
    show_node_info()
...@@ -189,7 +214,6 @@ def runCKProfiler(Map conf=[:]){
    }
    def variant = env.STAGE_NAME
    def retimage
    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
...@@ -197,7 +221,14 @@ def runCKProfiler(Map conf=[:]){
        retimage = docker.build("${image}", dockerArgs + '.')
        withDockerContainer(image: image, args: dockerOpts) {
            timeout(time: 5, unit: 'MINUTES'){
-               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
                    // clinfo reported zero devices on this node
                    error "GPU not found"
                }
                else{
                    echo "GPU is OK"
                }
            }
        }
    }
...@@ -209,89 +240,69 @@ def runCKProfiler(Map conf=[:]){
        retimage = docker.build("${image}", dockerArgs + " --no-cache .")
        withDockerContainer(image: image, args: dockerOpts) {
            timeout(time: 5, unit: 'MINUTES'){
-               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+               sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
                    // clinfo reported zero devices on this node
                    error "GPU not found"
                }
                else{
                    echo "GPU is OK"
                }
            }
        }
    }
    withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-       timeout(time: 5, unit: 'HOURS')
+       timeout(time: 24, unit: 'HOURS')
        {
            cmake_build(conf)
dir("script"){ dir("script"){
//run gemm performance tests if (params.RUN_FULL_QA){
def gemm_log = "perf_gemm_${gpu_arch}.log" def qa_log = "qa_${gpu_arch}.log"
sh "rm -f ${gemm_log}" if (params.USE_9110){
sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}" sh "./run_full_performance_tests.sh 1 QA_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
sh "echo Node name: ${NODE_NAME} >> ${gemm_log}" }
sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}" else{
sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} " sh "./run_full_performance_tests.sh 1 QA_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}" }
if (params.USE_9110){ archiveArtifacts "perf_gemm_${gpu_arch}.log"
sh "echo Environment type: CI_9110 >> ${gemm_log}" archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
} archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
else{ archiveArtifacts "perf_bathced_gemm_${gpu_arch}.log"
sh "echo Environment type: CI_release >> ${gemm_log}" archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
} archiveArtifacts "perf_fwd_conv_${gpu_arch}.log"
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}" archiveArtifacts "perf_bwd_conv_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}" archiveArtifacts "perf_fusion_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}" archiveArtifacts "perf_reduction_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}" // stash perf files to master
sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_gemm_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_resnet50_N256_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_resnet50_N4_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_bathced_gemm_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_grouped_gemm_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_fwd_conv_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_bwd_conv_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_fusion_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_reduction_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}" //we will process results on the master node
sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
//results will be parsed, stored, and analyzed within the python script
//the script will return 0 if the performance criteria are met
//or return 1 if the criteria are not met
archiveArtifacts "${gemm_log}"
sh "python3 process_perf_data.py ${gemm_log} "
//run resnet50 test
def resnet256_log = "perf_resnet50_N256_${gpu_arch}.log"
sh "rm -f ${resnet256_log}"
sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet256_log}"
sh "echo Node name: ${NODE_NAME} >> ${resnet256_log}"
sh "echo GPU_arch name: ${gpu_arch} >> ${resnet256_log}"
sh "rocminfo | grep 'Compute Unit:' >> ${resnet256_log} "
sh "hipcc --version | grep -e 'HIP version' >> ${resnet256_log}"
if (params.USE_9110){
sh "echo Environment type: CI_9110 >> ${resnet256_log}"
} }
else{ else{
sh "echo Environment type: CI_release >> ${resnet256_log}" if (params.USE_9110){
sh "./run_performance_tests.sh 0 CI_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
}
else{
sh "./run_performance_tests.sh 0 CI_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
}
archiveArtifacts "perf_gemm_${gpu_arch}.log"
archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
// stash perf files to master
stash name: "perf_gemm_${gpu_arch}.log"
stash name: "perf_resnet50_N256_${gpu_arch}.log"
stash name: "perf_resnet50_N4_${gpu_arch}.log"
//we will process the results on the master node
} }
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet256_log}"
//first run tests with N=256
sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet256_log}"
archiveArtifacts "${resnet256_log}"
sh "python3 process_perf_data.py ${resnet256_log} "
//then run with N=4
def resnet4_log = "perf_resnet50_N4_${gpu_arch}.log"
sh "rm -f ${resnet4_log}"
sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet4_log}"
sh "echo Node name: ${NODE_NAME} >> ${resnet4_log}"
sh "echo GPU_arch name: ${gpu_arch} >> ${resnet4_log}"
sh "rocminfo | grep 'Compute Unit:' >> ${resnet4_log} "
sh "hipcc --version | grep -e 'HIP version' >> ${resnet4_log}"
if (params.USE_9110){
sh "echo Environment type: CI_9110 >> ${resnet4_log}"
}
else{
sh "echo Environment type: CI_release >> ${resnet4_log}"
}
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet4_log}"
sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet4_log}"
archiveArtifacts "${resnet4_log}"
sh "python3 process_perf_data.py ${resnet4_log} "
} }
        }
    }
...@@ -299,7 +310,6 @@ def runCKProfiler(Map conf=[:]){
    return retimage
}
def runPerfTest(Map conf=[:]){
    try{
        runCKProfiler(conf)
...@@ -316,8 +326,76 @@ def runPerfTest(Map conf=[:]){
    }
}
def process_results(Map conf=[:]){
env.HSA_ENABLE_SDMA=0
checkout scm
def image = "composable_kernels"
def prefixpath = "/opt/rocm"
def gpu_arch = conf.get("gpu_arch", "gfx908")
// Jenkins is complaining about the render group
def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1"
}
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
retimage = docker.build("${image}", dockerArgs + '.')
}
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
echo "The job was cancelled or aborted"
throw e
}
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 1, unit: 'HOURS'){
try{
dir("script"){
if (params.RUN_FULL_QA){
// unstash perf files to master
unstash "perf_gemm_${gpu_arch}.log"
unstash "perf_resnet50_N256_${gpu_arch}.log"
unstash "perf_resnet50_N4_${gpu_arch}.log"
unstash "perf_bathced_gemm_${gpu_arch}.log"
unstash "perf_grouped_gemm_${gpu_arch}.log"
unstash "perf_fwd_conv_${gpu_arch}.log"
unstash "perf_bwd_conv_${gpu_arch}.log"
unstash "perf_fusion_${gpu_arch}.log"
unstash "perf_reduction_${gpu_arch}.log"
sh "./process_qa_data.sh ${gpu_arch}"
}
else{
// unstash perf files to master
unstash "perf_gemm_${gpu_arch}.log"
unstash "perf_resnet50_N256_${gpu_arch}.log"
unstash "perf_resnet50_N4_${gpu_arch}.log"
sh "./process_perf_data.sh ${gpu_arch}"
}
}
}
            catch(e){
                echo 'Exception occurred while processing performance test results: ' + e.toString()
                throw e
            }
}
}
}
//launch develop branch daily at 23:00 in FULL_QA mode
//CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;USE_9110=true''' : ""
pipeline {
    agent none
//triggers {
// cron(CRON_SETTINGS)
//}
    options {
        parallelsAlwaysFailFast()
    }
...@@ -325,7 +403,11 @@ pipeline {
        booleanParam(
            name: "USE_9110",
            defaultValue: true,
-           description: "")
+           description: "Select compiler version: 9110 (default) or release")
booleanParam(
name: "RUN_FULL_QA",
defaultValue: false,
description: "Select whether to run small set of performance tests (default) or full QA")
    }
    environment{
        dbuser = "${dbuser}"
...@@ -438,6 +520,25 @@ pipeline {
                }
            }
        }
stage("Process Performance Test Results")
{
parallel
{
stage("Process results for gfx908"){
agent { label 'mici' }
steps{
process_results(gpu_arch: "gfx908")
}
}
stage("Process results for gfx90a"){
agent { label 'mici' }
steps{
process_results(gpu_arch: "gfx90a")
}
}
}
}
/* enable after the cmake file supports packaging
        stage("Packages") {
            when {
...
...@@ -29,34 +29,40 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using ADataType = ck::half_t;
-using BDataType = ck::half_t;
-using CDataType = ck::half_t;
-using AccDataType = float;
+using ADataType = F16;
+using BDataType = F16;
+using AccDataType = F32;
+using CShuffleDataType = F16;
+using DsDataType = ck::Tuple<>;
+using EDataType = F16;

-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
+using ALayout = Row;
+using BLayout = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout = Row;

-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-// static constexpr auto GemmMNPadding =
-//     ck::tensor_operation::device::GemmSpecialization::MNPadding;
-
-// clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl
-//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num|
-//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch|
-//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| |
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1>;
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
+// clang-format off
+//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
-using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        EDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        CDEElementOp>;
int main(int argc, char* argv[])
{
...@@ -81,11 +87,11 @@ int main(int argc, char* argv[])
    int group_count = rand() % 16 + 1;
    // GEMM shape
-   std::vector<ck::tensor_operation::device::GemmShape> gemm_shapes;
+   std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
    std::vector<const void*> p_a, p_b;
    std::vector<void*> p_c;

-   gemm_shapes.reserve(group_count);
+   gemm_descs.reserve(group_count);
    for(int i = 0; i < group_count; i++)
    {
...@@ -93,7 +99,11 @@ int main(int argc, char* argv[])
        int N = 128 + 128 * i;
        int K = 64 + 64 * i;

-       gemm_shapes.push_back({M, N, K, K, K, N});
+       int stride_A = K;
+       int stride_B = K;
+       int stride_C = N;
+
+       gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, {}});
    }
    auto f_host_tensor_descriptor =
...@@ -111,10 +121,9 @@ int main(int argc, char* argv[])
        };
    std::vector<Tensor<ADataType>> a_tensors;
-   ;
    std::vector<Tensor<BDataType>> b_tensors;
-   std::vector<Tensor<CDataType>> c_host_tensors;
-   std::vector<Tensor<CDataType>> c_device_tensors;
+   std::vector<Tensor<EDataType>> c_host_tensors;
+   std::vector<Tensor<EDataType>> c_device_tensors;
    a_tensors.reserve(group_count);
    b_tensors.reserve(group_count);
...@@ -131,25 +140,25 @@ int main(int argc, char* argv[])
    std::size_t flop = 0, num_btype = 0;
-   for(std::size_t i = 0; i < gemm_shapes.size(); i++)
+   for(std::size_t i = 0; i < gemm_descs.size(); i++)
    {
-       a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
-           gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
-       b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
-           gemm_shapes[i].K, gemm_shapes[i].N, gemm_shapes[i].StrideB, BLayout{})));
-       c_host_tensors.push_back(Tensor<CDataType>(f_host_tensor_descriptor(
-           gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
-       c_device_tensors.push_back(Tensor<CDataType>(f_host_tensor_descriptor(
-           gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
+       a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
+           gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{})));
+       b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
+           gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{})));
+       c_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
+           gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
+       c_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
+           gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));

        std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc
                  << " b_k_n: " << b_tensors[i].mDesc << " c_m_n: " << c_device_tensors[i].mDesc
                  << std::endl;

-       flop += std::size_t(2) * gemm_shapes[i].M * gemm_shapes[i].K * gemm_shapes[i].N;
+       flop += std::size_t(2) * gemm_descs[i].M_ * gemm_descs[i].K_ * gemm_descs[i].N_;
        num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() +
                     sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() +
-                    sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSize();
+                    sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSize();
        switch(init_method)
        {
...@@ -168,14 +177,14 @@ int main(int argc, char* argv[])
        }
    }
-   for(std::size_t i = 0; i < gemm_shapes.size(); i++)
+   for(std::size_t i = 0; i < gemm_descs.size(); i++)
    {
        a_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize()));
        b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize()));
        c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
-           sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSpaceSize()));
+           sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSpaceSize()));

        a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
        b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
...@@ -187,14 +196,16 @@ int main(int argc, char* argv[])
    auto a_element_op = AElementOp{};
    auto b_element_op = BElementOp{};
-   auto c_element_op = CElementOp{};
+   auto c_element_op = CDEElementOp{};
    auto gemm    = DeviceGemmInstance{};
    auto invoker = gemm.MakeInvoker();
std::vector<std::array<const void*, 0>> p_Ds = {};
    // do GEMM
-   auto argument =
-       gemm.MakeArgument(p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op);
+   auto argument = gemm.MakeArgument(
+       p_a, p_b, p_Ds, p_c, gemm_descs, a_element_op, b_element_op, c_element_op);
    DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument));
...@@ -219,7 +230,7 @@ int main(int argc, char* argv[])
    bool pass = true;
    if(do_verification)
    {
-       for(std::size_t i = 0; i < gemm_shapes.size(); i++)
+       for(std::size_t i = 0; i < gemm_descs.size(); i++)
        {
            c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
            auto ref_gemm = ReferenceGemmInstance{};
...
...@@ -26,31 +26,29 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using ADataType = ck::half_t;
-using BDataType = ck::half_t;
-using AccDataType = float;
-using CShuffleDataType = ck::half_t;
-using EDataType = ck::half_t;
+using ADataType = F16;
+using BDataType = F16;
+using AccDataType = F32;
+using CShuffleDataType = F16;
+using EDataType = F16;

-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;

-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = PassThrough;

-// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-// static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
-static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-// clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmEPermuteXdl
-//######| ALayout| BLayout| AData| BData| AccData| CShuffle| EData| A| B| C| GEMM| Num| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-//######| | | Type| Type| Type| Data| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######| | | | | | Type| | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-// < Row, Col, F16, F16, F32, F16, F16, PassThrough, PassThrough, PassThrough, MNPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>;
-        < Row, Col, F16, F16, F32, F16, F16, PassThrough, PassThrough, PassThrough, MNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>;
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmEPermuteXdl
+// clang-format off
+//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | Type| Type| Type| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
...@@ -62,15 +60,18 @@ int main(int argc, char* argv[])
    int init_method  = 1;
    bool time_kernel = false;
-   const int M = 88;
-   const int N = 64;
-   const int K = 88;
+   const int M = 256;
+   const int N = 128;
+   const int K = 64;

    const int stride_A = K;
    const int stride_B = K;

-   const int G0 = 1024;
-   const int G1 = 10;
+   const int batch_stride_A = M * K;
+   const int batch_stride_B = K * N;
+
+   const int G0 = 16;
+   const int G1 = 8;

    const int batch_count = G0 * G1;
...@@ -102,21 +103,24 @@ int main(int argc, char* argv[])
                                       std::size_t row,
                                       std::size_t col,
                                       std::size_t stride,
std::size_t batch_stride,
                                       auto layout) {
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
-                                       std::vector<std::size_t>({row * stride, stride, 1}));
+                                       std::vector<std::size_t>({batch_stride, stride, 1}));
        }
        else
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
-                                       std::vector<std::size_t>({col * stride, 1, stride}));
+                                       std::vector<std::size_t>({batch_stride, 1, stride}));
        }
    };
-   Tensor<ADataType> a_g_m_k(f_host_tensor_descriptor(batch_count, M, K, stride_A, ALayout{}));
-   Tensor<BDataType> b_g_k_n(f_host_tensor_descriptor(batch_count, K, N, stride_B, BLayout{}));
+   Tensor<ADataType> a_g_m_k(
+       f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
+   Tensor<BDataType> b_g_k_n(
+       f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
    auto f_host_e_tensor_descriptor = [](std::size_t G0_,
                                         std::size_t G1_,
...@@ -169,7 +173,7 @@ int main(int argc, char* argv[])
    auto gemm    = DeviceGemmInstance{};
    auto invoker = gemm.MakeInvoker();
    // do GEMM
    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
                                      static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
...@@ -178,11 +182,13 @@ int main(int argc, char* argv[])
                                      K,
                                      stride_A,
                                      stride_B,
                                      batch_stride_A,
                                      batch_stride_B,
                                      batched_gemm_e_permute_desc,
                                      batch_count,
                                      a_element_op,
                                      b_element_op,
-                                     cde_element_op,
-                                     batch_count);
+                                     cde_element_op);
    if(!gemm.IsSupportedArgument(argument))
    {
...
add_example_executable(example_grouped_gemm_bias_xdl_fp16 grouped_gemm_bias_xdl_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F16;
using DDataType = F16;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using DLayout = Row;
using DsLayout = ck::Tuple<DLayout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Add;
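// For intuition only: a simplified sketch (not the CK source) of what an
// Add-style CDE epilogue does. The kernel combines the GEMM result c with the
// bias value d element-wise, e = c + d, before writing E out.
struct AddSketch
{
    template <typename E, typename C, typename D>
    void operator()(E& e, const C& c, const D& d) const
    {
        e = static_cast<E>(c + d);
    }
};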
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
// clang-format off
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0);
}
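    // note: rand() is never seeded in this example, so group_count is
    // reproducible from run to run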
int group_count = rand() % 16 + 1;
// GEMM shape
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
std::vector<const void*> p_a, p_b;
std::vector<std::array<const void*, 1>> p_ds;
std::vector<void*> p_c;
gemm_descs.reserve(group_count);
for(int i = 0; i < group_count; i++)
{
int M = 256 + 256 * i;
int N = 128 + 128 * i;
int K = 64 + 64 * i;
int stride_A = K;
int stride_B = K;
int stride_C = N;
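        // a row stride of 0 for D broadcasts the single bias row of length N
        // across all M rows of the output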
std::vector<ck::index_t> stride_Ds = {0};
gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, stride_Ds});
}
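    // Rough shape of ck::tensor_operation::device::GemmDesc as used here
    // (field names inferred from the accesses below; see the CK device headers
    // for the real definition):
    //   { index_t M_, N_, K_;
    //     index_t stride_A_, stride_B_, stride_C_;
    //     std::vector<index_t> stride_Ds_; }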
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
std::vector<Tensor<ADataType>> a_tensors;
std::vector<Tensor<BDataType>> b_tensors;
std::vector<Tensor<DDataType>> d_tensors;
std::vector<Tensor<EDataType>> e_host_tensors;
std::vector<Tensor<EDataType>> e_device_tensors;
a_tensors.reserve(group_count);
b_tensors.reserve(group_count);
d_tensors.reserve(group_count);
e_host_tensors.reserve(group_count);
e_device_tensors.reserve(group_count);
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
std::vector<DeviceMemPtr> a_tensors_device, b_tensors_device, d_tensors_device,
e_tensors_device;
a_tensors_device.reserve(group_count);
b_tensors_device.reserve(group_count);
d_tensors_device.reserve(group_count);
e_tensors_device.reserve(group_count);
std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{})));
b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{})));
d_tensors.push_back(Tensor<DDataType>(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_Ds_[0], ELayout{})));
e_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
e_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc
<< " b_k_n: " << b_tensors[i].mDesc << " c_m_n: " << e_device_tensors[i].mDesc
<< std::endl;
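        // 2*M*N*K FLOPs: each of the M*N outputs needs K multiply-add pairs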
flop += std::size_t(2) * gemm_descs[i].M_ * gemm_descs[i].K_ * gemm_descs[i].N_;
num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() +
sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() +
sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSize();
switch(init_method)
{
case 0: break;
case 1:
a_tensors[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
case 2:
a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
d_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
}
}
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
a_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize()));
b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize()));
d_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(DDataType) * d_tensors[i].mDesc.GetElementSpaceSize()));
e_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSpaceSize()));
a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
d_tensors_device[i]->ToDevice(d_tensors[i].mData.data());
p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
p_b.push_back(b_tensors_device[i]->GetDeviceBuffer());
p_ds.push_back({d_tensors_device[i]->GetDeviceBuffer()});
p_c.push_back(e_tensors_device[i]->GetDeviceBuffer());
}
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
// do GEMM
auto argument = gemm.MakeArgument(
p_a, p_b, p_ds, p_c, gemm_descs, a_element_op, b_element_op, cde_element_op);
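    // the grouped kernel reads its per-group problem descriptors from device
    // memory, so a workspace is allocated and attached to the argument below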
DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument));
gemm.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer());
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
EDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data());
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(a_tensors[i],
b_tensors[i],
e_host_tensors[i],
a_element_op,
b_element_op,
PassThrough{});
ref_invoker.Run(ref_argument);
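            // fold the bias in on the host: e = c + d, mirroring the
            // device-side Add epilogue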
for(int m = 0; m < gemm_descs[i].M_; ++m)
{
for(int n = 0; n < gemm_descs[i].N_; ++n)
{
cde_element_op(
e_host_tensors[i](m, n), e_host_tensors[i](m, n), d_tensors[i](m, n));
}
}
pass &= ck::utils::check_err(e_device_tensors[i].mData, e_host_tensors[i].mData);
}
}
return pass ? 0 : 1;
}
add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp)
add_example_executable(example_batched_gemm_bias_xdl_fp16 batched_gemm_bias_xdl_fp16.cpp)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F16;
using DDataType = F16;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using DLayout = Row;
using DsLayout = ck::Tuple<DLayout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Add;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
const int M = 256 * (rand() % 16 + 1);
const int N = 128 * (rand() % 16 + 1);
const int K = 64 * (rand() % 16 + 1);
const int stride_A = K;
const int stride_B = K;
const int stride_D = 0;
const int stride_E = N;
const int batch_stride_A = M * K;
const int batch_stride_B = K * N;
const int batch_stride_D = N;
const int batch_stride_E = M * N;
const int batch_count = 16;
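    // D holds one bias row of length N per batch: stride_D = 0 broadcasts the
    // row across M, while batch_stride_D = N advances to the next batch's row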
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0);
}
// GEMM shape
auto f_host_tensor_descriptor = [](std::size_t batch_count_,
std::size_t row,
std::size_t col,
std::size_t stride,
std::size_t batch_stride,
auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
std::vector<std::size_t>({batch_stride, stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
}
};
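    // For intuition: with lengths {G, row, col} and strides
    // {batch_stride, stride, 1} (the row-major case above), element (g, r, c)
    // sits at the linear offset computed below; a hedged illustration only,
    // not used by the example.
    auto linear_offset = [](std::size_t g, std::size_t r, std::size_t c,
                            std::size_t batch_stride, std::size_t stride) {
        return g * batch_stride + r * stride + c;
    };
    (void)linear_offset;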
Tensor<ADataType> a_g_m_k(
f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
Tensor<BDataType> b_g_k_n(
f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
Tensor<DDataType> d_g_m_n(
f_host_tensor_descriptor(batch_count, M, N, stride_D, batch_stride_D, DLayout{}));
Tensor<EDataType> e_g_m_n_device_result(
f_host_tensor_descriptor(batch_count, M, N, stride_E, batch_stride_E, ELayout{}));
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
std::cout << "d_g_m_n: " << d_g_m_n.mDesc << std::endl;
std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d_g_m_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d_g_m_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
}
DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(DDataType) * d_g_m_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_g_m_k.mData.data());
b_device_buf.ToDevice(b_g_k_n.mData.data());
d_device_buf.ToDevice(d_g_m_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
// do GEMM
auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{d_device_buf.GetDeviceBuffer()},
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
batch_count,
stride_A,
stride_B,
{stride_D},
stride_E,
batch_stride_A,
batch_stride_B,
{batch_stride_D},
batch_stride_E,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * batch_count * M * N * K;
std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
sizeof(BDataType) * batch_count * K * N +
sizeof(EDataType) * batch_count * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
c_device_buf.FromDevice(e_g_m_n_device_result.mData.data());
using ReferenceBatchedGemmInstance =
ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
BDataType,
EDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
auto ref_invoker = ref_batched_gemm.MakeInvoker();
Tensor<EDataType> e_g_m_n_host_result(
f_host_tensor_descriptor(batch_count, M, N, stride_E, batch_stride_E, ELayout{}));
auto ref_argument = ref_batched_gemm.MakeArgument(
a_g_m_k, b_g_k_n, e_g_m_n_host_result, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
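        // apply the per-batch bias on the host reference: e = c + d, matching
        // the device-side Add epilogue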
for(int g = 0; g < batch_count; g++)
{
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_g_m_n_host_result(g, m, n),
e_g_m_n_host_result(g, m, n),
d_g_m_n(g, m, n));
}
}
}
pass = ck::utils::check_err(
e_g_m_n_host_result.mData, e_g_m_n_device_result.mData, "Error: Incorrect results c");
}
return pass ? 0 : 1;
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F16;
using DsDataType = ck::Tuple<>;
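// an empty Ds tuple means no auxiliary tensors feed the epilogue, so the CDE
// element-wise op is a plain PassThrough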
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
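// One tuned instance: 256-thread blocks on 256x128 output tiles with KPerBlock 32,
// 32x32 XDL ops iterated 4x2 per wave, and 8-element vector loads/stores.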
using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
ReferenceBatchedGemm<ADataType, BDataType, EDataType, AElementOp, BElementOp, CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
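// Randomized problem sizes are kept as multiples of the block tile (M of 256,
// N of 128, K of 64) so the GemmSpecialization::Default instance, which does
// no padding, remains applicable.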
const int M = 256 * (rand() % 16 + 1);
const int N = 128 * (rand() % 16 + 1);
const int K = 64 * (rand() % 16 + 1);
const int stride_A = K;
const int stride_B = K;
const int stride_C = N;
const int batch_stride_A = M * K;
const int batch_stride_B = K * N;
const int batch_stride_C = M * N;
const int batch_count = 16;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0);
}
// GEMM shape
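// Host tensors are 3-D [batch, row, col]: the layout decides whether the row
// or the column stride is 1, and the leading dimension carries batch_stride.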
auto f_host_tensor_descriptor = [](std::size_t batch_count_,
std::size_t row,
std::size_t col,
std::size_t stride,
std::size_t batch_stride,
auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
std::vector<std::size_t>({batch_stride, stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
}
};
Tensor<ADataType> a_g_m_k(
f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
Tensor<BDataType> b_g_k_n(
f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
Tensor<EDataType> e_g_m_n_device_result(
f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
}
DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_g_m_k.mData.data());
b_device_buf.ToDevice(b_g_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
// do GEMM
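// DsDataType is an empty tuple, so the D pointer, stride, and batch-stride
// slots are passed as empty arrays ({}).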
auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
batch_count,
stride_A,
stride_B,
{},
stride_C,
batch_stride_A,
batch_stride_B,
{},
batch_stride_C,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
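// ave_time is in ms: flop / 1e9 / ms yields TFLOPS, bytes / 1e6 / ms yields GB/s.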
std::size_t flop = std::size_t(2) * batch_count * M * N * K;
std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
sizeof(BDataType) * batch_count * K * N +
sizeof(EDataType) * batch_count * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
c_device_buf.FromDevice(e_g_m_n_device_result.mData.data());
auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
auto ref_invoker = ref_batched_gemm.MakeInvoker();
Tensor<EDataType> e_g_m_n_host_result(
f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
auto ref_argument = ref_batched_gemm.MakeArgument(
a_g_m_k, b_g_k_n, e_g_m_n_host_result, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
pass = ck::utils::check_err(
e_g_m_n_host_result.mData, e_g_m_n_device_result.mData, "Error: Incorrect results c");
}
return pass ? 0 : 1;
}
...@@ -42,4 +42,6 @@ add_subdirectory(24_batched_gemm_e_permute) ...@@ -42,4 +42,6 @@ add_subdirectory(24_batched_gemm_e_permute)
add_subdirectory(25_gemm_bias_e_permute) add_subdirectory(25_gemm_bias_e_permute)
add_subdirectory(26_contraction) add_subdirectory(26_contraction)
add_subdirectory(27_layernorm) add_subdirectory(27_layernorm)
add_subdirectory(28_group_convnd_fwd_bias_relu) add_subdirectory(28_grouped_gemm_bias)
add_subdirectory(29_batched_gemm_multi_d)
add_subdirectory(30_grouped_convnd_fwd_bias_relu)
...@@ -14,7 +14,13 @@ struct BatchedGemmEPermuteDesc ...@@ -14,7 +14,13 @@ struct BatchedGemmEPermuteDesc
ck::index_t stride_G0_, stride_G1_, stride_M_, stride_N_; ck::index_t stride_G0_, stride_G1_, stride_M_, stride_N_;
}; };
template <typename AElementwiseOperation, template <typename ALayout,
typename BLayout,
typename DELayout,
typename ADataType,
typename BDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation, typename BElementwiseOperation,
typename CDEElementwiseOperation> typename CDEElementwiseOperation>
struct DeviceBatchedGemmEPermute : public BaseOperator struct DeviceBatchedGemmEPermute : public BaseOperator
...@@ -28,11 +34,13 @@ struct DeviceBatchedGemmEPermute : public BaseOperator ...@@ -28,11 +34,13 @@ struct DeviceBatchedGemmEPermute : public BaseOperator
index_t K, index_t K,
index_t stride_A, index_t stride_A,
index_t stride_B, index_t stride_B,
index_t batch_stride_A,
index_t batch_stride_B,
BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
index_t BatchCount,
AElementwiseOperation a_element_op, AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op, BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op, CDEElementwiseOperation cde_element_op) = 0;
ck::index_t BatchCount) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
}; };
......
...@@ -118,6 +118,7 @@ __global__ void ...@@ -118,6 +118,7 @@ __global__ void
template <typename ALayout, template <typename ALayout,
typename BLayout, typename BLayout,
typename ELayout,
typename ADataType, typename ADataType,
typename BDataType, typename BDataType,
typename AccDataType, typename AccDataType,
...@@ -157,7 +158,13 @@ template <typename ALayout, ...@@ -157,7 +158,13 @@ template <typename ALayout,
typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CDEBlockTransferScalarPerVector_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()> LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementwiseOperation, struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
BLayout,
ELayout,
ADataType,
BDataType,
EDataType,
AElementwiseOperation,
BElementwiseOperation, BElementwiseOperation,
CDEElementwiseOperation> CDEElementwiseOperation>
{ {
...@@ -389,11 +396,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw ...@@ -389,11 +396,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
index_t K, index_t K,
index_t stride_A, index_t stride_A,
index_t stride_B, index_t stride_B,
index_t batch_stride_A,
index_t batch_stride_B,
BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
index_t BatchCount,
AElementwiseOperation a_element_op, AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op, BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op, CDEElementwiseOperation cde_element_op)
index_t BatchCount)
: p_a_grid_{p_a_grid}, : p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid}, p_b_grid_{p_b_grid},
p_e_grid_{p_e_grid}, p_e_grid_{p_e_grid},
...@@ -419,10 +428,7 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw ...@@ -419,10 +428,7 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
batched_gemm_e_permute_desc.stride_G1_, batched_gemm_e_permute_desc.stride_G1_,
batched_gemm_e_permute_desc.stride_M_, batched_gemm_e_permute_desc.stride_M_,
batched_gemm_e_permute_desc.stride_N_)}, batched_gemm_e_permute_desc.stride_N_)},
compute_ptr_offset_of_batch_{ compute_ptr_offset_of_batch_{batch_stride_A, batch_stride_B, e_grid_desc_g0_g1_m_n_},
type_convert<index_t>(a_grid_desc_ak0_m_ak1_.GetElementSpaceSize()),
type_convert<index_t>(b_grid_desc_bk0_n_bk1_.GetElementSpaceSize()),
e_grid_desc_g0_g1_m_n_},
block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
a_element_op_{a_element_op}, a_element_op_{a_element_op},
b_element_op_{b_element_op}, b_element_op_{b_element_op},
...@@ -584,11 +590,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw ...@@ -584,11 +590,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
index_t K, index_t K,
index_t stride_A, index_t stride_A,
index_t stride_B, index_t stride_B,
index_t batch_stride_A,
index_t batch_stride_B,
BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
index_t BatchCount,
AElementwiseOperation a_element_op, AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op, BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op, CDEElementwiseOperation cde_element_op)
index_t BatchCount)
{ {
return Argument{p_a, return Argument{p_a,
p_b, p_b,
...@@ -598,11 +606,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw ...@@ -598,11 +606,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
K, K,
stride_A, stride_A,
stride_B, stride_B,
batch_stride_A,
batch_stride_B,
batched_gemm_e_permute_desc, batched_gemm_e_permute_desc,
BatchCount,
a_element_op, a_element_op,
b_element_op, b_element_op,
cde_element_op, cde_element_op};
BatchCount};
} }
static auto MakeInvoker() { return Invoker{}; } static auto MakeInvoker() { return Invoker{}; }
...@@ -617,11 +627,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw ...@@ -617,11 +627,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
index_t K, index_t K,
index_t stride_A, index_t stride_A,
index_t stride_B, index_t stride_B,
index_t batch_stride_A,
index_t batch_stride_B,
BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
index_t BatchCount,
AElementwiseOperation a_element_op, AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op, BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op, CDEElementwiseOperation cde_element_op) override
index_t BatchCount) override
{ {
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a), return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b), static_cast<const BDataType*>(p_b),
...@@ -631,11 +643,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw ...@@ -631,11 +643,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
K, K,
stride_A, stride_A,
stride_B, stride_B,
batch_stride_A,
batch_stride_B,
batched_gemm_e_permute_desc, batched_gemm_e_permute_desc,
BatchCount,
a_element_op, a_element_op,
b_element_op, b_element_op,
cde_element_op, cde_element_op);
BatchCount);
} }
// polymorphic // polymorphic
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include <iostream>
#include <memory>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
typename ADataType,
typename BDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceBatchedGemmMultiD : public BaseOperator
{
static constexpr index_t NumDTensor = DsDataType::Size();
static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsistent NumDTensor");
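// Type-erased argument factory: the concrete device operation builds its
// Argument from raw pointers plus sizes, strides, and batch strides; the Ds
// pointer/stride arrays are statically sized by NumDTensor.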
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
const std::array<const void*, NumDTensor>& p_ds,
void* p_e,
index_t M,
index_t N,
index_t K,
index_t Batch,
index_t StrideA,
index_t StrideB,
const std::array<ck::index_t, NumDTensor>& StrideDs,
index_t StrideE,
index_t BatchStrideA,
index_t BatchStrideB,
const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
index_t BatchStrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
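For orientation, a minimal sketch of driving this interface through the type-erased pointer API; it assumes the DeviceGemmInstance alias and the buffers, sizes, strides, and PassThrough ops from the example earlier in this commit, and is an illustration rather than part of the change:

// Hedged sketch: exercise DeviceBatchedGemmMultiD through its virtual API.
// DeviceGemmInstance, the device buffers, and M/N/K/stride/batch values are
// assumed to be those defined in the example above.
auto gemm    = DeviceGemmInstance{};
auto invoker = gemm.MakeInvokerPointer();
auto arg     = gemm.MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                        b_device_buf.GetDeviceBuffer(),
                                        {}, // no D pointers (NumDTensor == 0)
                                        c_device_buf.GetDeviceBuffer(),
                                        M, N, K, batch_count,
                                        stride_A, stride_B,
                                        {}, // no D strides
                                        stride_C,
                                        batch_stride_A, batch_stride_B,
                                        {}, // no D batch strides
                                        batch_stride_C,
                                        PassThrough{}, PassThrough{}, PassThrough{});
if(!gemm.IsSupportedArgument(arg.get()))
    throw std::runtime_error("unsupported problem for this instance");
invoker->Run(arg.get(), StreamConfig{nullptr, false});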
...@@ -12,12 +12,6 @@ namespace ck { ...@@ -12,12 +12,6 @@ namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace device { namespace device {
struct GemmShape
{
ck::index_t M, N, K;
ck::index_t StrideA, StrideB, StrideC;
};
template <typename ALayout, template <typename ALayout,
typename BLayout, typename BLayout,
typename CLayout, typename CLayout,
...@@ -46,29 +40,6 @@ struct DeviceGemm : public BaseOperator ...@@ -46,29 +40,6 @@ struct DeviceGemm : public BaseOperator
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
}; };
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct DeviceGroupedGemm : public BaseOperator
{
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(std::vector<const void*>& p_a,
std::vector<const void*>& p_b,
std::vector<void*>& p_c,
std::vector<GemmShape>& gemm_shapes,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
ck::index_t KBatch = 1) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceGroupedGemmPtr = std::unique_ptr<
DeviceGroupedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
...@@ -349,13 +349,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout, ...@@ -349,13 +349,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
e_grid_desc_m_n_, e_grid_desc_m_n_,
block_2_etile_map_)) block_2_etile_map_))
{ {
e_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
e_grid_desc_m_n_);
ds_grid_desc_mblock_mperblock_nblock_nperblock_ = ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
ds_grid_desc_m_n_); ds_grid_desc_m_n_);
e_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
e_grid_desc_m_n_);
} }
} }
...@@ -411,8 +411,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout, ...@@ -411,8 +411,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
arg.e_grid_desc_m_n_, arg.e_grid_desc_m_n_,
arg.block_2_etile_map_)) arg.block_2_etile_map_))
{ {
throw std::runtime_error( throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
"wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting");
} }
const index_t grid_size = const index_t grid_size =
......