Rebase branch 'develop' of...

Rebase branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into contraction_hipTENSOR

Rebase branch 'develop' of...
Rebase branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into contraction_hipTENSOR
aea62819 · Chaitanya Inumella · 75af5450 · 75ab874e · aea62819 · aea62819
Commit aea62819 authored Aug 03, 2022 by Chaitanya Inumella
20 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@ FROM ubuntu:18.04

 ARG ROCMVERSION=5.1
 ARG OSDB_BKC_VERSION
+ARG compiler_version

 RUN set -xe

@@ -23,8 +24,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    cmake-data=3.15.1-0kitware1 \
    cmake=3.15.1-0kitware1 \
    curl \
-    g++ \
-    gdb \
    git \
    hip-rocclr \
    jq \
@@ -61,17 +60,7 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
 RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
 RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb

-# Install cget
-RUN pip install cget
-
-# Install rclone
-RUN pip install https://github.com/pfultz2/rclone/archive/master.tar.gz
-
 ARG PREFIX=/opt/rocm
-# Install dependencies
-RUN cget install pfultz2/rocm-recipes
-# Install rbuild
-RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/6d78a0553babdaea8d2da5de15cbda7e869594b8.tar.gz
 # Install packages for processing the performance results
 RUN pip3 install --upgrade pip
 RUN pip3 install sqlalchemy
@@ -84,12 +73,26 @@ ENV UBSAN_OPTIONS=print_stacktrace=1

 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
-ADD rbuild.ini /rbuild.ini
 ADD dev-requirements.txt dev-requirements.txt
-RUN rbuild prepare -s develop -d $PREFIX
 RUN groupadd -f render

 # Install the new rocm-cmake version
 RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git  && \
  cd rocm-cmake && mkdir build && cd build && \
  cmake  .. && cmake --build . && cmake --build . --target install
+
+WORKDIR /
+
+ENV compiler_version=$compiler_version
+RUN sh -c "echo compiler version = '$compiler_version'"
+
+RUN if [ "$compiler_version" = "9110" ]; then \
+        git clone -b ck-9110 https://github.com/RadeonOpenCompute/llvm-project.git && \
+        cd llvm-project && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+        make -j 8 ; \
+    else echo "using the release compiler"; \
+    fi
+
+#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
+#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -11,6 +11,13 @@ def show_node_info() {
    """
 }

+def runShell(String command){
+    def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
+    def output = readFile(file: "tmp.txt")
+    echo "tmp.txt contents: $output"
+    return (output != "")
+}
+
 def cmake_build(Map conf=[:]){

    def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
@@ -60,7 +67,7 @@ def cmake_build(Map conf=[:]){
        """
    def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args}   .. ")
    // reduce parallelism when compiling, clang uses too much memory
-    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make  -j\$(( \$(nproc) / 1 )) ${config_targets}")
+    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make  -j\$(( \$(nproc) / 2 )) ${config_targets}")
    def execute_cmd = conf.get("execute_cmd", "")

    def cmd = conf.get("cmd", """
@@ -95,49 +102,56 @@ def buildHipClangJob(Map conf=[:]){
        if (conf.get("enforce_xnack_on", false)) {
            dockerOpts = dockerOpts + " --env HSA_XNACK=1"
        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
+        def dockerArgs
+        if (params.USE_9110){
+            dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='9110' "
+            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+        }
+        else{
+            dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+        }

        def variant = env.STAGE_NAME

        def retimage

        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
-            if (params.USE_DOCKERFILE){
-                try {
-                    retimage = docker.build("${image}", dockerArgs + '.')
-                    withDockerContainer(image: image, args: dockerOpts) {
-                        timeout(time: 5, unit: 'MINUTES')
-                        {
-                            sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+            try {
+                retimage = docker.build("${image}", dockerArgs + '.')
+                withDockerContainer(image: image, args: dockerOpts) {
+                    timeout(time: 5, unit: 'MINUTES'){
+                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                        if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                            throw new Exception ("GPU not found")
                        }
-                    }
-                }
-                catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
-                    echo "The job was cancelled or aborted"
-                    throw e
-                }
-                catch(Exception ex) {
-                    retimage = docker.build("${image}", dockerArgs + "--no-cache .")
-                    withDockerContainer(image: image, args: dockerOpts) {
-                        timeout(time: 5, unit: 'MINUTES')
-                        {
-                            sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+                        else{
+                            echo "GPU is OK"
                        }
                    }
                }
            }
-            else{
-                timeout(time: 3, unit: 'HOURS'){
-                    retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
-                    image="b56f8ac0d6ea"
-                    sh "docker images"
+            catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+                echo "The job was cancelled or aborted"
+                throw e
+            }
+            catch(Exception ex) {
+                retimage = docker.build("${image}", dockerArgs + " --no-cache .")
+                withDockerContainer(image: image, args: dockerOpts) {
+                    timeout(time: 5, unit: 'MINUTES'){
+                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
+                        if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                            throw new Exception ("GPU not found")
+                        }
+                        else{
+                            echo "GPU is OK"
+                        }
+                    }
                }
            }

            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
                timeout(time: 5, unit: 'HOURS')
                {
-                    sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
                    cmake_build(conf)
                }
            }
@@ -149,10 +163,6 @@ def reboot(){
    build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
 }

-
-
-
-
 def buildHipClangJobAndReboot(Map conf=[:]){
    try{
        buildHipClangJob(conf)
@@ -169,7 +179,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
    }
 }

-
 def runCKProfiler(Map conf=[:]){
        show_node_info()

@@ -186,96 +195,103 @@ def runCKProfiler(Map conf=[:]){
        if (conf.get("enforce_xnack_on", false)) {
            dockerOpts = dockerOpts + " --env HSA_XNACK=1"
        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
+        def dockerArgs
+        if (params.USE_9110){
+            dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='9110' "
+            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+        }
+        else{
+            dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+        }

        def variant = env.STAGE_NAME
-
        def retimage

        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
-            if (params.USE_DOCKERFILE){
-                try {
-                    retimage = docker.build("${image}", dockerArgs + '.')
-                    withDockerContainer(image: image, args: dockerOpts) {
-                        timeout(time: 5, unit: 'MINUTES')
-                        {
-                            sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+            try {
+                retimage = docker.build("${image}", dockerArgs + '.')
+                withDockerContainer(image: image, args: dockerOpts) {
+                    timeout(time: 5, unit: 'MINUTES'){
+                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                        if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                            throw new Exception ("GPU not found")
                        }
-                    }
-                }
-                catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
-                    echo "The job was cancelled or aborted"
-                    throw e
-                }
-                catch(Exception ex) {
-                    retimage = docker.build("${image}", dockerArgs + "--no-cache .")
-                    withDockerContainer(image: image, args: dockerOpts) {
-                        timeout(time: 5, unit: 'MINUTES')
-                        {
-                            sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+                        else{
+                            echo "GPU is OK"
                        }
                    }
                }
            }
-            else{
-                timeout(time: 3, unit: 'HOURS'){
-                    retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
-                    image="b56f8ac0d6ea"
-                    sh "docker images"
+            catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+                echo "The job was cancelled or aborted"
+                throw e
+            }
+            catch(Exception ex) {
+                retimage = docker.build("${image}", dockerArgs + " --no-cache .")
+                withDockerContainer(image: image, args: dockerOpts) {
+                    timeout(time: 5, unit: 'MINUTES'){
+                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                        if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                            throw new Exception ("GPU not found")
+                        }
+                        else{
+                            echo "GPU is OK"
+                        }
+                    }
                }
            }

            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-                timeout(time: 5, unit: 'HOURS')
+                timeout(time: 24, unit: 'HOURS')
                {
                    cmake_build(conf)
 					dir("script"){
-                        //run gemm performance tests
-                        def gemm_log = "perf_gemm_${gpu_arch}.log"
-                        sh "rm -f ${gemm_log}"
-                        sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
-                        sh "echo Node name: ${NODE_NAME} >> ${gemm_log}"
-                        sh "echo GPU_arch name: ${gpu_arch}  >> ${gemm_log}"
-                        sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
-                        sh "hipcc --version | grep -e 'HIP version'  >> ${gemm_log}"
-                        sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
-                        sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
-                        //results will be parsed, stored, and analyzed within the python script
-                        //the script will return 0 if the performance criteria are met
-                        //or return 1 if the criteria are not met
-                        archiveArtifacts  "${gemm_log}"
-                        sh "python3 parse_perf_data.py ${gemm_log} "
-                        //run resnet50 test
-                        def resnet_log = "perf_resnet50_${gpu_arch}.log"
-                        sh "rm -f ${resnet_log}"
-                        sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
-                        sh "echo Node name: ${NODE_NAME} >> ${resnet_log}"
-                        sh "echo GPU_arch name: ${gpu_arch}  >> ${resnet_log}"
-                        sh "rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
-                        sh "hipcc --version | grep -e 'HIP version'  >> ${resnet_log}"
-                        sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
-                        //first run tests with N=256
-                        sh "./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}"
-                        //then run with N=4
-                        sh "./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}"
-                        archiveArtifacts  "${resnet_log}"
-                        //the script will put the results from N=256 and N=4 runs into separate tables
-                        sh "python3 parse_perf_data.py ${resnet_log} "
+                        if (params.RUN_FULL_QA){
+                            def qa_log = "qa_${gpu_arch}.log"
+                            if (params.USE_9110){
+                                sh "./run_full_performance_tests.sh 1 QA_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                            }
+                            else{
+                                sh "./run_full_performance_tests.sh 1 QA_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                            }
+                            archiveArtifacts "perf_gemm_${gpu_arch}.log"
+                            archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
+                            archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
+                            archiveArtifacts "perf_batched_gemm_${gpu_arch}.log"
+                            archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
+                            archiveArtifacts "perf_fwd_conv_${gpu_arch}.log"
+                            archiveArtifacts "perf_bwd_conv_${gpu_arch}.log"
+                            archiveArtifacts "perf_fusion_${gpu_arch}.log"
+                            archiveArtifacts "perf_reduction_${gpu_arch}.log"
+                           // stash perf files to master
+                            stash name: "perf_gemm_${gpu_arch}.log"
+                            stash name: "perf_resnet50_N256_${gpu_arch}.log"
+                            stash name: "perf_resnet50_N4_${gpu_arch}.log"
+                            stash name: "perf_batched_gemm_${gpu_arch}.log"
+                            stash name: "perf_grouped_gemm_${gpu_arch}.log"
+                            stash name: "perf_fwd_conv_${gpu_arch}.log"
+                            stash name: "perf_bwd_conv_${gpu_arch}.log"
+                            stash name: "perf_fusion_${gpu_arch}.log"
+                            stash name: "perf_reduction_${gpu_arch}.log"
+                            //we will process results on the master node
+                        }
+                        else{
+                            if (params.USE_9110){
+                                sh "./run_performance_tests.sh 0 CI_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                            }
+                            else{
+                                sh "./run_performance_tests.sh 0 CI_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                            }
+                            archiveArtifacts "perf_gemm_${gpu_arch}.log"
+                            archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
+                            archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
+                            // stash perf files to master
+                            stash name: "perf_gemm_${gpu_arch}.log"
+                            stash name: "perf_resnet50_N256_${gpu_arch}.log"
+                            stash name: "perf_resnet50_N4_${gpu_arch}.log"
+                            //we will process the results on the master node
+                        }
+
 					}
                }
            }
@@ -283,7 +299,6 @@ def runCKProfiler(Map conf=[:]){
        return retimage
 }

-
 def runPerfTest(Map conf=[:]){
    try{
        runCKProfiler(conf)
@@ -300,16 +315,88 @@ def runPerfTest(Map conf=[:]){
    }
 }

+def process_results(Map conf=[:]){
+    env.HSA_ENABLE_SDMA=0
+    checkout scm
+    def image = "composable_kernels"
+    def prefixpath = "/opt/rocm"
+    def gpu_arch = conf.get("gpu_arch", "gfx908")
+
+    // Jenkins is complaining about the render group 
+    def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+    if (conf.get("enforce_xnack_on", false)) {
+        dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+    }
+    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+
+    def variant = env.STAGE_NAME
+    def retimage
+
+    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+        try {
+            retimage = docker.build("${image}", dockerArgs + '.')
+        }
+        catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+            echo "The job was cancelled or aborted"
+            throw e
+        }
+    }
+
+    withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+        timeout(time: 1, unit: 'HOURS'){
+            try{
+                dir("script"){
+                    if (params.RUN_FULL_QA){
+                        // unstash perf files to master
+                        unstash "perf_gemm_${gpu_arch}.log"
+                        unstash "perf_resnet50_N256_${gpu_arch}.log"
+                        unstash "perf_resnet50_N4_${gpu_arch}.log"
+                        unstash "perf_batched_gemm_${gpu_arch}.log"
+                        unstash "perf_grouped_gemm_${gpu_arch}.log"
+                        unstash "perf_fwd_conv_${gpu_arch}.log"
+                        unstash "perf_bwd_conv_${gpu_arch}.log"
+                        unstash "perf_fusion_${gpu_arch}.log"
+                        unstash "perf_reduction_${gpu_arch}.log"
+                        sh "./process_qa_data.sh ${gpu_arch}"
+                    }
+                    else{
+                        // unstash perf files to master
+                        unstash "perf_gemm_${gpu_arch}.log"
+                        unstash "perf_resnet50_N256_${gpu_arch}.log"
+                        unstash "perf_resnet50_N4_${gpu_arch}.log"
+                        sh "./process_perf_data.sh ${gpu_arch}"
+                    }
+                }
+            }
+            catch(e){
+                echo "throwing error exception while processing performance test results"
+                echo 'Exception occurred: ' + e.toString()
+                throw e
+            }
+        }
+    }
+}
+
+//launch develop branch daily at 23:00 in FULL_QA mode
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;USE_9110=true''' : ""
+
 pipeline {
    agent none
+    triggers {
+        parameterizedCron(CRON_SETTINGS)
+    }
    options {
        parallelsAlwaysFailFast()
    }
    parameters {
        booleanParam(
-            name: "USE_DOCKERFILE",
+            name: "USE_9110",
            defaultValue: true,
-            description: "")
+            description: "Select compiler version: 9110 (default) or release")
+        booleanParam(
+            name: "RUN_FULL_QA",
+            defaultValue: false,
+            description: "Select whether to run small set of performance tests (default) or full QA")
    }
    environment{
        dbuser = "${dbuser}"
@@ -369,6 +456,10 @@ pipeline {
                }
                stage("Run Tests: gfx90a")
                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_FULL_QA.toBoolean() }
+                    }
                    agent{ label rocmnode("gfx90a")}
                    environment{
                        setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
@@ -402,6 +493,10 @@ pipeline {
            {
                stage("Run ckProfiler: gfx908")
                {
+                    when {
+                        beforeAgent true
+                        expression { !params.RUN_FULL_QA.toBoolean() }
+                    }
                    agent{ label rocmnode("gfx908")}
                    environment{
                        setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
@@ -412,6 +507,10 @@ pipeline {
                }
                stage("Run ckProfiler: gfx90a")
                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_FULL_QA.toBoolean() }
+                    }
                    agent{ label rocmnode("gfx90a")}
                    environment{
                        setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
@@ -422,6 +521,33 @@ pipeline {
                }
            }
        }
+        stage("Process Performance Test Results")
+        {
+            parallel
+            {
+                stage("Process results for gfx908"){
+                    when {
+                        beforeAgent true
+                        expression { !params.RUN_FULL_QA.toBoolean() }
+                    }
+                    agent { label 'mici' }
+                    steps{
+                        process_results(gpu_arch: "gfx908")
+                    }
+                }
+                stage("Process results for gfx90a"){
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_FULL_QA.toBoolean() }
+                    }
+                    agent { label 'mici' }
+                    steps{
+                        process_results(gpu_arch: "gfx90a")
+                    }
+                }
+            }
+        }
+
        /* enable after the cmake file supports packaging
        stage("Packages") {
            when {

--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ rocm/tensorflow:rocm5.1-tf2.6-dev              \
 /bin/bash
 ```

-# Install the new rocm-cmake version
+# Install newer version of rocm-cmake
 https://github.com/RadeonOpenCompute/rocm-cmake

 ## Build
@@ -54,6 +54,7 @@ make install
 ```

 ## Using CK as pre-built kernel library
+Instructions for using CK as a pre-built kernel library are under ```client_example/```

 ## Caveat
 ### Kernel Timing and Verification

--- a/client_example/01_gemm/gemm.cpp
+++ b/client_example/01_gemm/gemm.cpp
@@ -63,7 +63,7 @@ int main(int argc, char* argv[])
    {
        // use default case
    }
-    else if(argc == 5)
+    else if(argc == 7)
    {
        M = std::stoi(argv[1]);
        N = std::stoi(argv[2]);

--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
@@ -31,11 +31,11 @@ using D0DataType = F16;
 using D1DataType = F16;
 using EDataType  = F16;

-using ALayout   = Row;
-using BLayout   = Col;
-using DDELayout = Row;
-using DDELayout = Row;
-using DELayout  = Row;
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout  = Row;

 struct SimpleDeviceMem
 {
@@ -105,16 +105,16 @@ int main(int argc, char* argv[])
    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
    SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
-                                      f_matrix_space_size(M, N, StrideD0, DDELayout{}));
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
    SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
-                                      f_matrix_space_size(M, N, StrideD1, DDELayout{}));
-    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
-                                 f_matrix_space_size(M, N, StrideE, DELayout{}));
+                                      f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));

    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
        ALayout,
        BLayout,
-        DDELayout,
+        ck::Tuple<D0Layout, D1Layout>,
+        ELayout,
        ADataType,
        BDataType,
        ck::Tuple<D0DataType, D1DataType>,

--- a/example/01_gemm/gemm_dl_fp16.cpp
+++ b/example/01_gemm/gemm_dl_fp16.cpp
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 template <ck::index_t... Is>
@@ -142,9 +142,9 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
    }

-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/01_gemm/gemm_dl_fp32.cpp
+++ b/example/01_gemm/gemm_dl_fp32.cpp
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 template <ck::index_t... Is>
@@ -141,9 +141,9 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
    }

-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/01_gemm/gemm_dl_int8.cpp
+++ b/example/01_gemm/gemm_dl_int8.cpp
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 template <ck::index_t... Is>
@@ -139,9 +139,9 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
    }

-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -11,9 +11,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

@@ -170,9 +170,9 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
    }

-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -13,9 +13,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 template <ck::index_t... Is>
@@ -155,9 +155,9 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
    }

-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
    }

-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -13,9 +13,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 template <ck::index_t... Is>
@@ -167,9 +167,9 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
    }

-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -11,9 +11,9 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

@@ -51,33 +51,34 @@ using BDataType        = F16;
 using AccDataType      = F32;
 using CShuffleDataType = F32;
 using DDataType        = F16;
-using DsDataType       = ck::Tuple<DDataType>;
 using EDataType        = F16;

-using ALayout  = Row;
-using BLayout  = Col;
-using DELayout = Row;
+using ALayout = Row;
+using BLayout = Col;
+using DLayout = Row;
+using ELayout = Row;

 using AElementOp   = PassThrough;
 using BElementOp   = PassThrough;
 using CDEElementOp = AlphaBetaAdd;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

 using DeviceOpInstance =
    ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
                                                                   BLayout,
-                                                                   DELayout,
+                                                                   ck::Tuple<DLayout>,
+                                                                   ELayout,
                                                                   ADataType,
                                                                   BDataType,
                                                                   AccDataType,
                                                                   CShuffleDataType,
-                                                                   DsDataType,
+                                                                   ck::Tuple<DDataType>,
                                                                   EDataType,
                                                                   AElementOp,
                                                                   BElementOp,
                                                                   CDEElementOp,
-                                                                   GemmDefault,
+                                                                   GemmSpec,
                                                                   1,
                                                                   256,
                                                                   256,
@@ -190,9 +191,9 @@ int main(int argc, char* argv[])

    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DELayout{}));
-    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{}));
-    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -213,10 +214,10 @@ int main(int argc, char* argv[])
        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
    }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpace());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

@@ -47,33 +47,34 @@ using BDataType        = F16;
 using AccDataType      = F32;
 using CShuffleDataType = F16;
 using DDataType        = F16;
-using DsDataType       = ck::Tuple<DDataType>;
 using EDataType        = F16;

 using ALayout = Row;
 using BLayout = Col;
+using DLayout = Row;
 using ELayout = Row;

 using AElementOp   = PassThrough;
 using BElementOp   = PassThrough;
 using CDEElementOp = AddRelu;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

 using DeviceOpInstance =
    ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
                                                                   BLayout,
+                                                                   ck::Tuple<DLayout>,
                                                                   ELayout,
                                                                   ADataType,
                                                                   BDataType,
                                                                   AccDataType,
                                                                   CShuffleDataType,
-                                                                   DsDataType,
+                                                                   ck::Tuple<DDataType>,
                                                                   EDataType,
                                                                   AElementOp,
                                                                   BElementOp,
                                                                   CDEElementOp,
-                                                                   GemmDefault,
+                                                                   GemmSpec,
                                                                   1,
                                                                   256,
                                                                   256,
@@ -191,10 +192,10 @@ int main(int argc, char* argv[])
        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
    }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpace());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

@@ -43,6 +43,7 @@ using ALayout  = Row;
 using BLayout  = Col;
 using D0Layout = Row;
 using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
 using ELayout  = Row;

 using AElementOp   = PassThrough;
@@ -53,11 +54,11 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa

 // clang-format off
 using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
-//######| ALayout| BLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-//######|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-//######|        |        |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 // clang-format on

 int main(int argc, char* argv[])
@@ -156,11 +157,11 @@ int main(int argc, char* argv[])
        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
    }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace());
-    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpace());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());

--- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
-add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util)
--- a/example/06_conv2d_fwd_bias_relu/README.md
+++ b/example/06_conv2d_fwd_bias_relu/README.md
-# Instructions for ```example_conv_xdl_bias_relu```
-
-## Run ```example_conv_xdl_bias_relu```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./bin/example_conv_xdl_bias_relu 0 1 5
-```
-
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-bias_k: dim 1, lengths {256}, strides {1}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.39009 ms, 105.581 TFlops, 239.981 GB/s
-```
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
-
-namespace {
-
-using InDataType  = ck::half_t;
-using WeiDataType = ck::half_t;
-using OutDataType = ck::half_t;
-using AccDataType = float;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InLayout  = ck::tensor_layout::convolution::NHWC;
-using WeiLayout = ck::tensor_layout::convolution::KYXC;
-using OutLayout = ck::tensor_layout::convolution::NHWK;
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
-
-static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-// clang-format off
-using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
-        InDataType,                   // InDataType
-        WeiDataType,                  // WeiDataType
-        OutDataType,                  // OutDataType
-        AccDataType,                  // AccDataType
-        InElementOp,                  // InElementwiseOperation
-        WeiElementOp,                 // WeiElementwiseOperation
-        OutElementOp,                 // OutElementwiseOperation
-        MemorySet,                    // OutGlobalMemoryDataOperation
-        ConvFwdDefault,               // ConvForwardSpecialization
-        256,                          // BlockSize
-        128,                          // MPerBlock
-        256,                          // NPerBlock
-        4,                            // K0PerBlock
-        8,                            // K1
-        32,                           // MPerXdl
-        32,                           // NPerXdl
-        2,                            // MXdlPerWave
-        4,                            // NXdlPerWave
-        S<4, 64, 1>,                  // ABlockTransferThreadClusterLengths_K0_M_K1
-        S<1, 0, 2>,                   // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,                   // ABlockTransferSrcAccessOrder
-        2,                            // ABlockTransferSrcVectorDim
-        8,                            // ABlockTransferSrcScalarPerVector
-        8,                            // ABlockTransferDstScalarPerVector_K1
-        true,                         // ABlockLdsAddExtraM
-        S<4, 64, 1>,                  // BBlockTransferThreadClusterLengths_K0_N_K1
-        S<1, 0, 2>,                   // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,                   // BBlockTransferSrcAccessOrder
-        2,                            // BBlockTransferSrcVectorDim
-        8,                            // BBlockTransferSrcScalarPerVector
-        8,                            // BBlockTransferDstScalarPerVector_K1
-        true,                         // BBlockLdsAddExtraN
-        1,                            // CShuffleMXdlPerWavePerShuffle
-        1,                            // CShuffleNXdlPerWavePerShuffle
-        S<1, 1, 32, 1, 1, 8>,         // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-        8>;                           // CBlockTransferScalarPerVector_NWaveNPerXdl
-// clang-format on
-
-using ReferenceConvFwdInstance =
-    ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation<InDataType,
-                                                                 WeiDataType,
-                                                                 OutDataType,
-                                                                 InElementOp,
-                                                                 WeiElementOp,
-                                                                 OutElementOp>;
-
-void PrintUseMsg()
-{
-    std::cout << "arg1: verification (0=no, 1=yes)\n"
-              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: time kernel (0=n0, 1=yes)\n"
-              << "Following arguments:\n"
-              << " N, K, C, \n"
-              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
-              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
-              << " <strides>, (ie Sy, Sx for 2D)\n"
-              << " <dilations>, (ie Dy, Dx for 2D)\n"
-              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
-              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
-              << std::endl;
-}
-
-ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
-{
-    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
-    int num_dim_spatial = 2;
-    int conv_args       = 3 + num_dim_spatial * 6;
-    int cmdline_nargs   = conv_args + 4;
-    if(cmdline_nargs != argc)
-    {
-        PrintUseMsg();
-        exit(0);
-    }
-
-    ck::utils::conv::ConvParams params;
-    int arg_idx = 4;
-
-    params.num_dim_spatial_ = num_dim_spatial;
-    params.N_               = std::stoi(argv[arg_idx++]);
-    params.K_               = std::stoi(argv[arg_idx++]);
-    params.C_               = std::stoi(argv[arg_idx++]);
-
-    params.filter_spatial_lengths_.resize(num_dim_spatial);
-    for(int i = 0; i < num_dim_spatial; ++i)
-    {
-        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
-    }
-    params.input_spatial_lengths_.resize(num_dim_spatial);
-    for(int i = 0; i < num_dim_spatial; ++i)
-    {
-        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
-    }
-    params.conv_filter_strides_.resize(num_dim_spatial);
-    for(int i = 0; i < num_dim_spatial; ++i)
-    {
-        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
-    }
-    params.conv_filter_dilations_.resize(num_dim_spatial);
-    for(int i = 0; i < num_dim_spatial; ++i)
-    {
-        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
-    }
-    params.input_left_pads_.resize(num_dim_spatial);
-    for(int i = 0; i < num_dim_spatial; ++i)
-    {
-        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
-    }
-    params.input_right_pads_.resize(num_dim_spatial);
-    for(int i = 0; i < num_dim_spatial; ++i)
-    {
-        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
-    }
-
-    return params;
-}
-
-} // anonymous namespace
-
-int main(int argc, char* argv[])
-{
-    using namespace ck::utils::conv;
-
-    bool do_verification      = true;
-    int init_method           = 1;
-    bool time_kernel          = false;
-    const int num_dim_spatial = 2;
-
-    ck::utils::conv::ConvParams params;
-
-    if(argc >= 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-
-    if(argc >= 5)
-    {
-        params = ParseConvParams(argc, argv);
-    }
-
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
-                                        static_cast<std::size_t>(params.C_)};
-    input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths_),
-                      std::end(params.input_spatial_lengths_));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
-                                         static_cast<std::size_t>(params.C_)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths_),
-                       std::end(params.filter_spatial_lengths_));
-
-    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
-                                         static_cast<std::size_t>(params.K_)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
-    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
-    Tensor<OutDataType> host_output(
-        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
-    Tensor<OutDataType> device_output(
-        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
-    // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
-
-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weights: " << weights.mDesc << std::endl;
-    std::cout << "output: " << host_output.mDesc << std::endl;
-    std::cout << "bias: " << bias.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        break;
-    default:
-        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
-    }
-
-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(input.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
-    bias_device_buf.ToDevice(bias.mData.data());
-
-    auto conv    = DeviceConvFwdInstance{};
-    auto invoker = conv.MakeInvoker();
-    auto argument =
-        conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
-                          static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                          static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                          static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
-                          params.N_,
-                          params.K_,
-                          params.C_,
-                          params.input_spatial_lengths_,
-                          params.filter_spatial_lengths_,
-                          output_spatial_lengths,
-                          params.conv_filter_strides_,
-                          params.conv_filter_dilations_,
-                          params.input_left_pads_,
-                          params.input_right_pads_,
-                          InElementOp{},
-                          WeiElementOp{},
-                          OutElementOp{});
-
-    if(!conv.IsSupportedArgument(argument))
-    {
-        throw std::runtime_error(
-            "wrong! device operator with the specified compilation parameters does "
-            "not support this problem");
-    }
-
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-
-    std::size_t flop = get_flops(
-        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
-    std::size_t num_btype =
-        get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
-                                                        params.C_,
-                                                        params.K_,
-                                                        params.input_spatial_lengths_,
-                                                        params.filter_spatial_lengths_,
-                                                        output_spatial_lengths) +
-        sizeof(OutDataType) * (params.K_);
-
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-
-    if(do_verification)
-    {
-        auto ref_conv    = ReferenceConvFwdInstance{};
-        auto ref_invoker = ref_conv.MakeInvoker();
-
-        auto ref_argument = ref_conv.MakeArgument(input,
-                                                  weights,
-                                                  host_output,
-                                                  bias,
-                                                  params.conv_filter_strides_,
-                                                  params.conv_filter_dilations_,
-                                                  params.input_left_pads_,
-                                                  params.input_right_pads_,
-                                                  InElementOp{},
-                                                  WeiElementOp{},
-                                                  OutElementOp{});
-        ref_invoker.Run(ref_argument);
-        out_device_buf.FromDevice(device_output.mData.data());
-        return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
-    }
-
-    return 0;
-}
--- a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
-# FIXME: should fix validation failure
-add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util)
--- a/example/07_conv2d_fwd_bias_relu_add/README.md
+++ b/example/07_conv2d_fwd_bias_relu_add/README.md
-# Instructions for ```example_conv_xdl_bias_relu_add```
-
-
-## Run ```example_conv_xdl_bias_relu_add```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./bin/example_conv_xdl_bias_relu_add 0 1 5
-```
-
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-bias_k: dim 1, lengths {256}, strides {1}
-resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.44711 ms, 101.421 TFlops, 289.218 GB/s
-```