Commit 7e493730 authored by Adam Osewski

Merge branch 'develop' into wavelet_model

parents b89a88b5 40942b90
cff-version: 1.2.0
title: Composable Kernel
message: If you use this software, please cite using the following metadata.
type: software
authors:
- given-names: Chao
family-names: Liu
email: chao.liu2@amd.com
affiliation: AMD
- given-names: Jing
family-names: Zhang
email: jing.zhang3@amd.com
affiliation: AMD
- given-names: Letao
family-names: Qin
email: letao.qin@amd.com
affiliation: AMD
- given-names: Qianfeng
family-names: Zhang
email: qianfeng.zhang@amd.com
affiliation: AMD
- given-names: Liang
family-names: Huang
email: carlus.huang@amd.com
affiliation: AMD
- given-names: Shaojie
family-names: Wang
email: shaojie.wang@amd.com
affiliation: AMD
- given-names: Anthony
family-names: Chang
email: antc@amd.com
affiliation: AMD
- given-names: Chunyu
family-names: Lai
email: chunyu.lai@amd.com
affiliation: AMD
- given-names: Illia
family-names: Silin
email: illia.silin@amd.com
affiliation: AMD
- given-names: Adam
family-names: Osewski
email: adam.osewski@amd.com
affiliation: AMD
- given-names: Poyen
family-names: Chen
email: poyen.chen@amd.com
affiliation: AMD
- given-names: Rosty
family-names: Geyyer
email: rosty.geyyer@amd.com
affiliation: AMD
- given-names: Hanwen
family-names: Chen
- given-names: Tejash
family-names: Shah
- given-names: Xiaoyan
family-names: Zhou
- given-names: Jianfeng
family-names: Yan
repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
abstract: The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures, including GPUs, CPUs, etc., through general-purpose kernel programming languages such as HIP C++.
keywords:
- 'CK, Composable Kernel, Tensor Coordinate Transformation'
license: MIT
license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
# Composable Kernel Developers and Contributors
This is the list of developers and contributors to the Composable Kernel library.
## Developers
[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2022
[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022
[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), 2022
Hanwen Chang, 2019-2021,
Tejash Shah, 2019-2020
Xiaoyan Zhou, 2020
[Jianfeng Yan](https://github.com/j4yan), 2021-2022
## Product Manager
[Jun Liu](https://github.com/junliume)
## Contributors
[Dan Yao](https://github.com/danyao12), [Guangzhao Lu](https://github.com/guangzlu), [Raman Jana](https://github.com/ramjana), [Jehandad Khan](https://github.com/JehandadKhan), [Wen-Heng (Jack) Chung](https://github.com/whchung)
## Acknowledgement
The CK team works closely with Meta's [AITemplate](https://github.com/facebookincubator/AITemplate) team ([Bing Xu](https://github.com/antinucleon), [Hao Lu](https://github.com/hlu1), [Ying Zhang](https://github.com/ipiszy), etc.). Most of the lucrative graph optimization opportunities in ML models were identified by the AITemplate team, and we also co-designed many high-performance fused kernels for AMD GPUs. Without this collaboration, CK would not have reached its current potential.
@PACKAGE_INIT@
-set(_composable_kernel_supported_components device_operations host_tensor)
+set(_composable_kernel_supported_components device_operations utility)
foreach(_comp ${composable_kernel_FIND_COMPONENTS})
if(NOT _comp IN_LIST _composable_kernel_supported_components)
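As a consumer-side illustration (hedged; the version number and component names are taken from this config file and from the `client_example` CMake further down), a downstream project would request only components from the supported list above:
```cmake
# "host_tensor" was replaced by "utility" in the supported-component list above.
find_package(composable_kernel 1.0.0 COMPONENTS device_operations utility)
```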
......
FROM ubuntu:20.04
-ARG ROCMVERSION=5.2.3
+ARG ROCMVERSION=5.3
-ARG compiler_version
+ARG compiler_version="release"
+ARG compiler_commit=""
RUN set -xe
@@ -12,12 +13,13 @@ RUN apt-get install -y wget gnupg
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
-RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
build-essential \
+ccache \
cmake-data \
cmake \
curl \
@@ -68,7 +70,6 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
-ADD dev-requirements.txt dev-requirements.txt
RUN groupadd -f render
# Install the new rocm-cmake version
@@ -79,9 +80,16 @@ RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git &&
WORKDIR /
ENV compiler_version=$compiler_version
+ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'"
+RUN sh -c "echo compiler commit = '$compiler_commit'"
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \
+sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \
+sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \
+fi
+RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
cd llvm-project && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
@@ -89,5 +97,14 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ]; then \
else echo "using the release compiler"; \
fi
+RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \
+git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
+cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
+cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+make -j 8 ; \
+else echo "using the release compiler"; \
+fi
#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
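For local experiments outside CI, a hedged example of building this image with the new build arguments (the argument names and default values come from the Dockerfile above; the image tag is arbitrary):
```bash
# BuildKit is required because the Dockerfile uses `RUN --mount=type=ssh`.
DOCKER_BUILDKIT=1 docker build \
  --build-arg ROCMVERSION=5.3 \
  --build-arg compiler_version=release \
  --build-arg compiler_commit= \
  -t ck:ub20.04_rocm5.3_release \
  -f Dockerfile .
```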
...@@ -19,10 +19,24 @@ def runShell(String command){ ...@@ -19,10 +19,24 @@ def runShell(String command){
} }
def getDockerImageName(){ def getDockerImageName(){
def img = "${env.CK_IMAGE_URL}:ck_ub20.04_rocm5.2.3_${params.COMPILER_VERSION}" def img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
return img return img
} }
def check_host() {
if ("${env.CK_CCACHE}" != "null"){
def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}"
echo "ccache server: ${CCACHE_SERVER}"
sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt'''
def output = readFile(file: "tmp.txt")
echo "tmp.txt contents: \$output"
return (output != "0")
}
else{
return 1
}
}
def build_compiler(){ def build_compiler(){
def compiler def compiler
if (params.BUILD_COMPILER == "hipcc"){ if (params.BUILD_COMPILER == "hipcc"){
...@@ -42,23 +56,22 @@ def build_compiler(){ ...@@ -42,23 +56,22 @@ def build_compiler(){
def getDockerImage(Map conf=[:]){ def getDockerImage(Map conf=[:]){
env.DOCKER_BUILDKIT=1 env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
def gpu_arch = conf.get("gpu_arch", "gfx908") // prebuilt dockers should have all the architectures enabled so one image can be used for all stages
def no_cache = conf.get("no_cache", false) def no_cache = conf.get("no_cache", false)
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' " def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if(env.CCACHE_HOST) echo "ccache server: ${env.CK_CCACHE}"
if(env.CK_CCACHE)
{ {
def check_host = sh(script:"""(printf "PING\r\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim() if(check_host())
if(check_host == "+PONG")
{ {
echo "FOUND CCACHE SERVER: ${CCACHE_HOST}" echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
} }
else else
{ {
echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response" echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
} }
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' " dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store""" env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}""" env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
} }
if(no_cache) if(no_cache)
{ {
...@@ -87,29 +100,37 @@ def buildDocker(install_prefix){ ...@@ -87,29 +100,37 @@ def buildDocker(install_prefix){
checkout scm checkout scm
def image_name = getDockerImageName() def image_name = getDockerImageName()
echo "Building Docker for ${image_name}" echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' " def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if(env.CCACHE_HOST) echo "ccache server: ${env.CK_CCACHE}"
if(env.CK_CCACHE)
{ {
def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim() if(check_host())
if(check_host == "+PONG")
{ {
echo "FOUND CCACHE SERVER: ${CCACHE_HOST}" echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
} }
else else
{ {
echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response" echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
} }
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' " dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store""" env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}""" env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
} }
echo "Build Args: ${dockerArgs}" echo "Build Args: ${dockerArgs}"
try{ try{
if(params.BUILD_DOCKER){
//force building the new docker if that parameter is true
echo "Building image: ${image_name}"
retimage = docker.build("${image_name}", dockerArgs + ' .')
retimage.push()
}
else{
echo "Checking for image: ${image_name}" echo "Checking for image: ${image_name}"
sh "docker manifest inspect --insecure ${image_name}" sh "docker manifest inspect --insecure ${image_name}"
echo "Image: ${image_name} found!! Skipping building image" echo "Image: ${image_name} found!! Skipping building image"
} }
}
catch(Exception ex){ catch(Exception ex){
echo "Unable to locate image: ${image_name}. Building image now" echo "Unable to locate image: ${image_name}. Building image now"
retimage = docker.build("${image_name}", dockerArgs + ' .') retimage = docker.build("${image_name}", dockerArgs + ' .')
...@@ -154,6 +175,11 @@ def cmake_build(Map conf=[:]){ ...@@ -154,6 +175,11 @@ def cmake_build(Map conf=[:]){
}else{ }else{
setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
} }
if(env.CK_CCACHE)
{
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
}
echo "ccache server: ${env.CK_CCACHE}"
def pre_setup_cmd = """ def pre_setup_cmd = """
echo \$HSA_ENABLE_SDMA echo \$HSA_ENABLE_SDMA
...@@ -191,17 +217,15 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -191,17 +217,15 @@ def buildHipClangJob(Map conf=[:]){
env.HSA_ENABLE_SDMA=0 env.HSA_ENABLE_SDMA=0
checkout scm checkout scm
def image = "composable_kernels_${params.COMPILER_VERSION}" def image = getDockerImageName()
def prefixpath = conf.get("prefixpath", "/opt/rocm") def prefixpath = conf.get("prefixpath", "/opt/rocm")
def gpu_arch = conf.get("gpu_arch", "gfx908")
// Jenkins is complaining about the render group // Jenkins is complaining about the render group
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) { if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "release"){ if (params.COMPILER_VERSION != "release"){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
...@@ -210,9 +234,61 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -210,9 +234,61 @@ def buildHipClangJob(Map conf=[:]){
def retimage def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 5, unit: 'HOURS')
{
cmake_build(conf)
}
}
}
return retimage
}
def reboot(){
build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
}
def buildHipClangJobAndReboot(Map conf=[:]){
try{
buildHipClangJob(conf)
}
catch(e){
echo "throwing error exception for the stage"
echo 'Exception occurred: ' + e.toString()
throw e
}
finally{
if (!conf.get("no_reboot", false)) {
reboot()
}
}
}
def runCKProfiler(Map conf=[:]){
show_node_info()
env.HSA_ENABLE_SDMA=0
checkout scm
def image = getDockerImageName()
def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
}
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "release"){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try { try {
//retimage = docker.build("${image}", dockerArgs + '.')
(retimage, image) = getDockerImage(conf) (retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 5, unit: 'MINUTES'){
...@@ -234,7 +310,7 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -234,7 +310,7 @@ def buildHipClangJob(Map conf=[:]){
retimage = docker.build("${image}", dockerArgs + " --no-cache .") retimage = docker.build("${image}", dockerArgs + " --no-cache .")
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log' sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
throw new Exception ("GPU not found") throw new Exception ("GPU not found")
} }
...@@ -246,25 +322,72 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -246,25 +322,72 @@ def buildHipClangJob(Map conf=[:]){
} }
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 5, unit: 'HOURS') timeout(time: 24, unit: 'HOURS')
{ {
cmake_build(conf) //cmake_build(conf)
//instead of building, just unstash the ckProfiler and install it
sh """
rm -rf build
mkdir build
"""
dir("build"){
unstash 'ckProfiler.tar.gz'
sh 'tar -xvf ckProfiler.tar.gz'
}
dir("script"){
if (params.RUN_FULL_QA){
sh "./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
archiveArtifacts "perf_batched_gemm.log"
archiveArtifacts "perf_grouped_gemm.log"
archiveArtifacts "perf_conv_fwd.log"
archiveArtifacts "perf_conv_bwd_data.log"
archiveArtifacts "perf_gemm_bilinear.log"
archiveArtifacts "perf_reduction.log"
archiveArtifacts "perf_splitK_gemm_verify.log"
archiveArtifacts "perf_splitK_gemm.log"
archiveArtifacts "perf_onnx_gemm.log"
// stash perf files to master
stash name: "perf_gemm.log"
stash name: "perf_resnet50_N256.log"
stash name: "perf_resnet50_N4.log"
stash name: "perf_batched_gemm.log"
stash name: "perf_grouped_gemm.log"
stash name: "perf_conv_fwd.log"
stash name: "perf_conv_bwd_data.log"
stash name: "perf_gemm_bilinear.log"
stash name: "perf_reduction.log"
stash name: "perf_splitK_gemm.log"
stash name: "perf_onnx_gemm.log"
//we will process results on the master node
}
else{
sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
// stash perf files to master
stash name: "perf_gemm.log"
stash name: "perf_resnet50_N256.log"
stash name: "perf_resnet50_N4.log"
//we will process the results on the master node
}
}
} }
} }
} }
return retimage return retimage
} }
def reboot(){ def runPerfTest(Map conf=[:]){
build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
}
def buildHipClangJobAndReboot(Map conf=[:]){
try{ try{
buildHipClangJob(conf) runCKProfiler(conf)
} }
catch(e){ catch(e){
echo "throwing error exception for the stage" echo "throwing error exception in performance tests"
echo 'Exception occurred: ' + e.toString() echo 'Exception occurred: ' + e.toString()
throw e throw e
} }
...@@ -275,24 +398,21 @@ def buildHipClangJobAndReboot(Map conf=[:]){ ...@@ -275,24 +398,21 @@ def buildHipClangJobAndReboot(Map conf=[:]){
} }
} }
def runCKProfiler(Map conf=[:]){ def Build_CK(Map conf=[:]){
show_node_info() show_node_info()
env.HSA_ENABLE_SDMA=0 env.HSA_ENABLE_SDMA=0
checkout scm checkout scm
def image = getDockerImageName()
def image = "composable_kernels_${params.COMPILER_VERSION}"
def prefixpath = conf.get("prefixpath", "/opt/rocm") def prefixpath = conf.get("prefixpath", "/opt/rocm")
def gpu_arch = conf.get("gpu_arch", "gfx908")
// Jenkins is complaining about the render group // Jenkins is complaining about the render group
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) { if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "release"){ if (params.COMPILER_VERSION != "release"){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
...@@ -302,7 +422,6 @@ def runCKProfiler(Map conf=[:]){ ...@@ -302,7 +422,6 @@ def runCKProfiler(Map conf=[:]){
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try { try {
//retimage = docker.build("${image}", dockerArgs + '.')
(retimage, image) = getDockerImage(conf) (retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 5, unit: 'MINUTES'){
...@@ -324,7 +443,7 @@ def runCKProfiler(Map conf=[:]){ ...@@ -324,7 +443,7 @@ def runCKProfiler(Map conf=[:]){
retimage = docker.build("${image}", dockerArgs + " --no-cache .") retimage = docker.build("${image}", dockerArgs + " --no-cache .")
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
throw new Exception ("GPU not found") throw new Exception ("GPU not found")
} }
...@@ -334,52 +453,16 @@ def runCKProfiler(Map conf=[:]){ ...@@ -334,52 +453,16 @@ def runCKProfiler(Map conf=[:]){
} }
} }
} }
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS') timeout(time: 24, unit: 'HOURS')
{ {
cmake_build(conf) cmake_build(conf)
dir("script"){ dir("build"){
if (params.RUN_FULL_QA){ //run tests and examples
def qa_log = "qa_${gpu_arch}.log" sh 'make -j check'
sh "./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}" //we only need the ckProfiler to run the performance tests, so we pack and stash it
archiveArtifacts "perf_gemm_${gpu_arch}.log" sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log" stash "ckProfiler.tar.gz"
archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
archiveArtifacts "perf_batched_gemm_${gpu_arch}.log"
archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
archiveArtifacts "perf_conv_fwd_${gpu_arch}.log"
archiveArtifacts "perf_conv_bwd_data_${gpu_arch}.log"
archiveArtifacts "perf_gemm_bilinear_${gpu_arch}.log"
archiveArtifacts "perf_reduction_${gpu_arch}.log"
archiveArtifacts "perf_splitK_gemm_${gpu_arch}.log"
archiveArtifacts "perf_onnx_gemm_${gpu_arch}.log"
// stash perf files to master
stash name: "perf_gemm_${gpu_arch}.log"
stash name: "perf_resnet50_N256_${gpu_arch}.log"
stash name: "perf_resnet50_N4_${gpu_arch}.log"
stash name: "perf_batched_gemm_${gpu_arch}.log"
stash name: "perf_grouped_gemm_${gpu_arch}.log"
stash name: "perf_conv_fwd_${gpu_arch}.log"
stash name: "perf_conv_bwd_data_${gpu_arch}.log"
stash name: "perf_gemm_bilinear_${gpu_arch}.log"
stash name: "perf_reduction_${gpu_arch}.log"
stash name: "perf_splitK_gemm_${gpu_arch}.log"
stash name: "perf_onnx_gemm_${gpu_arch}.log"
//we will process results on the master node
}
else{
sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm_${gpu_arch}.log"
archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
// stash perf files to master
stash name: "perf_gemm_${gpu_arch}.log"
stash name: "perf_resnet50_N256_${gpu_arch}.log"
stash name: "perf_resnet50_N4_${gpu_arch}.log"
//we will process the results on the master node
}
} }
} }
} }
...@@ -387,12 +470,12 @@ def runCKProfiler(Map conf=[:]){ ...@@ -387,12 +470,12 @@ def runCKProfiler(Map conf=[:]){
return retimage return retimage
} }
def runPerfTest(Map conf=[:]){ def Build_CK_and_Reboot(Map conf=[:]){
try{ try{
runCKProfiler(conf) Build_CK(conf)
} }
catch(e){ catch(e){
echo "throwing error exception in performance tests" echo "throwing error exception while building CK"
echo 'Exception occurred: ' + e.toString() echo 'Exception occurred: ' + e.toString()
throw e throw e
} }
...@@ -406,23 +489,20 @@ def runPerfTest(Map conf=[:]){ ...@@ -406,23 +489,20 @@ def runPerfTest(Map conf=[:]){
def process_results(Map conf=[:]){ def process_results(Map conf=[:]){
env.HSA_ENABLE_SDMA=0 env.HSA_ENABLE_SDMA=0
checkout scm checkout scm
def image = "composable_kernels_${params.COMPILER_VERSION}" def image = getDockerImageName()
def prefixpath = "/opt/rocm" def prefixpath = "/opt/rocm"
def gpu_arch = conf.get("gpu_arch", "gfx908")
// Jenkins is complaining about the render group // Jenkins is complaining about the render group
def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined" def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) { if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='release' "
def variant = env.STAGE_NAME def variant = env.STAGE_NAME
def retimage def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try { try {
//retimage = docker.build("${image}", dockerArgs + '.')
(retimage, image) = getDockerImage(conf) (retimage, image) = getDockerImage(conf)
} }
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
...@@ -437,25 +517,25 @@ def process_results(Map conf=[:]){ ...@@ -437,25 +517,25 @@ def process_results(Map conf=[:]){
dir("script"){ dir("script"){
if (params.RUN_FULL_QA){ if (params.RUN_FULL_QA){
// unstash perf files to master // unstash perf files to master
unstash "perf_gemm_${gpu_arch}.log" unstash "perf_gemm.log"
unstash "perf_resnet50_N256_${gpu_arch}.log" unstash "perf_resnet50_N256.log"
unstash "perf_resnet50_N4_${gpu_arch}.log" unstash "perf_resnet50_N4.log"
unstash "perf_batched_gemm_${gpu_arch}.log" unstash "perf_batched_gemm.log"
unstash "perf_grouped_gemm_${gpu_arch}.log" unstash "perf_grouped_gemm.log"
unstash "perf_conv_fwd_${gpu_arch}.log" unstash "perf_conv_fwd.log"
unstash "perf_conv_bwd_data_${gpu_arch}.log" unstash "perf_conv_bwd_data.log"
unstash "perf_gemm_bilinear_${gpu_arch}.log" unstash "perf_gemm_bilinear.log"
unstash "perf_reduction_${gpu_arch}.log" unstash "perf_reduction.log"
unstash "perf_splitK_gemm_${gpu_arch}.log" unstash "perf_splitK_gemm.log"
unstash "perf_onnx_gemm_${gpu_arch}.log" unstash "perf_onnx_gemm.log"
sh "./process_qa_data.sh ${gpu_arch}" sh "./process_qa_data.sh"
} }
else{ else{
// unstash perf files to master // unstash perf files to master
unstash "perf_gemm_${gpu_arch}.log" unstash "perf_gemm.log"
unstash "perf_resnet50_N256_${gpu_arch}.log" unstash "perf_resnet50_N256.log"
unstash "perf_resnet50_N4_${gpu_arch}.log" unstash "perf_resnet50_N4.log"
sh "./process_perf_data.sh ${gpu_arch}" sh "./process_perf_data.sh"
} }
} }
} }
...@@ -482,12 +562,20 @@ pipeline { ...@@ -482,12 +562,20 @@ pipeline {
parameters { parameters {
booleanParam( booleanParam(
name: "BUILD_DOCKER", name: "BUILD_DOCKER",
defaultValue: true, defaultValue: false,
description: "Force building docker image (default: true)") description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
string(
name: 'ROCMVERSION',
defaultValue: '5.3',
description: 'Specify which ROCM version to use: 5.2.3, or 5.3 (default), etc.')
string( string(
name: 'COMPILER_VERSION', name: 'COMPILER_VERSION',
defaultValue: 'ck-9110', defaultValue: 'release',
description: 'Specify which version of compiler to use: ck-9110 (default), release, or amd-stg-open.') description: 'Specify which version of compiler to use: ck-9110, release (default), or amd-stg-open.')
string(
name: 'COMPILER_COMMIT',
defaultValue: '',
description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit (default), or use 8a82e4eb7ba28521ba9a9424a0315a8a16590424 commit of amd-stg-open branch.')
string( string(
name: 'BUILD_COMPILER', name: 'BUILD_COMPILER',
defaultValue: 'hipcc', defaultValue: 'hipcc',
...@@ -496,10 +584,6 @@ pipeline { ...@@ -496,10 +584,6 @@ pipeline {
name: "RUN_FULL_QA", name: "RUN_FULL_QA",
defaultValue: false, defaultValue: false,
description: "Select whether to run small set of performance tests (default) or full QA") description: "Select whether to run small set of performance tests (default) or full QA")
booleanParam(
name: "TEST_NODE_PERFORMANCE",
defaultValue: false,
description: "Test the node GPU performance (default: false)")
} }
environment{ environment{
dbuser = "${dbuser}" dbuser = "${dbuser}"
...@@ -514,9 +598,10 @@ pipeline { ...@@ -514,9 +598,10 @@ pipeline {
} }
stages{ stages{
stage("Build Docker"){ stage("Build Docker"){
when { //when {
expression { params.BUILD_DOCKER.toBoolean() } // beforeAgent true
} // expression { params.BUILD_DOCKER.toBoolean() }
//}
parallel{ parallel{
stage('Docker /opt/rocm'){ stage('Docker /opt/rocm'){
agent{ label rocmnode("nogpu") } agent{ label rocmnode("nogpu") }
...@@ -527,22 +612,7 @@ pipeline { ...@@ -527,22 +612,7 @@ pipeline {
} }
} }
stage("Static checks") { stage("Static checks") {
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
parallel{ parallel{
// enable after we move from hipcc to hip-clang
// stage('Tidy') {
// agent{ label rocmnode("nogpu") }
// environment{
// // setup_cmd = "CXX='/opt/rocm/bin/hipcc' cmake -DBUILD_DEV=On .. "
// build_cmd = "make -j\$(nproc) -k analyze"
// }
// steps{
// buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug')
// }
// }
stage('Clang Format') { stage('Clang Format') {
agent{ label rocmnode("nogpu") } agent{ label rocmnode("nogpu") }
environment{ environment{
...@@ -562,101 +632,57 @@ pipeline { ...@@ -562,101 +632,57 @@ pipeline {
} }
} }
} }
stage("Tests")
stage("Build CK and run Tests")
{ {
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
parallel parallel
{ {
stage("Run Tests: gfx908") stage("Build CK and run Tests")
{ {
agent{ label rocmnode("gfx908")} agent{ label rocmnode("gfx908 || gfx90a") }
environment{ environment{
//setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" """ : """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 " """ }"
setup_args = "${params.COMPILER_VERSION == "release" ? """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ : """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """}" execute_args = "${params.COMPILER_VERSION == "ck-9110" ? """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a" -DCMAKE_CXX_FLAGS="-O3" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ }"
} }
steps{ steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908") Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
} }
} }
stage("Run Tests: gfx90a")
{
when {
beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() }
}
options { retry(2) }
agent{ label rocmnode("gfx90a")}
environment{
//setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
setup_args = "${params.COMPILER_VERSION == "release" ? """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ : """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx90a -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """}"
} }
steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a")
}
}
}
}
stage("Client App")
{
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
parallel
{
stage("Run Client App")
{
agent{ label rocmnode("gfx908")}
environment{
//setup_args = """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """
setup_args = "${params.COMPILER_VERSION == "release" ? """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """ : """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" """ }"
//execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
execute_args = "${params.COMPILER_VERSION == "release" ? """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ }"
}
steps{
buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
}
}
stage("Performance Tests") stage("Performance Tests")
{ {
parallel parallel
{ {
stage("Run ckProfiler: gfx908") stage("Run ckProfiler: gfx908 or gfx90a")
{ {
when { when {
beforeAgent true beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() } expression { !params.RUN_FULL_QA.toBoolean() }
} }
options { retry(2) } options { retry(2) }
agent{ label rocmnode("gfx908")} agent{ label rocmnode("gfx908 || gfx90a")}
environment{ environment{
//setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}"
setup_args = "${params.COMPILER_VERSION == "release" ? """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ : """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """}"
} }
steps{ steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908") runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
} }
} }
stage("Run ckProfiler: gfx90a") stage("Run ckProfiler: gfx90a")
{ {
when { when {
beforeAgent true beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() } expression { params.RUN_FULL_QA.toBoolean() }
} }
options { retry(2) } options { retry(2) }
agent{ label rocmnode("gfx90a")} agent{ label rocmnode("gfx90a")}
environment{ environment{
//setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}"
setup_args = "${params.COMPILER_VERSION == "release" ? """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ : """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx90a -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """}"
} }
steps{ steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a") runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
} }
} }
} }
...@@ -665,43 +691,13 @@ pipeline { ...@@ -665,43 +691,13 @@ pipeline {
{ {
parallel parallel
{ {
stage("Process results for gfx908"){ stage("Process results"){
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
agent { label 'mici' } agent { label 'mici' }
steps{ steps{
process_results(gpu_arch: "gfx908") process_results()
}
}
stage("Process results for gfx90a"){
when {
beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
}
agent { label 'mici' }
steps{
process_results(gpu_arch: "gfx90a")
}
}
}
}
/* enable after the cmake file supports packaging
stage("Packages") {
when {
expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA }
}
parallel {
stage("Package /opt/rocm") {
agent{ label rocmnode("nogpu") }
steps{
buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a")
} }
} }
} }
} }
*/
} }
} }
# Composable Kernel
## Methodology
The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures, including GPUs and CPUs, through general-purpose kernel languages such as HIP C++.
CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model
* Algorithm complexity reduction for complex ML operators, using an innovative technique we call "Tensor Coordinate Transformation" (see the sketch below)
![ALT](/doc/image/ck_component.png "CK Components")
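To make the coordinate-transformation idea concrete, here is a minimal, self-contained sketch. It is not the CK API; it only illustrates how a transform can map a kernel's logical index to a physical memory offset, which is what lets the same tiled kernel be reused across different tensor layouts.
```cpp
// Minimal sketch (not CK code): a "merge" transform that folds a 3-D physical
// layout into a single logical dimension, so a kernel can iterate one flat
// index while the transform computes the physical offset.
#include <array>
#include <cstdint>
#include <iostream>

struct MergeTransform
{
    std::array<std::int64_t, 3> lengths; // physical dimension lengths, e.g. {N, H, W}
    std::array<std::int64_t, 3> strides; // physical strides in elements

    // Map a merged (logical) index back to a physical memory offset.
    std::int64_t CalculateOffset(std::int64_t logical_idx) const
    {
        std::int64_t offset = 0;
        for(int d = 2; d >= 0; --d)
        {
            offset += (logical_idx % lengths[d]) * strides[d];
            logical_idx /= lengths[d];
        }
        return offset;
    }
};

int main()
{
    // A 2x3x4 tensor stored contiguously has strides {12, 4, 1}; a padded or
    // permuted layout would only change the strides, not the kernel loop.
    const MergeTransform t{{2, 3, 4}, {12, 4, 1}};
    for(std::int64_t i = 0; i < 2 * 3 * 4; ++i)
        std::cout << "logical " << i << " -> offset " << t.CalculateOffset(i) << "\n";
    return 0;
}
```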
## Code Structure
The current CK library is structured into 4 layers:
* "Templated Tile Operators" layer
* "Templated Kernel and Invoker" layer
* "Instantiated Kernel and Invoker" layer
* "Client API" layer
![ALT](/doc/image/ck_layer.png "CK Layers")
## Contributors
The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
## Citation
If you use CK, please use the following citations:
* The CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
* [CITATION.cff](/CITATION.cff)
## License
CK is released under the MIT license. [License File](/LICENSE)
# Build CK
## Build docker image
```bash
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
## Launch docker
```bash
docker run \
-it \
@@ -6,47 +45,38 @@
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
ck:latest \
/bin/bash
```
## Build CK
```bash
mkdir build && cd build
# Need to specify target ID, example below is for gfx908 and gfx90a
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
### Build examples and tests
```bash
make -j examples tests
make test
```
Instructions for running each individual example are under [example](/example).
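For instance, after `make -j examples`, an individual example binary can be run straight from the build directory. A hedged illustration (the binary name below is hypothetical; the three positional arguments follow the convention used by the example sources in this commit):
```bash
# arg1: verification (0=no, 1=yes)
# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
# arg3: time kernel (0=no, 1=yes)
./bin/example_gemm_xdl_fp16 1 1 0
```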
## Build ckProfiler
```bash
make -j ckProfiler
```
Instructions for running ckProfiler are under [profiler](/profiler).
## Install CK
```bash
@@ -54,13 +84,13 @@
make install
```
## Using CK as pre-built kernel library
Instructions for using CK as a pre-built kernel library are under [client_example](/client_example).
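For reference, a rough sketch of a downstream project's CMakeLists.txt when consuming the installed package; the `find_package` calls mirror the ones in `client_example` later in this commit, while the exported target names are assumptions to verify against the installed `composable_kernelConfig.cmake`:
```cmake
cmake_minimum_required(VERSION 3.16)
project(ck_client_app LANGUAGES CXX)

# Assumes CK was installed via `make install` and is discoverable, e.g.
#   cmake -D CMAKE_PREFIX_PATH="${PATH_TO_CK_INSTALL_DIRECTORY};/opt/rocm" ..
find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
find_package(hip REQUIRED PATHS /opt/rocm)

add_executable(my_ck_app main.cpp)
# Target names below are an assumption; check the installed
# composable_kernelConfig.cmake for the exact exported targets.
target_link_libraries(my_ck_app PRIVATE composable_kernel::device_operations hip::host)
```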
## Caveat
### Kernel Timing and Verification
CK's own kernel timer will warm up the kernel once, and then run it multiple times
to get the average kernel time. For some kernels that use atomic add, this will cause
the output buffer to be accumulated multiple times, causing verification failure.
To work around it, do not use CK's own timer and do verification at the same time.
CK's own timer and verification in each example and ckProfiler can be enabled or
disabled from the command line.
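A minimal, self-contained illustration of why this happens (pure host code; it only sketches the effect, not how CK launches kernels):
```cpp
// Conceptual sketch (not CK code) of the caveat above: if a kernel accumulates into
// its output via atomic adds and a timing loop re-runs it without clearing the
// buffer, the result grows with every repeat and no longer matches a single-pass
// reference, so verification fails.
#include <iostream>
#include <vector>

// Stand-in for a device kernel that uses atomic add on its output buffer.
void atomic_add_kernel(std::vector<float>& out)
{
    for(auto& v : out)
        v += 1.0f; // accumulates instead of overwriting
}

int main()
{
    std::vector<float> out(4, 0.0f);
    const int timing_repeats = 10; // what a kernel timer typically does
    for(int i = 0; i < timing_repeats; ++i)
        atomic_add_kernel(out);

    const std::vector<float> reference(4, 1.0f); // single-pass reference result
    std::cout << "timed result: " << out[0] << ", reference: " << reference[0]
              << (out == reference ? " (match)" : " (mismatch)") << std::endl;
    return 0;
}
```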
@@ -81,8 +81,8 @@ int main(int argc, char* argv[])
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
{Stride, 1}, // xStrides
-{1}, // gammaStrides
-{1}, // betaStrides
+{0, 1}, // gammaStrides
+{0, 1}, // betaStrides
{Stride, 1}, // yStrides
{1}, // reduceDims
1e-4,
......
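The switch from `{1}` to `{0, 1}` above (and in the layernorm example further down) appears to describe gamma/beta as rank-2 tensors with a zero stride along M, i.e. a length-N vector broadcast across all rows. A small host-side sketch of what a `{0, 1}` stride pair means:
```cpp
// Sketch of zero-stride broadcasting (not CK internals): with strides {0, 1},
// the same length-N gamma vector is reused for every row m.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::size_t M = 4, N = 3;
    const std::vector<float> gamma = {0.5f, 1.0f, 2.0f}; // length N
    const std::size_t strideM = 0, strideN = 1;          // the {0, 1} passed as gammaStrides
    for(std::size_t m = 0; m < M; ++m)
        for(std::size_t n = 0; n < N; ++n)
            std::cout << "gamma(" << m << ", " << n << ") = "
                      << gamma[m * strideM + n * strideN] << "\n";
    return 0;
}
```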
@@ -6,9 +6,10 @@ find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
-add_subdirectory(01_gemm)
-add_subdirectory(02_gemm_add_add_fastgelu)
-add_subdirectory(03_gemm_layernorm)
-add_subdirectory(04_contraction)
-add_subdirectory(05_layernorm)
-add_subdirectory(06_softmax)
+# add all example subdir
+file(GLOB dir_list LIST_DIRECTORIES true *)
+FOREACH(subdir ${dir_list})
+IF(IS_DIRECTORY "${subdir}" AND (NOT "${subdir}" MATCHES "build"))
+add_subdirectory(${subdir})
+ENDIF()
+ENDFOREACH()
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_e_permute_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F16;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmEPermuteXdl
// clang-format off
//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | Type| Type| Type| DataType| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
using ReferenceBatchedGemmInstance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
BDataType,
EDataType,
AccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
const int M = 256;
const int N = 128;
const int K = 64;
const int stride_A = K;
const int stride_B = K;
const int batch_stride_A = M * K;
const int batch_stride_B = K * N;
const int G0 = 16;
const int G1 = 8;
const int batch_count = G0 * G1;
// output layout - [G0, M, G1, N]
const int stride_G0 = M * G1 * N;
const int stride_G1 = N;
const int stride_M = G1 * N;
const int stride_N = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0);
}
// GEMM shape
ck::tensor_operation::device::BatchedGemmEPermuteDesc batched_gemm_e_permute_desc{
G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N};
auto f_host_tensor_descriptor = [](std::size_t batch_count_,
std::size_t row,
std::size_t col,
std::size_t stride,
std::size_t batch_stride,
auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
std::vector<std::size_t>({batch_stride, stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
}
};
Tensor<ADataType> a_g_m_k(
f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
Tensor<BDataType> b_g_k_n(
f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
auto f_host_e_tensor_descriptor = [](std::size_t G0_,
std::size_t G1_,
std::size_t M_,
std::size_t N_,
std::size_t stride_G0_,
std::size_t stride_G1_,
std::size_t stride_M_,
std::size_t stride_N_) {
return HostTensorDescriptor(
std::vector<std::size_t>({G0_, G1_, M_, N_}),
std::vector<std::size_t>({stride_G0_, stride_G1_, stride_M_, stride_N_}));
};
Tensor<EDataType> e_g0_g1_m_n_host_result(
f_host_e_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
Tensor<EDataType> e_g0_g1_m_n_device_result(
f_host_e_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
std::cout << "e_g0_g1_m_n: " << e_g0_g1_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
}
DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) *
e_g0_g1_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_g_m_k.mData.data());
b_device_buf.ToDevice(b_g_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
// do GEMM
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
M,
N,
K,
stride_A,
stride_B,
batch_stride_A,
batch_stride_B,
batched_gemm_e_permute_desc,
batch_count,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * batch_count * M * N * K;
std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
sizeof(BDataType) * batch_count * K * N +
sizeof(EDataType) * batch_count * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
e_device_buf.FromDevice(e_g0_g1_m_n_device_result.mData.data());
auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
auto ref_invoker = ref_batched_gemm.MakeInvoker();
Tensor<EDataType> c_g_m_n_host_result = HostTensorDescriptor(
std::vector<std::size_t>({batch_count, M, N}), std::vector<std::size_t>({M * N, N, 1}));
auto ref_argument = ref_batched_gemm.MakeArgument(
a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
for(int g0 = 0; g0 < G0; g0++)
{
for(int g1 = 0; g1 < G1; g1++)
{
for(int m = 0; m < M; m++)
{
for(int n = 0; n < N; n++)
{
int g = g0 * G1 + g1;
e_g0_g1_m_n_host_result(g0, g1, m, n) = c_g_m_n_host_result(g, m, n);
}
}
}
}
pass = ck::utils::check_err(e_g0_g1_m_n_host_result.mData,
e_g0_g1_m_n_device_result.mData,
"Error: Incorrect results c");
}
return pass ? 0 : 1;
}
@@ -29,7 +29,8 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 constexpr int Rank         = 2;
 constexpr int NumReduceDim = 1;
-using DeviceInstance = ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
         GammaDataType,
         BetaDataType,
         AccDataType,
@@ -44,7 +45,9 @@ using DeviceInstance = ck::tensor_operation::device::DeviceLayernormImpl<XDataTy
         8, // SliceK
         1, // SrcVecDim (0=M, 1=K)
         8, // SrcScalarPerVector
+        1, // GammaVecDim (0=M, 1=K)
         8, // GammaScalarPerVector
+        1, // BetaVecDim (0=M, 1=K)
         8, // BetaScalarPerVector
         8>; // OutScalarPerVector
@@ -88,8 +91,8 @@ int main()
     auto argument_ptr = device_instance.MakeArgumentPointer(
         {M, N},
         std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
-        std::vector<ck::index_t>{gamma.mDesc.GetStrides().begin(), gamma.mDesc.GetStrides().end()},
-        std::vector<ck::index_t>{beta.mDesc.GetStrides().begin(), beta.mDesc.GetStrides().end()},
+        {0, 1},
+        {0, 1},
         std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
         {1},
         1e-4,
...
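The `{0, 1}` strides passed for gamma and beta above let the length-N affine parameters be broadcast across the M rows: along M the stride is 0, along N it is 1, so element (m, n) always resolves to offset n. A minimal host-side sketch of that addressing, using a plain float buffer and hypothetical names rather than the CK Tensor class:

// Sketch only: strides {0, 1} broadcast a length-N gamma vector across all M rows.
// "gamma_at" and its parameters are illustrative, not taken from the commit above.
#include <cstddef>
#include <vector>

float gamma_at(const std::vector<float>& gamma, std::size_t m, std::size_t n)
{
    const std::size_t stride_m = 0; // stride 0 along M: every row reads the same element
    const std::size_t stride_n = 1; // contiguous along N
    return gamma[m * stride_m + n * stride_n]; // == gamma[n] for every m
}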
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NW_C;
         using WeiLayout = ctc::G_K_X_C;
-        using BiasLayout = ctc::G_NW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NW_K;
         using OutLayout = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NHW_C;
         using WeiLayout = ctc::G_K_YX_C;
-        using BiasLayout = ctc::G_NHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NHW_K;
         using OutLayout = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NDHW_C;
         using WeiLayout = ctc::G_K_ZYX_C;
-        using BiasLayout = ctc::G_NDHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NDHW_K;
         using OutLayout = ctc::G_NDHW_K;
...
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NW_C;
         using WeiLayout = ctc::G_K_X_C;
-        using BiasLayout = ctc::G_NW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NW_K;
         using OutLayout = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NHW_C;
         using WeiLayout = ctc::G_K_YX_C;
-        using BiasLayout = ctc::G_NHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NHW_K;
         using OutLayout = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NDHW_C;
         using WeiLayout = ctc::G_K_ZYX_C;
-        using BiasLayout = ctc::G_NDHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NDHW_K;
         using OutLayout = ctc::G_NDHW_K;
...
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NW_C;
         using WeiLayout = ctc::G_K_X_C;
-        using BiasLayout = ctc::G_NW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NW_K;
         using OutLayout = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NHW_C;
         using WeiLayout = ctc::G_K_YX_C;
-        using BiasLayout = ctc::G_NHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NHW_K;
         using OutLayout = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NDHW_C;
         using WeiLayout = ctc::G_K_ZYX_C;
-        using BiasLayout = ctc::G_NDHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NDHW_K;
         using OutLayout = ctc::G_NDHW_K;
...
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NW_C;
         using WeiLayout = ctc::G_K_X_C;
-        using BiasLayout = ctc::G_NW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NW_K;
         using OutLayout = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NHW_C;
         using WeiLayout = ctc::G_K_YX_C;
-        using BiasLayout = ctc::G_NHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NHW_K;
         using OutLayout = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NDHW_C;
         using WeiLayout = ctc::G_K_ZYX_C;
-        using BiasLayout = ctc::G_NDHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NDHW_K;
         using OutLayout = ctc::G_NDHW_K;
...
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NW_C;
         using WeiLayout = ctc::G_K_X_C;
-        using BiasLayout = ctc::G_NW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NW_K;
         using OutLayout = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NHW_C;
         using WeiLayout = ctc::G_K_YX_C;
-        using BiasLayout = ctc::G_NHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NHW_K;
         using OutLayout = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
     {
         using InLayout = ctc::G_NDHW_C;
         using WeiLayout = ctc::G_K_ZYX_C;
-        using BiasLayout = ctc::G_NDHW_K;
+        using BiasLayout = ctc::G_K;
         using ResidualLayout = ctc::G_NDHW_K;
         using OutLayout = ctc::G_NDHW_K;
...
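Switching BiasLayout from G_NW_K / G_NHW_K / G_NDHW_K to G_K presumably changes the bias from a per-output-element tensor to a per-group, per-output-channel vector that is broadcast over the batch and spatial dimensions. A hedged host-side sketch of that broadcast, with purely illustrative names and shapes (not taken from the files above):

// Sketch only: adding a [G, K] bias to a flattened [G, N, W, K] output.
// "out", "bias" and the extents are placeholders assumed for illustration.
#include <cstddef>
#include <vector>

void add_bias_g_k(std::vector<float>& out,        // flattened [G, N, W, K] output
                  const std::vector<float>& bias, // flattened [G, K] bias
                  std::size_t G, std::size_t N, std::size_t W, std::size_t K)
{
    for(std::size_t g = 0; g < G; ++g)
        for(std::size_t n = 0; n < N; ++n)
            for(std::size_t w = 0; w < W; ++w)
                for(std::size_t k = 0; k < K; ++k)
                    out[((g * N + n) * W + w) * K + k] += bias[g * K + k];
}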
 add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
 add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
-add_example_executable(example_padded_batched_gemm_scale_softmax_gemm_xdl_fp16 padded_batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
-add_custom_target(example_batched_gemm_scale_softmax_gemm)
-add_dependencies(example_batched_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16)
-add_dependencies(example_batched_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16)
-add_dependencies(example_batched_gemm_scale_softmax_gemm example_padded_batched_gemm_scale_softmax_gemm_xdl_fp16)
+add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
+add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
+add_custom_target(example_gemm_scale_softmax_gemm)
+add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16)
+add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16)
+add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16)
+add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16)
@@ -16,7 +16,8 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -47,7 +48,9 @@ using CDataType = F16;
 using ALayout = Row;
 using B0Layout = Col;
 using B1Layout = Row;
-using CLayout = Row;
+using CPermuteNumDims_G_M_O =
+    S<2, 1, 1>; // "using CLayout = Row" has been replaced by CPermuteNumDims_G_M_O
 using AElementOp = PassThrough;
 using B0ElementOp = PassThrough;
@@ -55,13 +58,14 @@ using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
 using B1ElementOp = PassThrough;
 using CElementOp = PassThrough;
-static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<
+using DeviceGemmInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
         ALayout,
         B0Layout,
         B1Layout,
-        CLayout,
+        CPermuteNumDims_G_M_O,
         ADataType,
         B0DataType,
         B1DataType,
@@ -73,7 +77,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
         Acc0ElementOp,
        B1ElementOp,
         CElementOp,
-        MNPadding,
+        GemmSpec,
         1,
         256,
         128, // MPerBlock
@@ -113,7 +117,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
         1, // CShuffleMXdlPerWavePerShuffle
         2, // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8>; // CShuffleBlockTransferScalarPerVector_NPerBlock
+        8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+        true>; // MaskOutUpperTriangle
 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
@@ -143,22 +148,26 @@ int main(int argc, char* argv[])
     int init_method = 1;
     bool time_kernel = false;
-    // GEMM shape
-    ck::index_t M = 1020;
-    ck::index_t N = 1020;
+    // GEMM shape for A/B0/B1/C
+    // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o
+    ck::index_t M = 512;
+    ck::index_t N = 512;
     ck::index_t K = 64;
     ck::index_t O = 128;
-    ck::index_t BatchCount = 4;
     ck::index_t StrideA = -1;
     ck::index_t StrideB0 = -1;
     ck::index_t StrideB1 = -1;
-    ck::index_t StrideC = -1;
     ck::index_t BatchStrideA = -1;
     ck::index_t BatchStrideB0 = -1;
     ck::index_t BatchStrideB1 = -1;
-    ck::index_t BatchStrideC = -1;
     float alpha = 1;
+    // Output shape C[G0, M, G1, O]. Batch dim, outer dim, inner dim must match GEMM shape
+    // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o])
+    // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3])
+    ck::index_t G0 = 7;
+    ck::index_t G1 = 13;
     if(argc == 1)
     {
         // use default case
@@ -169,7 +178,7 @@ int main(int argc, char* argv[])
         init_method = std::stoi(argv[2]);
         time_kernel = std::stoi(argv[3]);
     }
-    else if(argc == 9)
+    else if(argc == 11)
     {
         do_verification = std::stoi(argv[1]);
         init_method = std::stoi(argv[2]);
@@ -179,64 +188,41 @@ int main(int argc, char* argv[])
         N = std::stoi(argv[5]);
         K = std::stoi(argv[6]);
         O = std::stoi(argv[7]);
-        BatchCount = std::stoi(argv[8]);
-    }
-    else if(argc == 18)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        time_kernel = std::stoi(argv[3]);
-        M = std::stoi(argv[4]);
-        N = std::stoi(argv[5]);
-        K = std::stoi(argv[6]);
-        O = std::stoi(argv[7]);
-        BatchCount = std::stoi(argv[8]);
-        StrideA = std::stoi(argv[9]);
-        StrideB0 = std::stoi(argv[10]);
-        StrideB1 = std::stoi(argv[11]);
-        StrideC = std::stoi(argv[12]);
-        BatchStrideA = std::stoi(argv[13]);
-        BatchStrideB0 = std::stoi(argv[14]);
-        BatchStrideB1 = std::stoi(argv[15]);
-        BatchStrideC = std::stoi(argv[16]);
-        alpha = std::stof(argv[17]);
+        G0 = std::stoi(argv[8]);
+        G1 = std::stoi(argv[9]);
+        alpha = std::stof(argv[10]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 16: M, N, K, O, Batch, StrideA, StrideB0, StrideB1, StrideC, BatchStrideA, "
-               "BatchStrideB0, BatchStrideB1, BatchStrideC\n");
-        printf("arg17: scale (alpha)\n");
+        printf("arg4 to 11: M, N, K, O, G0, G1\n");
+        printf("arg10: scale (alpha)\n");
         exit(0);
     }
+    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+    std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
     const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
     const int DefaultStrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
     const int DefaultStrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
-    const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? O : M;
     StrideA = (StrideA < 0) ? DefaultStrideA : StrideA;
     StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0;
     StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1;
-    StrideC = (StrideC < 0) ? DefaultStrideC : StrideC;
     const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
     const int DefaultBatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
     const int DefaultBatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
-    const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Col> ? O : M) * StrideC;
     BatchStrideA = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA;
     BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0;
     BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1;
-    BatchStrideC = BatchStrideC < 0 ? DefaultBatchStrideC : BatchStrideC;
+    const int BatchCount = G0 * G1;
     auto f_host_tensor_descriptor = [](std::size_t batch_count,
                                        std::size_t row,
@@ -263,15 +249,17 @@ int main(int argc, char* argv[])
         f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
     Tensor<B1DataType> b1_g_n_o(
         f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
-    Tensor<CDataType> c_g_m_o_host_result(
-        f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
-    Tensor<CDataType> c_g_m_o_device_result(
-        f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
+    Tensor<CDataType> c_gs_ms_os_host_result(
+        std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
+        std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
+    Tensor<CDataType> c_gs_ms_os_device_result(
+        std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
+        std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
     std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
     std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
     std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
-    std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl;
+    std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl;
     switch(init_method)
     {
@@ -300,8 +288,8 @@ int main(int argc, char* argv[])
     DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
     DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize());
     DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize());
-    DeviceMem c_g_m_o_device_buf(sizeof(CDataType) *
-                                 c_g_m_o_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem c_gs_ms_os_device_buf(sizeof(CDataType) *
+                                    c_gs_ms_os_device_result.mDesc.GetElementSpaceSize());
     a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data());
     b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
@@ -320,20 +308,20 @@ int main(int argc, char* argv[])
         gemm.MakeArgument(static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
                           static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()),
                           static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()),
-                          static_cast<CDataType*>(c_g_m_o_device_buf.GetDeviceBuffer()),
+                          static_cast<CDataType*>(c_gs_ms_os_device_buf.GetDeviceBuffer()),
                           M,
                           N,
                           K,
                           O,
                           BatchCount,
+                          c_gs_ms_os_lengths,
+                          c_gs_ms_os_strides,
                           StrideA,
                           StrideB0,
                           StrideB1,
-                          StrideC,
                           BatchStrideA,
                           BatchStrideB0,
                           BatchStrideB1,
-                          BatchStrideC,
                           a_element_op,
                           b0_element_op,
                           acc0_element_op,
@@ -361,26 +349,37 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << gemm.GetTypeString() << std::endl;
-    c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
     if(do_verification)
     {
+        c_gs_ms_os_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
         // Output of Gemm0 is input A of Gemm1
         Tensor<AccDataType> acc0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
         Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
+        Tensor<CDataType> c_g_m_o_host_result(std::vector<int>{BatchCount, M, O},
+                                              std::vector<int>{M * O, O, 1});
         auto ref_gemm0 = ReferenceGemm0Instance{};
         auto ref_gemm0_invoker = ref_gemm0.MakeInvoker();
         auto ref_gemm0_argument = ref_gemm0.MakeArgument(
             a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op);
+        // gemm 0
         ref_gemm0_invoker.Run(ref_gemm0_argument);
+        // mask out upper triangle
+        acc0_g_m_n.ForEach([&](auto& self, auto idx) {
+            if(idx[1] < idx[2])
+                self(idx) = -ck::NumericLimits<float>::Infinity();
+        });
         auto ref_softmax = ReferenceSoftmaxInstance{};
         auto ref_softmax_invoker = ref_softmax.MakeInvoker();
         auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});
+        // softmax
         ref_softmax_invoker.Run(ref_softmax_argument);
         auto ref_gemm1 = ReferenceGemm1Instance{};
@@ -388,9 +387,22 @@ int main(int argc, char* argv[])
         auto ref_gemm1_argument = ref_gemm1.MakeArgument(
             a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op);
+        // gemm1
         ref_gemm1_invoker.Run(ref_gemm1_argument);
-        return ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData) ? 0 : 1;
+        // permute
+        c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) {
+            const size_t& g0 = idx[0];
+            const size_t& g1 = idx[1];
+            const size_t g = g0 * G1 + g1;
+            self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]);
+        });
+        return ck::utils::check_err(c_gs_ms_os_device_result.mData, c_gs_ms_os_host_result.mData)
+                   ? 0
+                   : 1;
     }
     return 0;
...
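The permuted output above is described by lengths {G0, G1, M, O} and strides {M * G1 * O, O, G1 * O, 1}, i.e. the data sits in memory in [G0, M, G1, O] order, matching the C_g0_m_g1_o permute comment in the example. A small standalone sketch of that index arithmetic (helper name and parameters are illustrative, not part of the example):

// Sketch only: flat offset of element (g0, g1, m, o) under lengths {G0, G1, M, O} and
// strides {M*G1*O, O, G1*O, 1}, which corresponds to a [G0, M, G1, O] memory layout.
#include <cstddef>

std::size_t c_gs_ms_os_offset(std::size_t g0, std::size_t g1, std::size_t m, std::size_t o,
                              std::size_t G1, std::size_t M, std::size_t O)
{
    const std::size_t stride_g0 = M * G1 * O;
    const std::size_t stride_g1 = O;
    const std::size_t stride_m  = G1 * O;
    const std::size_t stride_o  = 1;
    return g0 * stride_g0 + g1 * stride_g1 + m * stride_m + o * stride_o;
}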