Merge branch 'transpose_5d' of github.com:ROCmSoftwarePlatform/composable_kernel into transpose_5d

11279540 · Astha Rai · 14daa201 · 33e78b9a · 11279540 · 11279540
Commit 11279540 authored Nov 08, 2023 by Astha Rai
20 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,10 +14,10 @@ None
 ### Additions
 - Added an image to a column kernel (#867)
 - Added a column to an image kernel (#930)
- Support for 3D grouped convolution forward on RDNA 3 GPUs (#935)
+- Support for 3D grouped convolution on RDNA 3 GPUs (#935, #950, #985)
 - Grouped convolution support for small K and C (#822 #879 #897)
 - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
- Support for bf16/f32/f16 and NHWGC (2D and 3d) grouped convolution backward data (#757 #799)
+- Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
 - Support for Batched Gemm DL (#732)

 ### Changes

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,12 +32,10 @@ if (DTYPES)
    if (DTYPES MATCHES "fp8")
        add_definitions(-DCK_ENABLE_FP8)
        set(CK_ENABLE_FP8 "ON")
-        add_compile_options(-Wno-bit-int-extension)
    endif()
    if (DTYPES MATCHES "bf8")
        add_definitions(-DCK_ENABLE_BF8)
        set(CK_ENABLE_BF8 "ON")
-        add_compile_options(-Wno-bit-int-extension)
    endif()
    if (DTYPES MATCHES "fp16")
        add_definitions(-DCK_ENABLE_FP16)
@@ -59,9 +57,11 @@ if (DTYPES)
 else()
    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
    set(CK_ENABLE_ALL_DTYPES "ON")
-    add_compile_options(-Wno-bit-int-extension) # enable fp8 and bf8
 endif()

+# Suppress the bit-int-extension warning; needed for the f8/bf8_t types
+add_compile_options(-Wno-bit-int-extension)
+
 if(DL_KERNELS)
    add_definitions(-DDL_KERNELS)
    set(CK_ENABLE_DL_KERNELS "ON")
@@ -373,9 +373,10 @@ include_directories(BEFORE

 SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
 if(BUILD_DEV)
-    add_compile_options(-Werror)
-    add_compile_options(-Weverything)
+    add_compile_options(-Werror -Weverything)
 endif()
+#add flags to reduce the size of binaries
+add_compile_options(-Oz -flto=thin)
 message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")

 add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
@@ -390,35 +391,27 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
    file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
    set(add_inst 0)
    if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
-        #message("fp8 instance found!")
        set(add_inst 1)
    endif()
    if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
-        #message("bf8 instance found!")
        set(add_inst 1)
    endif()
    if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
-        #message("fp16 instance found!")
        set(add_inst 1)
    endif()
    if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
-        #message("fp32 instance found!")
        set(add_inst 1)
    endif()
    if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
-        #message("fp64 instance found!")
        set(add_inst 1)
    endif()
    if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
-        #message("bf16 instance found!")
        set(add_inst 1)
    endif()
    if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
-        #message("int8 instance found!")
        set(add_inst 1)
    endif()
    if(NOT "${cmake_instance}" MATCHES "DTYPES")
-        #message("instance should be built for all types!")
        set(add_inst 1)
    endif()
    if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)

--- a/Dockerfile
+++ b/Dockerfile
@@ -26,29 +26,42 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
 RUN amdgpu-install -y --usecase=rocm --no-dkms

+## Sccache binary built from source for ROCm
+ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
+ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
+RUN mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
+curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
+chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
+ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
+
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    build-essential \
-    ccache \
    cmake \
+    ccache \
    git \
    hip-rocclr \
+    iputils-ping \
    jq \
    libelf-dev \
    libncurses5-dev \
    libnuma-dev \
    libpthread-stubs0-dev \
    llvm-amdgpu \
+    net-tools \
    pkg-config \
    python \
    python3 \
    python3-dev \
    python3-pip \
+    redis \
    sshpass \
+    stunnel \
    software-properties-common \
    vim \
    nano \
    zlib1g-dev \
+    zip \
    openssh-server \
    clang-format-12 \
    kmod && \
@@ -61,7 +74,7 @@ RUN gunzip /usr/local/bin/ninja.gz
 RUN chmod a+x /usr/local/bin/ninja
 RUN git clone https://github.com/nico/ninjatracing.git
 # Update the cmake to the latest version
-RUN pip install --upgrade cmake
+RUN pip install --upgrade cmake==3.27.5

 # Setup ubsan environment to printstacktrace
 RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
@@ -76,9 +89,9 @@ ARG PREFIX=/opt/rocm
 RUN pip3 install --upgrade pip
 RUN pip3 install sqlalchemy==1.4.46
 RUN pip3 install pymysql
-RUN pip3 install pandas
+RUN pip3 install pandas==2.0.3
 RUN pip3 install setuptools-rust
-RUN pip3 install sshtunnel
+RUN pip3 install sshtunnel==0.4.0
 # Setup ubsan environment to printstacktrace
 ENV UBSAN_OPTIONS=print_stacktrace=1

@@ -114,6 +127,8 @@ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ];
    else echo "using the release compiler"; \
    fi

+#clean-up the deb package
+RUN sh -c "rm -rf amdgpu-install*"

 #ENV HIP_CLANG_PATH='/llvm-project/build/bin'
 #RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -65,10 +65,10 @@ def getDockerImageName(){
 }

 def check_host() {
-    if ("${env.CK_CCACHE}" != "null"){
-        def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}"
-        echo "ccache server: ${CCACHE_SERVER}"
-        sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt'''
+    if ("${env.CK_SCCACHE}" != "null"){
+        def SCCACHE_SERVER="${env.CK_SCCACHE.split(':')[0]}"
+        echo "sccache server: ${SCCACHE_SERVER}"
+        sh '''ping -c 1 -p 6379 "${SCCACHE_SERVER}" | echo $? > tmp.txt'''
        def output = readFile(file: "tmp.txt")
        echo "tmp.txt contents: \$output"
        return (output != "0")
@@ -96,24 +96,9 @@ def build_compiler(){

 def getDockerImage(Map conf=[:]){
    env.DOCKER_BUILDKIT=1
-    def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
+    def prefixpath = conf.get("prefixpath", "/opt/rocm")
    def no_cache = conf.get("no_cache", false)
    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    echo "ccache server: ${env.CK_CCACHE}"
-    if(env.CK_CCACHE)
-    {
-        if(check_host())
-        {
-            echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
-        }
-        else 
-        {
-            echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
-        }
-        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
-        env.CCACHE_DIR = """/tmp/ccache_store"""
-        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
-    }
    if(no_cache)
    {
        dockerArgs = dockerArgs + " --no-cache "
@@ -142,21 +127,6 @@ def buildDocker(install_prefix){
    def image_name = getDockerImageName()
    echo "Building Docker for ${image_name}"
    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    echo "ccache server: ${env.CK_CCACHE}"
-    if(env.CK_CCACHE)
-    {
-        if(check_host())
-        {
-            echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
-        }
-        else 
-        {
-            echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
-        }
-        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
-        env.CCACHE_DIR = """/tmp/ccache_store"""
-        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
-    }

    echo "Build Args: ${dockerArgs}"
    try{
@@ -169,7 +139,7 @@ def buildDocker(install_prefix){
        else{
            echo "Checking for image: ${image_name}"
            sh "docker manifest inspect --insecure ${image_name}"
-            echo "Image: ${image_name} found!! Skipping building image"
+            echo "Image: ${image_name} found! Skipping building image"
        }
    }
    catch(Exception ex){
@@ -219,13 +189,9 @@ def cmake_build(Map conf=[:]){
    }else{
        setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
    }
-    if(env.CK_CCACHE)
-    {
-        setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
-    }
-    echo "ccache server: ${env.CK_CCACHE}"

    def pre_setup_cmd = """
+            #!/bin/bash
            echo \$HSA_ENABLE_SDMA
            ulimit -c unlimited
            rm -rf build
@@ -234,6 +200,60 @@ def cmake_build(Map conf=[:]){
            mkdir install
            cd build
        """
+    def invocation_tag=""
+    if (setup_args.contains("gfx11")){
+        invocation_tag="gfx11"
+    }
+    if (setup_args.contains("gfx10")){
+        invocation_tag="gfx10"
+    }
+    if (setup_args.contains("gfx90")){
+        invocation_tag="gfx90"
+    }
+    if (setup_args.contains("gfx94")){
+        invocation_tag="gfx94"
+    }
+    echo "invocation tag: ${invocation_tag}"
+    def redis_pre_setup_cmd = pre_setup_cmd
+    if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
+        redis_pre_setup_cmd = pre_setup_cmd + """
+            #!/bin/bash
+            export ROCM_PATH=/opt/rocm
+            export SCCACHE_ENABLED=true
+            export SCCACHE_LOG_LEVEL=debug
+            export SCCACHE_IDLE_TIMEOUT=14400
+            export COMPILERS_HASH_DIR=/tmp/.sccache
+            export SCCACHE_BIN=/usr/local/.cargo/bin/sccache
+            export SCCACHE_EXTRAFILES=/tmp/.sccache/rocm_compilers_hash_file
+            export SCCACHE_REDIS="redis://${env.CK_SCCACHE}"
+            echo "connect = ${env.CK_SCCACHE}" >> ../script/redis-cli.conf
+            export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
+            echo \$SCCACHE_C_CUSTOM_CACHE_BUSTER
+            stunnel ../script/redis-cli.conf
+            ../script/sccache_wrapper.sh --enforce_redis
+        """
+        try {
+            def cmd1 = conf.get("cmd1", """
+                    ${redis_pre_setup_cmd}
+                """)
+            sh cmd1
+            setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
+        }
+        catch(Exception err){
+            echo "could not connect to redis server: ${err.getMessage()}. will not use sccache."
+            def cmd2 = conf.get("cmd2", """
+                    ${pre_setup_cmd}
+                """)
+            sh cmd2
+        }
+    }
+    else{
+        def cmd3 = conf.get("cmd3",  """
+                ${pre_setup_cmd}
+            """)
+        sh cmd3
+    }
+
    def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args}   .. ")
    // reduce parallelism when compiling, clang uses too much memory
    def nt = nthreads()
@@ -241,17 +261,19 @@ def cmake_build(Map conf=[:]){
    def execute_cmd = conf.get("execute_cmd", "")

    def cmd = conf.get("cmd", """
-            ${pre_setup_cmd}
            ${setup_cmd}
            ${build_cmd}
            ${execute_cmd}
        """)

    echo cmd
-    sh cmd
+
+    dir("build"){
+        sh cmd
+    }

    // Only archive from master or develop
-    if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) {
+    if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
        archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
    }
 }
@@ -526,6 +548,26 @@ def Build_CK(Map conf=[:]){
                           stash "ckprofiler_0.2.0_amd64.deb"
                        }
                    }
+                    if (params.hipTensor_test && navi_node == 0 ){
+                        //build and test hipTensor
+                        sh """#!/bin/bash
+                            rm -rf "${params.hipTensor_branch}".zip
+                            rm -rf hipTensor-"${params.hipTensor_branch}"
+                            wget https://github.com/ROCmSoftwarePlatform/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
+                            unzip -o "${params.hipTensor_branch}".zip
+                        """
+                        dir("hipTensor-${params.hipTensor_branch}"){
+                            sh """#!/bin/bash
+                                mkdir -p build
+                                ls -ltr
+                                CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="/opt/rocm;${env.WORKSPACE}/install"
+                                cmake --build build -- -j
+                            """
+                        }
+                        dir("hipTensor-${params.hipTensor_branch}/build"){
+                            sh 'ctest'
+                        }
+                    }
                }
            }
        }
@@ -615,7 +657,7 @@ def process_results(Map conf=[:]){
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
                                              0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
-                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
+                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : ""

 pipeline {
    agent none
@@ -654,6 +696,18 @@ pipeline {
            name: "DL_KERNELS",
            defaultValue: false,
            description: "Select whether to build DL kernels (default: OFF)")
+        booleanParam(
+            name: "hipTensor_test",
+            defaultValue: true,
+            description: "Use the CK build to verify hipTensor build and tests (default: ON)")
+        string(
+            name: 'hipTensor_branch',
+            defaultValue: 'develop',
+            description: 'Specify which branch of hipTensor to use (default: develop)')
+        booleanParam(
+            name: "USE_SCCACHE",
+            defaultValue: true,
+            description: "Use the sccache for building CK (default: ON)")
    }
    environment{
        dbuser = "${dbuser}"
@@ -761,8 +815,8 @@ pipeline {
                    }
                    agent{ label rocmnode("navi32") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')

--- a/README.md
+++ b/README.md
 # Composable Kernel

-## Methodology
+The Composable Kernel (CK) library provides a programming model for writing performance-critical
+kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
+uses general purpose kernel languages, such as HIP C++.

-Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
+CK uses two concepts to achieve performance portability and code maintainability:

-CK utilizes two concepts to achieve performance portability and code maintainability:
 * A tile-based programming model
-* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
+* Algorithm complexity reduction for complex machine learning (ML) operators. This uses an innovative
+   technique called *Tensor Coordinate Transformation*.

 ![ALT](/docs/data/ck_component.png "CK Components")

-## Code Structure
+The current CK library is structured into four layers:

-Current CK library are structured into 4 layers:
-* "Templated Tile Operators" layer
-* "Templated Kernel and Invoker" layer
-* "Instantiated Kernel and Invoker" layer
-* "Client API" layer
+* Templated Tile Operators
+* Templated Kernel and Invoker
+* Instantiated Kernel and Invoker
+* Client API

 ![ALT](/docs/data/ck_layer.png "CK Layers")

-## Documentation
+## General information

-Run the steps below to build documentation locally.
+To build our documentation locally, use the following code:

-```
+``` bash
 cd docs
 pip3 install -r sphinx/requirements.txt
 python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
 ```

-## Contributors
-
-The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
+You can find a list of our developers and contributors on our
+[Contributors](/CONTRIBUTORS.md) page.

-## Citation
+```note
+If you use CK, cite us as follows:

-If you use CK, please use following citations:
-* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
+* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
+  This paper will be available on arXiv soon.
 * [CITATION.cff](/CITATION.cff)
+```

-## License
+CK is released under the **[MIT license](/LICENSE)**.

-CK is released under the MIT license. [License File](/LICENSE)
+## Building CK

+We recommend building CK inside Docker containers, which include all necessary packages. Pre-built
+Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composable_kernel/tags).

-# Build CK
+1. To build a new Docker image, use the Dockerfile provided with the source code:

-## Build docker image
+    ```bash
+    DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
+    ```

-```bash
-DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
-```
-Pre-built dockers are available from this public repo: 
-https://hub.docker.com/r/rocm/composable_kernel/tags
+2. Launch the Docker container:

-## Launch docker
+    ```bash
+    docker run                                     \
+    -it                                            \
+    --privileged                                   \
+    --group-add sudo                               \
+    -w /root/workspace                             \
+    -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace  \
+    ck:latest                                      \
+    /bin/bash
+    ```

-```bash
-docker run                                     \
-it                                            \
--privileged                                   \
--group-add sudo                               \
-w /root/workspace                             \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace  \
-ck:latest                                      \
-/bin/bash
-```
+3. Clone CK source code from the GitHub repository and start the build:

-## Build CK
+    ```bash
+    git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \
+    cd composable_kernel && \
+    mkdir build && \
+    cd build
+    ```

-```bash
-mkdir build && cd build
+    You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s) you want
+    to run CK on. You can specify single or multiple architectures. If you specify multiple architectures,
+    use a semicolon between each; for example, `gfx908;gfx90a;gfx940`.

-# Need to specify target ID, example below is for gfx908 and gfx90a
+    ```bash
+    cmake                                                                                             \
+    -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
+    -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
+    -D CMAKE_BUILD_TYPE=Release                                                                       \
+    -D GPU_TARGETS="gfx908;gfx90a"                                                                    \
+    ..
+    ```

-cmake                                                                                             \
-D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
-D CMAKE_BUILD_TYPE=Release                                                                       \
-D GPU_TARGETS="gfx908;gfx90a"                                                                    \
-..
-```
+    If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets
+    supported by the current compiler (this may take a long time).

-If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the 
-current compiler.
+4. Build the entire CK library:

+    ```bash
+    make -j
+    ```

-Additional cmake flags can be used to significantly speed-up the build:
+5. Install CK:

-INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library
-while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency.
+    ```bash
+    make -j install
+    ```

-DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances 
-of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works).
+## Optional post-install steps

-DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl 
-instances. Those instances are only needed for the NAVI2x platforms.
+* Build examples and tests:

-### Build examples and tests
+    ```bash
+    make -j examples tests
+    ```

-```bash
- make -j examples tests
- make test
-```
+* Build and run all examples and tests:
+
+    ```bash
+    make -j check
+    ```

-Instructions for running each individual examples are under [example](/example)
+    You can find instructions for running each individual example in [example](/example).

+* Build ckProfiler:

-## Build ckProfiler
+    ```bash
+    make -j ckProfiler
+    ```
+
+    You can find instructions for running ckProfiler in [profiler](/profiler).
+
+Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
+Depending on the number of CPU cores and the amount of RAM on your system, you may want to
+limit the number of threads. For example, consider a system with a 128-core CPU and 64 GB of RAM.
+
+By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
+crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
+
+Additional cmake flags can be used to significantly speed up the build:
+
+* `INSTANCES_ONLY` (default is OFF) must be set to ON in order to build only the instances and library
+  while skipping all tests, examples, and profiler. This is useful in cases when you plan to use CK as a
+  dependency and don't plan to run any examples or tests.
+
+* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf8;bf16;int8" to build
+  instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
+  other data types.
+
+* `DL_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dl` or
+  `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
+  other platforms have faster instances, such as `xdl` or `wmma`, available.
+
+## Using sccache for building
+
+The default CK Docker images come with a pre-installed version of sccache, which supports clang
+being used as the HIP compiler (`-x hip`). Using sccache can help reduce the time to re-build code from
+hours to 1-2 minutes. In order to invoke sccache, you need to run:

 ```bash
- make -j ckProfiler
+ sccache --start-server
 ```
-Instructions for running ckProfiler are under [profiler](/profiler)

-## Install CK
+then add the following flags to the cmake command line:

 ```bash
-make install
+ -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
 ```

+You may need to clean up the build folder and repeat the cmake and make steps in order to take
+advantage of the sccache during subsequent builds.
+
 ## Using CK as pre-built kernel library

-Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
+You can find instructions for using CK as a pre-built kernel library in [client_example](/client_example).

-## Contributing
+## Contributing to CK

-When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
+When you contribute to CK, make sure you run `clang-format` on all changed files. We highly
+recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:

 ```bash
 sudo script/install_precommit.sh
 ```

-This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
+With this approach, `pre-commit` adds the appropriate hooks to your local repository and
+automatically runs `clang-format` (and possibly additional checks) before any commit is created.

 If you need to uninstall hooks from the repository, you can do so by running the following command:

@@ -141,14 +191,5 @@ If you need to uninstall hooks from the repository, you can do so by running the
 script/uninstall_precommit.sh
 ```

-If for any reason, you need to temporarily disable precommit hooks, you can add the `--no-verify` option to the `git commit` command.
-
-## Caveat
-### Kernel Timing and Verification
-
-CK's own kernel timer will warn up kernel once, and then run it multiple times
-to get average kernel time. For some kernels that use atomic add, this will cause
-output buffer to be accumulated multiple times, causing verification failure.
-To work around it, do not use CK's own timer and do verification at the same time.
-CK's own timer and verification in each example and ckProfiler can be enabled or
-disabled from command line.
+If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
+`git commit` command.
--- a/client_example/05_layernorm/layernorm2d.cpp
+++ b/client_example/05_layernorm/layernorm2d.cpp
@@ -12,12 +12,14 @@

 #include "ck/library/tensor_operation_instance/gpu/normalization.hpp"

-using XDataType       = ck::half_t;
-using GammaDataType   = ck::half_t;
-using BetaDataType    = ck::half_t;
-using YDataType       = ck::half_t;
-using ComputeDataType = float;
-using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using PassThrough            = ck::tensor_operation::element_wise::PassThrough;
+
+#define SAVE_MEAN_INV_STD

 constexpr int Rank         = 2;
 constexpr int NumReduceDim = 1;
@@ -50,12 +52,16 @@ int main(int argc, char* argv[])
    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+#ifdef SAVE_MEAN_INV_STD
+    SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * M);
+    SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M);
+#endif

    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
                                                                       GammaDataType,
                                                                       BetaDataType,
-                                                                       ComputeDataType,
                                                                       YDataType,
+                                                                       SaveMeanInvStdDataType,
                                                                       PassThrough,
                                                                       Rank,
                                                                       NumReduceDim>;
@@ -84,14 +90,21 @@ int main(int argc, char* argv[])
                                                        {0, 1},      // gammaStrides
                                                        {0, 1},      // betaStrides
                                                        {Stride, 1}, // yStrides
+                                                        {1},         // save_mean Strides
+                                                        {1},         // save_inv_std Strides
                                                        {1},         // reduceDims
                                                        1e-4,
                                                        x_device_buf.GetDeviceBuffer(),
                                                        gamma_device_buf.GetDeviceBuffer(),
                                                        beta_device_buf.GetDeviceBuffer(),
                                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                                        save_mean_device_buf.GetDeviceBuffer(),
+                                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
                                                        nullptr,
                                                        nullptr,
+#endif
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -109,6 +122,10 @@ int main(int argc, char* argv[])
            std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
                                   sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;

+#ifdef SAVE_MEAN_INV_STD
+            num_byte += sizeof(SaveMeanInvStdDataType) * M * 2;
+#endif
+
            float gb_per_sec = num_byte / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
@@ -140,17 +157,24 @@ int main(int argc, char* argv[])

        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N},      // lengths
                                                        {Stride, 1}, // xStrides
-                                                        {1},         // gammaStrides
-                                                        {1},         // betaStrides
+                                                        {0, 1},      // gammaStrides
+                                                        {0, 1},      // betaStrides
                                                        {Stride, 1}, // yStrides
+                                                        {1},         // save_mean Strides
+                                                        {1},         // save_inv_std Strides
                                                        {1},         // reduceDims
                                                        1e-4,
                                                        x_device_buf.GetDeviceBuffer(),
                                                        gamma_device_buf.GetDeviceBuffer(),
                                                        beta_device_buf.GetDeviceBuffer(),
                                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                                        save_mean_device_buf.GetDeviceBuffer(),
+                                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
                                                        nullptr,
                                                        nullptr,
+#endif
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -12,12 +12,14 @@

 #include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"

-using XDataType       = ck::half_t;
-using GammaDataType   = float;
-using BetaDataType    = float;
-using YDataType       = ck::half_t;
-using ComputeDataType = float;
-using Swish           = ck::tensor_operation::element_wise::Swish;
+using XDataType              = ck::half_t;
+using GammaDataType          = float;
+using BetaDataType           = float;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using Swish                  = ck::tensor_operation::element_wise::Swish;
+
+#define SAVE_MEAN_INV_STD

 constexpr int Rank         = 5;
 constexpr int NumReduceDim = 3;
@@ -49,19 +51,24 @@ int main(int argc, char* argv[])
    std::size_t xy_size         = N * H * W * G * C;
    std::size_t gamma_beta_size = G * C;

-    std::vector<ck::index_t> xy_strides         = {H * W * G * C, W * G * C, G * C, C, 1};
-    std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
+    std::vector<ck::index_t> xy_strides                = {H * W * G * C, W * G * C, G * C, C, 1};
+    std::vector<ck::index_t> gamma_beta_strides        = {0, 0, 0, C, 1};
+    std::vector<ck::index_t> save_mean_inv_std_strides = {G, 1};

    SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+#ifdef SAVE_MEAN_INV_STD
+    SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * N * G);
+    SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * N * G);
+#endif

    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
                                                                       GammaDataType,
                                                                       BetaDataType,
-                                                                       ComputeDataType,
                                                                       YDataType,
+                                                                       SaveMeanInvStdDataType,
                                                                       Swish,
                                                                       Rank,
                                                                       NumReduceDim>;
@@ -75,19 +82,26 @@ int main(int argc, char* argv[])
    const auto& generic_op_ptr = op_ptrs[0];

    auto generic_argument_ptr =
-        generic_op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
-                                            xy_strides,         // xStrides
-                                            gamma_beta_strides, // gammaStrides
-                                            gamma_beta_strides, // betaStrides
-                                            xy_strides,         // yStrides
-                                            {1, 2, 4},          // reduceDims
+        generic_op_ptr->MakeArgumentPointer({N, H, W, G, C},           // lengths
+                                            xy_strides,                // xStrides
+                                            gamma_beta_strides,        // gammaStrides
+                                            gamma_beta_strides,        // betaStrides
+                                            xy_strides,                // yStrides
+                                            save_mean_inv_std_strides, // save_mean Strides
+                                            save_mean_inv_std_strides, // save_inv_std Strides
+                                            {1, 2, 4},                 // reduceDims
                                            1e-6,
                                            x_device_buf.GetDeviceBuffer(),
                                            gamma_device_buf.GetDeviceBuffer(),
                                            beta_device_buf.GetDeviceBuffer(),
                                            y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                            save_mean_device_buf.GetDeviceBuffer(),
+                                            save_inv_std_device_buf.GetDeviceBuffer(),
+#else
                                            nullptr,
                                            nullptr,
+#endif
                                            Swish{});

    if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
@@ -107,21 +121,29 @@ int main(int argc, char* argv[])

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
-        auto& op_ptr      = op_ptrs[i];
-        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
-                                                        xy_strides,         // xStrides
-                                                        gamma_beta_strides, // gammaStrides
-                                                        gamma_beta_strides, // betaStrides
-                                                        xy_strides,         // yStrides
-                                                        {1, 2, 4},          // reduceDims
-                                                        1e-6,
-                                                        x_device_buf.GetDeviceBuffer(),
-                                                        gamma_device_buf.GetDeviceBuffer(),
-                                                        beta_device_buf.GetDeviceBuffer(),
-                                                        y_device_buf.GetDeviceBuffer(),
-                                                        nullptr,
-                                                        nullptr,
-                                                        Swish{});
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer({N, H, W, G, C},           // lengths
+                                        xy_strides,                // xStrides
+                                        gamma_beta_strides,        // gammaStrides
+                                        gamma_beta_strides,        // betaStrides
+                                        xy_strides,                // yStrides
+                                        save_mean_inv_std_strides, // save_mean Strides
+                                        save_mean_inv_std_strides, // save_inv_std Strides
+                                        {1, 2, 4},                 // reduceDims
+                                        1e-6,
+                                        x_device_buf.GetDeviceBuffer(),
+                                        gamma_device_buf.GetDeviceBuffer(),
+                                        beta_device_buf.GetDeviceBuffer(),
+                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                        save_mean_device_buf.GetDeviceBuffer(),
+                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
+                                        nullptr,
+                                        nullptr,
+#endif
+                                        Swish{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

@@ -139,6 +161,10 @@ int main(int argc, char* argv[])
                sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
                sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;

+#ifdef SAVE_MEAN_INV_STD
+            num_byte += sizeof(SaveMeanInvStdDataType) * N * G * 2;
+#endif
+
            float gb_per_sec = num_byte / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
@@ -169,20 +195,28 @@ int main(int argc, char* argv[])
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

-        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
-                                                        xy_strides,         // xStrides
-                                                        gamma_beta_strides, // gammaStrides
-                                                        gamma_beta_strides, // betaStrides
-                                                        xy_strides,         // yStrides
-                                                        {1, 2, 4},          // reduceDims
-                                                        1e-6,
-                                                        x_device_buf.GetDeviceBuffer(),
-                                                        gamma_device_buf.GetDeviceBuffer(),
-                                                        beta_device_buf.GetDeviceBuffer(),
-                                                        y_device_buf.GetDeviceBuffer(),
-                                                        nullptr,
-                                                        nullptr,
-                                                        Swish{});
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer({N, H, W, G, C},           // lengths
+                                        xy_strides,                // xStrides
+                                        gamma_beta_strides,        // gammaStrides
+                                        gamma_beta_strides,        // betaStrides
+                                        xy_strides,                // yStrides
+                                        save_mean_inv_std_strides, // save_mean Strides
+                                        save_mean_inv_std_strides, // save_inv_std Strides
+                                        {1, 2, 4},                 // reduceDims
+                                        1e-6,
+                                        x_device_buf.GetDeviceBuffer(),
+                                        gamma_device_buf.GetDeviceBuffer(),
+                                        beta_device_buf.GetDeviceBuffer(),
+                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                        save_mean_device_buf.GetDeviceBuffer(),
+                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
+                                        nullptr,
+                                        nullptr,
+#endif
+                                        Swish{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();


--- a/client_example/22_im2col_col2im/column_to_image.cpp
+++ b/client_example/22_im2col_col2im/column_to_image.cpp
@@ -16,10 +16,10 @@
 using InDataType  = ck::half_t;
 using OutDataType = ck::half_t;

-using ImageLayout = ck::tensor_layout::convolution::GNHWC;
+using ImageLayout = ck::tensor_layout::convolution::NHWGC;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 2;
 static constexpr ck::index_t N             = 32; // batch size
 static constexpr ck::index_t C             = 32; // input channel (per group)
 static constexpr ck::index_t Y             = 3;  // filter H
@@ -52,18 +52,18 @@ int main()
    std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
    std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};

-    // We have NHWGC in memory space (G is dummy)
-    // However, CK's API only accept length and stride with order of GNCHW
-    // Hence, we need to adjust the order of stride
+    // We have NHWGC in memory space
+    // However, CK's API only accepts lengths and strides with order of GNCHW.
+    // Hence, we need to adjust the order of strides.
    std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
-    std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1};
+    std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Ho * Wo * Y * X * C);
+    SimpleDeviceMem in(sizeof(InDataType) * G * N * Ho * Wo * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C);

    using namespace ck::conv_tensor_rearrange_op;
@@ -93,6 +93,7 @@ int main()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
+                                                        G,
                                                        N,
                                                        C,
                                                        in_spatial_lengths,
@@ -112,7 +113,7 @@ int main()
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
-                                    sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
+                                    sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;

            float gb_per_sec = num_bytes / 1.E6 / avg_time;

@@ -149,6 +150,7 @@ int main()
                  << std::endl;
        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
+                                                        G,
                                                        N,
                                                        C,
                                                        in_spatial_lengths,

--- a/client_example/22_im2col_col2im/image_to_column.cpp
+++ b/client_example/22_im2col_col2im/image_to_column.cpp
@@ -16,10 +16,10 @@
 using InDataType  = ck::half_t;
 using OutDataType = ck::half_t;

-using ImageLayout = ck::tensor_layout::convolution::GNHWC;
+using ImageLayout = ck::tensor_layout::convolution::NHWGC;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 2;
 static constexpr ck::index_t N             = 32; // batch size
 static constexpr ck::index_t C             = 32; // input channel (per group)
 static constexpr ck::index_t Y             = 3;  // filter H
@@ -52,11 +52,11 @@ int main()
    std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
    std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};

-    // We have NHWGC in memory space (G is dummy)
-    // However, CK's API only accept length and stride with order of GNCHW
-    // Hence, we need to adjust the order of stride
+    // We have NHWGC in memory space
+    // However, CK's API only accepts lengths and strides with order of GNCHW.
+    // Hence, we need to adjust the order of strides.
    std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
-    std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1};
+    std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
@@ -64,7 +64,7 @@ int main()
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C);

    using namespace ck::conv_tensor_rearrange_op;

@@ -93,6 +93,7 @@ int main()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
+                                                        G,
                                                        N,
                                                        C,
                                                        in_spatial_lengths,
@@ -112,7 +113,7 @@ int main()
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
-                                    sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
+                                    sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;

            float gb_per_sec = num_bytes / 1.E6 / avg_time;

@@ -149,6 +150,7 @@ int main()
                  << std::endl;
        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
+                                                        G,
                                                        N,
                                                        C,
                                                        in_spatial_lengths,

--- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/CMakeLists.txt
+++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/CMakeLists.txt
+# One client example executable per data type variant (fp32, fp16, bf16, int8);
+# each is built from its own source file and links composable_kernel::device_operations.
+foreach(dtype IN ITEMS fp32 fp16 bf16 int8)
+    add_executable(
+        client_grouped_convnd_fwd_scaleadd_scaleadd_relu_${dtype}
+        grouped_conv_fwd_scaleadd_scaleadd_relu_${dtype}.cpp
+    )
+    target_link_libraries(
+        client_grouped_convnd_fwd_scaleadd_scaleadd_relu_${dtype}
+        PRIVATE composable_kernel::device_operations
+    )
+endforeach()
--- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
+++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InLayout             = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout            = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout            = ck::tensor_layout::convolution::NDHWGK;
+using PassThrough          = ck::tensor_operation::element_wise::PassThrough;
+using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t G             = 32;
+static constexpr ck::index_t N             = 64; // batch size
+static constexpr ck::index_t K             = 64; // output channel
+static constexpr ck::index_t C             = 32; // input channel (per group)
+static constexpr ck::index_t Z             = 3;  // filter D
+static constexpr ck::index_t Y             = 3;  // filter H
+static constexpr ck::index_t X             = 3;  // filter W
+static constexpr ck::index_t Di            = 14; // input D
+static constexpr ck::index_t Hi            = 14; // input H
+static constexpr ck::index_t Wi            = 14; // input W
+static constexpr ck::index_t Do            = 14; // output D
+static constexpr ck::index_t Ho            = 14; // output H
+static constexpr ck::index_t Wo            = 14; // output W
+
+// Owns a single raw device allocation: hipMalloc in the constructor, hipFree in the
+// destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Non-copyable: a copy would make two objects call hipFree on the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Allocates mem_size bytes of device memory. The hipMalloc status is discarded,
+    // so callers get no indication of an allocation failure.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // Returns the raw device pointer; ownership stays with this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Profiles every DeviceGroupedConvFwdMultipleD instance (3D grouped forward convolution
+// with a ScaleAddScaleAddRelu output element-wise op) on the fixed problem size declared
+// above, prints per-instance timing, then runs the instance with the highest TFlops once
+// more without timing.
+// Returns EXIT_FAILURE when no instance supports the problem, 0 otherwise.
+int execute_conv_fwd_scaleadd_scaleadd_relu()
+{
+    // We have NDHWGC/GKZYXC/NDHWGK (x, weight, y) in memory space.
+    // However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
+    // Hence, we need to adjust the order of strides.
+    std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
+    std::array<ck::index_t, 6> in_strides{
+        C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
+    std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
+    std::array<ck::index_t, 6> wei_strides{
+        K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
+    std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
+    // The output tensor is NDHWGK, so its innermost extent is K (output channels), not C.
+    std::array<ck::index_t, 6> out_strides{
+        K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
+
+    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
+    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
+
+    SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
+    // The two auxiliary D tensors have the same number of elements as the output.
+    SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
+    SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * N * Do * Ho * Wo * G * K);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+        NumDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<OutLayout, OutLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<std::tuple_element_t<0, DDataTypes>, std::tuple_element_t<1, DDataTypes>>,
+        OutDataType,
+        PassThrough,
+        PassThrough,
+        ScaleAddScaleAddRelu>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                        wei.GetDeviceBuffer(),
+                                        {d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
+                                        out.GetDeviceBuffer(),
+                                        in_lengths,
+                                        in_strides,
+                                        wei_lengths,
+                                        wei_strides,
+                                        {out_lengths, out_lengths},
+                                        {out_strides, out_strides},
+                                        out_lengths,
+                                        out_strides,
+                                        filter_strides,
+                                        filter_dilations,
+                                        input_left_pads,
+                                        input_right_pads,
+                                        PassThrough{},
+                                        PassThrough{},
+                                        ScaleAddScaleAddRelu{2.f, 2.f});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            // 2 flops per multiply-accumulate over the full 3D output volume and 3D filter
+            // window, plus 2 flops per output element for the element-wise epilogue (same
+            // per-element estimate as before). Both terms are evaluated in std::size_t so the
+            // products cannot wrap in 32-bit index arithmetic.
+            std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Z * Y * X +
+                               std::size_t(2) * N * Do * Ho * Wo * G * K;
+            // Bytes moved: input, weight, and the three output-sized tensors (out, d0, d1),
+            // each including the depth dimension.
+            std::size_t num_bytes =
+                sizeof(InDataType) * N * Di * Hi * Wi * G * C +
+                sizeof(WeiDataType) * G * K * Z * Y * X * C +
+                (sizeof(OutDataType) + sizeof(std::tuple_element_t<0, DDataTypes>) +
+                 sizeof(std::tuple_element_t<1, DDataTypes>)) *
+                    N * Do * Ho * Wo * G * K;
+
+            // avg_time is in milliseconds, hence 1e9 (not 1e12) for TFlops and 1e6 for GB/s.
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                        wei.GetDeviceBuffer(),
+                                        {d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
+                                        out.GetDeviceBuffer(),
+                                        in_lengths,
+                                        in_strides,
+                                        wei_lengths,
+                                        wei_strides,
+                                        {out_lengths, out_lengths},
+                                        {out_strides, out_strides},
+                                        out_lengths,
+                                        out_strides,
+                                        filter_strides,
+                                        filter_dilations,
+                                        input_left_pads,
+                                        input_right_pads,
+                                        PassThrough{},
+                                        PassThrough{},
+                                        ScaleAddScaleAddRelu{2.f, 2.f});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp
+++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+
+using InDataType  = ck::bhalf_t;
+using WeiDataType = ck::bhalf_t;
+using OutDataType = ck::bhalf_t;
+// Use std tuple instead of ck tuple to avoid clang
+// implicit instantiation of undefined template error.
+using DDataTypes = std::tuple<ck::bhalf_t, ck::bhalf_t>;
+
+#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
+
+// Entry point for the bf16 variant: forwards the exit status of the example routine
+// defined in the included .inc file.
+int main()
+{
+    const int exit_status = execute_conv_fwd_scaleadd_scaleadd_relu();
+    return exit_status;
+}
--- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp
+++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+// Use std tuple instead of ck tuple to avoid clang
+// implicit instantiation of undefined template error.
+using DDataTypes = std::tuple<ck::half_t, ck::half_t>;
+
+#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
+
+// Entry point for the fp16 variant: forwards the exit status of the example routine
+// defined in the included .inc file.
+int main()
+{
+    const int exit_status = execute_conv_fwd_scaleadd_scaleadd_relu();
+    return exit_status;
+}
--- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp
+++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+
+using InDataType  = float;
+using WeiDataType = float;
+using OutDataType = float;
+// Use std tuple instead of ck tuple to avoid clang
+// implicit instantiation of undefined template error.
+using DDataTypes = std::tuple<float, float>;
+
+#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
+
+// Entry point for the fp32 variant: forwards the exit status of the example routine
+// defined in the included .inc file.
+int main()
+{
+    const int exit_status = execute_conv_fwd_scaleadd_scaleadd_relu();
+    return exit_status;
+}
--- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp
+++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/tuple.hpp"
+
+using InDataType  = int8_t;
+using WeiDataType = int8_t;
+using OutDataType = int8_t;
+// Use std tuple instead of ck tuple to avoid clang
+// implicit instantiation of undefined template error.
+using DDataTypes = std::tuple<float, float>;
+
+#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
+
+// Entry point for the int8 variant: forwards the exit status of the example routine
+// defined in the included .inc file.
+int main()
+{
+    const int exit_status = execute_conv_fwd_scaleadd_scaleadd_relu();
+    return exit_status;
+}
--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
@@ -70,6 +70,7 @@ else()
            -Wno-option-ignored
            -Wsign-compare
            -Wno-extra-semi-stmt
+            -Wno-unused-template
        )
        if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang")
            list(APPEND CMAKE_COMPILER_WARNINGS

--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
 rocm-docs-core>=0.20.0
-sphinxcontrib-bibtex==2.5.0
+sphinxcontrib-bibtex==2.6.1
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==0.24.0
+rocm-docs-core==0.26.0
    # via -r requirements.in
 six==1.16.0
    # via
@@ -139,7 +139,7 @@ sphinx-notfound-page==0.8.3
    # via rocm-docs-core
 sphinxcontrib-applehelp==1.0.4
    # via sphinx
-sphinxcontrib-bibtex==2.5.0
+sphinxcontrib-bibtex==2.6.1
    # via -r requirements.in
 sphinxcontrib-devhelp==1.0.2
    # via sphinx

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
 add_custom_target(example_gemm_dl)

 add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_dl example_gemm_dl_fp32)
-endif()
+add_example_dependencies(example_gemm_dl example_gemm_dl_fp32)
+
 add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_dl example_gemm_dl_fp16)
-endif()
+add_example_dependencies(example_gemm_dl example_gemm_dl_fp16)
+
 add_example_executable(example_gemm_dpp_fp16 gemm_dpp_fp16.cpp)
+
 add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_dl example_gemm_dl_int8)
-endif()
+add_example_dependencies(example_gemm_dl example_gemm_dl_int8)
 if(USE_BITINT_EXTENSION_INT4)
    add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp)
-    add_dependencies(example_gemm_dl example_gemm_dl_int4)
+    add_example_dependencies(example_gemm_dl example_gemm_dl_int4)
 endif(USE_BITINT_EXTENSION_INT4)

 add_custom_target(example_gemm_xdl)
 add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
-endif()
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
+
 add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
-endif()
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
+
 add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
-endif()
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
 if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
    add_custom_target(example_gemm_wmma)
    add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
-    if(result EQUAL 0)
-      add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
-    endif()
+    add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
 endif()

 add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
-if(result EQUAL 0)
-  add_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16)

-  add_example_executable(example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp)
-  add_dependencies(example_gemm_xdl example_gemm_xdl_bf16_rtn)
-endif()
+add_example_executable(example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp)
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_rtn)

 add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
-if(result EQUAL 0)
-  add_dependencies(example_gemm_xdl example_gemm_xdl_int8)
-endif()
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_int8)

 if(USE_BITINT_EXTENSION_INT4)
-  add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp)
-  add_dependencies(example_gemm_xdl example_gemm_xdl_int4)
+    add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp)
+    add_example_dependencies(example_gemm_xdl example_gemm_xdl_int4)
 endif(USE_BITINT_EXTENSION_INT4)

 # FIXME: re-enable this example as a test when SWDEV-335738 is fixed
 add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
-if(result EQUAL 0)
-  add_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
-endif()
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64)

 add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)

-
 add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
-endif()
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)

 add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
-endif()
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)

 add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)
-if(result EQUAL 0)
-    add_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
-endif()
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-    add_custom_target(example_gemm_add_add_fastgelu_xdl)
-    add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
-    if(result EQUAL 0)
-      add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
+        add_custom_target(example_gemm_add_add_fastgelu_xdl)
+        add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
+        add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
+
+        add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
+        add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
+
+        add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
+        add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
+
+        if(USE_BITINT_EXTENSION_INT4)
+            add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
+            add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
+        endif(USE_BITINT_EXTENSION_INT4)
+
+        add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
+        add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
+        set(target 1)
    endif()
-    add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
-    if(result EQUAL 0)
-      add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
-    endif()
-    add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
-    if(result EQUAL 0)
-      add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
-    endif()
-    if(USE_BITINT_EXTENSION_INT4)
-       add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
-       add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
-    endif(USE_BITINT_EXTENSION_INT4)
-    add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
-    if(result EQUAL 0)
-      add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
-    endif()
-   set(target 1)
- endif()
-endforeach()
\ No newline at end of file
+endforeach()