Merge remote-tracking branch 'origin/develop' into migx_merge

3c4fb1dd · Umang Yadav · 57cdd70b · e8cddfdc · 3c4fb1dd · 3c4fb1dd
Commit 3c4fb1dd authored Nov 23, 2023 by Umang Yadav
20 changed files
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
+# Documentation files
+docs/* @saadrahim @LisaDelaney
+*.md  @saadrahim @LisaDelaney
+*.rst  @saadrahim @LisaDelaney
+# Header directory
+library/include/*  @saadrahim @LisaDelaney
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
-# Change Log for Composable Kernel
+# Changelog for Composable Kernel
 Full documentation for Composable Kernel is not yet available.
-## CK 0.2.0 for ROCm 5.5.0
+## (Unreleased) CK for ROCm 6.0.0
-### Fixed
+### Fixes
- Fixed a bug in 6-dimensional kernels (#555).
+ - Fixed a hazard associated with inline v_dot (#808)
- Fixed grouped ConvBwdWeight test case failure (#524).
+ - Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
 ### Optimizations
- Improve proformance of normalization kernel
+None
-### Added
+### Additions
- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances.
+- Added an image to a column kernel (#867)
- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types.
+- Added a column to an image kernel (#930)
- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler.
+- Support for 3D grouped convolution on RDNA 3 GPUs (#935, #950, #985)
- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler.
+- Grouped convolution support for small K and C (#822 #879 #897)
- Added support on MI300A/MI300X.
+- Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
- Added support on NAVI3x.
+- Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
- Added user tutorial (#563).
+- Support for Batched Gemm DL (#732)
- Added more instances for irregular GEMM sizes (#560).
- Added inter-wave consumer-producer programming model for GEMM kernels (#310).
+### Changes
- Added multi-D GEMM client APIs (#534).
+ - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
- Added multi-embeddings support (#542).
- Added Navi3x blockwise GEMM and real GEMM support (#541).
+## CK 0.2.0 for ROCm 5.7.0
- Added Navi grouped ConvBwdWeight support (#505).
- Added MaxPool, AvgPool forward (#815).
+### Fixes
- Added MaxPool backward (#750).
+- Fixed a bug in 6-dimensional kernels (#555)
+- Fixed a test case failure with grouped convolution backward weight (#524)
-### Changed
- Changed ...
+### Optimizations
+- Improved the performance of the normalization kernel
+### Additions
+- New CMake flags:
+  - "DL_KERNELS" -- Must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances
+  - "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
+  - "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
+- New feature: if GPU_TARGETS is not set in the CMake command line, CK will be built for all targets supported by the compiler
+- Support for MI300A/MI300X
+- Support for AMD RDNA 3
+- New user tutorial (#563)
+- Additional instances for irregular GEMM sizes (#560)
+- New inter-wave consumer-producer programming model for GEMM kernels (#310)
+- GEMM with support for multiple elementwise fusions (multi-D) (#534)
+- Multi-embeddings support (#542)
+- AMD RDNA 3 blockwise GEMM and real GEMM support (#541)
+- AMD RDNA grouped convolution backward weight support (#505)
+- MaxPool and AvgPool forward (#815); MaxPool backward (#750)
+### Changes
+None
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.14)
+if(POLICY CMP0140)
+  # Policy CMP0140 is not known to CMake versions older than 3.25
+  cmake_policy(SET CMP0140 NEW)
+endif()
+# This has to be initialized before the project() command appears
+# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
+if( NOT MSVC_IDE AND NOT CMAKE_BUILD_TYPE )
+    set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
+endif()
+# Default installation path
+if(WIN32)
+    set(CMAKE_INSTALL_PREFIX "/opt/rocm/x86_64-w64-mingw32" CACHE PATH "")
+else()
+    set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+endif()
 set(version 1.1.0)
 # Check support for CUDA/HIP in Cmake
@@ -16,6 +33,10 @@ if (DTYPES)
        add_definitions(-DCK_ENABLE_FP8)
        set(CK_ENABLE_FP8 "ON")
    endif()
+    if (DTYPES MATCHES "bf8")
+        add_definitions(-DCK_ENABLE_BF8)
+        set(CK_ENABLE_BF8 "ON")
+    endif()
    if (DTYPES MATCHES "fp16")
        add_definitions(-DCK_ENABLE_FP16)
        set(CK_ENABLE_FP16 "ON")
@@ -34,10 +55,13 @@ if (DTYPES)
    endif()
    message("DTYPES macro set to ${DTYPES}")
 else()
-    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
+    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
    set(CK_ENABLE_ALL_DTYPES "ON")
 endif()
+# Needed for the f8/bf8_t types: silence the compiler's bit-int-extension warning
+add_compile_options(-Wno-bit-int-extension)
 if(DL_KERNELS)
    add_definitions(-DDL_KERNELS)
    set(CK_ENABLE_DL_KERNELS "ON")
@@ -82,26 +106,30 @@ message("checking which targets are supported")
 #Setting GPU_TARGETS on command line will override this list
 if(NOT PROFILER_ONLY)
    rocm_check_target_ids(DEFAULT_GPU_TARGETS
-        TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
+        TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
 else()
    add_definitions(-DPROFILER_ONLY)
+    set(GPU_TARGETS "" CACHE STRING "" FORCE)
    if(GPU_TARGETS)
-        message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11")
+        message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx90, gfx94, gfx10, or gfx11")
    endif()
-    if(GPU_ARCH MATCHES "gfx9")
+    if(GPU_ARCH MATCHES "gfx90")
-        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942")
+        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx908;gfx90a")
+    elseif(GPU_ARCH MATCHES "gfx94")
+        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx940;gfx941;gfx942")
    elseif(GPU_ARCH MATCHES "gfx10")
        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
    elseif(GPU_ARCH MATCHES "gfx11")
        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
    else()
-        message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11")
+        message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx90, gfx94, gfx10, or gfx11")
    endif()
+    set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
 endif()
 message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
-set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ")
+set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
 if(GPU_TARGETS)
    message("Building CK for the following targets: ${GPU_TARGETS}")
@@ -368,16 +396,18 @@ include_directories(BEFORE
    ${HIP_INCLUDE_DIRS}
 )
-add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
 if (NOT CK_BUILD_JIT_LIB)
    SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
    if(BUILD_DEV)
-        add_compile_options(-Werror)
+        add_compile_options(-Werror -Weverything)
-        add_compile_options(-Weverything)
    endif()
+    #add flags to reduce the size of binaries
+    add_compile_options(-Oz -flto=thin)
    message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
    file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
    file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
    set(CK_DEVICE_INSTANCES)
@@ -387,32 +417,28 @@ if (NOT CK_BUILD_JIT_LIB)
        set(cmake_instance)
        file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
        set(add_inst 0)
-        if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8")
+        if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
-            #message("fp8 instance found!")
            set(add_inst 1)
        endif()
-        if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16")
+        if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
-            #message("fp16 instance found!")
            set(add_inst 1)
        endif()
-        if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32")
+        if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
-            #message("fp32 instance found!")
            set(add_inst 1)
        endif()
-        if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64")
+        if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
-            #message("fp64 instance found!")
            set(add_inst 1)
        endif()
-        if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16")
+        if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
-            #message("bf16 instance found!")
            set(add_inst 1)
        endif()
-        if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8")
+        if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
-            #message("int8 instance found!")
+            set(add_inst 1)
+        endif()
+        if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
            set(add_inst 1)
        endif()
        if(NOT "${cmake_instance}" MATCHES "DTYPES")
-            #message("instance should be built for all types!")
            set(add_inst 1)
        endif()
        if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
@@ -421,41 +447,41 @@ if (NOT CK_BUILD_JIT_LIB)
    ENDIF()
    ENDFOREACH()
-    add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES}  SOURCES ${INSTANCE_FILES})
+        add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES}  SOURCES ${INSTANCE_FILES})
-    add_subdirectory(library)
+        add_subdirectory(library)
-    if(NOT DEFINED INSTANCES_ONLY)
+        if(NOT DEFINED INSTANCES_ONLY)
-    if(NOT DEFINED PROFILER_ONLY)
+        if(NOT DEFINED PROFILER_ONLY)
-    rocm_package_setup_component(tests
+        rocm_package_setup_component(tests
-            LIBRARY_NAME composablekernel
+                LIBRARY_NAME composablekernel
-            PACKAGE_NAME tests # Prevent -static suffix on package name
+                PACKAGE_NAME tests # Prevent -static suffix on package name
-    )
+        )
-    rocm_package_setup_component(examples
+        rocm_package_setup_component(examples
-            LIBRARY_NAME composablekernel
+                LIBRARY_NAME composablekernel
-            PACKAGE_NAME examples
+                PACKAGE_NAME examples
-    )
+        )
-    add_subdirectory(example)
+        add_subdirectory(example)
-    add_subdirectory(test)
+        add_subdirectory(test)
    rocm_package_setup_component(profiler
            LIBRARY_NAME composablekernel
-            PACKAGE_NAME ckProfiler
+            PACKAGE_NAME ckprofiler
    )
    add_subdirectory(profiler)
    else()
        #When building PROFILER_ONLY, label the package with GPU_ARCH
        rocm_package_setup_component(profiler
        LIBRARY_NAME composablekernel
-        PACKAGE_NAME ckProfiler_${GPU_ARCH}
+        PACKAGE_NAME ckprofiler_${GPU_ARCH}
        )
        add_subdirectory(profiler)
    endif()
    endif()
 else()
    rocm_package_setup_component(jit_library
-        LIBRARY_NAME composablekernel
+    LIBRARY_NAME composablekernel
-        PACKAGE_NAME jit_library
+    PACKAGE_NAME jit_library
    )
    add_subdirectory(library)
    add_subdirectory(test)
@@ -483,7 +509,7 @@ rocm_install(FILES
 )
 # Install CK version and configuration files
-install(FILES
+rocm_install(FILES
    ${PROJECT_BINARY_DIR}/include/ck/version.h
    ${PROJECT_BINARY_DIR}/include/ck/config.h
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/

--- a/Dockerfile
+++ b/Dockerfile
 FROM ubuntu:20.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=5.6
+ARG ROCMVERSION=5.7
 ARG compiler_version=""
 ARG compiler_commit=""
@@ -16,52 +16,52 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
-RUN wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/focal/amdgpu-install_5.6.50600-1_all.deb  --no-check-certificate
+RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb  --no-check-certificate
-RUN apt-get update && \
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_5.7.50700-1_all.deb
-DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    ./amdgpu-install_5.6.50600-1_all.deb
+RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+    sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
-RUN if [ "$ROCMVERSION" != "5.7" ]; then \
+    sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'
-        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
-        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
-        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
-        apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
-        amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
-    elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "rc1" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
-        apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
-        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.7 rel-19 > /etc/apt/sources.list.d/rocm-build.list' && \
-        amdgpu-repo --amdgpu-build=1637781; \
-    fi
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
 RUN amdgpu-install -y --usecase=rocm --no-dkms
+## Sccache binary built from source for ROCm
+ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
+ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
+RUN mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
+curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
+chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
+ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    build-essential \
-    ccache \
    cmake \
+    ccache \
    git \
    hip-rocclr \
+    iputils-ping \
    jq \
    libelf-dev \
    libncurses5-dev \
    libnuma-dev \
    libpthread-stubs0-dev \
    llvm-amdgpu \
+    net-tools \
    pkg-config \
    python \
    python3 \
    python3-dev \
    python3-pip \
+    redis \
    sshpass \
+    stunnel \
    software-properties-common \
    vim \
    nano \
    zlib1g-dev \
+    zip \
    openssh-server \
    clang-format-12 \
    kmod && \
@@ -73,15 +73,8 @@ RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releas
 RUN gunzip /usr/local/bin/ninja.gz
 RUN chmod a+x /usr/local/bin/ninja
 RUN git clone https://github.com/nico/ninjatracing.git
-RUN apt purge --auto-remove -y cmake
+# Upgrade CMake to the pinned release (3.27.5) via pip
-RUN apt update
+RUN pip install --upgrade cmake==3.27.5
-RUN apt install -y software-properties-common lsb-release
-RUN apt clean all
-RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
-RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
-RUN apt install -y kitware-archive-keyring
-RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
-RUN apt install -y cmake
 # Setup ubsan environment to printstacktrace
 RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
@@ -96,9 +89,9 @@ ARG PREFIX=/opt/rocm
 RUN pip3 install --upgrade pip
 RUN pip3 install sqlalchemy==1.4.46
 RUN pip3 install pymysql
-RUN pip3 install pandas
+RUN pip3 install pandas==2.0.3
 RUN pip3 install setuptools-rust
-RUN pip3 install sshtunnel
+RUN pip3 install sshtunnel==0.4.0
 # Setup ubsan environment to printstacktrace
 ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -121,7 +114,7 @@ RUN sh -c "echo compiler commit = '$compiler_commit'"
 RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
        git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
        cd llvm-project && mkdir build && cd build && \
-        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
        make -j 8 ; \
    else echo "using the release compiler"; \
    fi
@@ -129,11 +122,13 @@ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ];
 RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
        git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
-        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
        make -j 8 ; \
    else echo "using the release compiler"; \
    fi
+# Clean up the downloaded amdgpu-install deb package
+RUN sh -c "rm -rf amdgpu-install*"
 #ENV HIP_CLANG_PATH='/llvm-project/build/bin'
 #RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -33,7 +33,7 @@ def runShell(String command){
 def getDockerImageName(){
    def img
-    if (params.ROCMVERSION != "5.7"){
+    if (params.ROCMVERSION != "6.0"){
       if (params.COMPILER_VERSION == "") {
           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
       }
@@ -65,10 +65,10 @@ def getDockerImageName(){
 }
 def check_host() {
-    if ("${env.CK_CCACHE}" != "null"){
+    if ("${env.CK_SCCACHE}" != "null"){
-        def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}"
+        def SCCACHE_SERVER="${env.CK_SCCACHE.split(':')[0]}"
-        echo "ccache server: ${CCACHE_SERVER}"
+        echo "sccache server: ${SCCACHE_SERVER}"
-        sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt'''
+        sh '''ping -c 1 -p 6379 "${SCCACHE_SERVER}" | echo $? > tmp.txt'''
        def output = readFile(file: "tmp.txt")
        echo "tmp.txt contents: \$output"
        return (output != "0")
@@ -96,24 +96,9 @@ def build_compiler(){
 def getDockerImage(Map conf=[:]){
    env.DOCKER_BUILDKIT=1
-    def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
+    def prefixpath = conf.get("prefixpath", "/opt/rocm")
    def no_cache = conf.get("no_cache", false)
    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    echo "ccache server: ${env.CK_CCACHE}"
-    if(env.CK_CCACHE)
-    {
-        if(check_host())
-        {
-            echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
-        }
-        else 
-        {
-            echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
-        }
-        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
-        env.CCACHE_DIR = """/tmp/ccache_store"""
-        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
-    }
    if(no_cache)
    {
        dockerArgs = dockerArgs + " --no-cache "
@@ -142,21 +127,6 @@ def buildDocker(install_prefix){
    def image_name = getDockerImageName()
    echo "Building Docker for ${image_name}"
    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    echo "ccache server: ${env.CK_CCACHE}"
-    if(env.CK_CCACHE)
-    {
-        if(check_host())
-        {
-            echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
-        }
-        else 
-        {
-            echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
-        }
-        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
-        env.CCACHE_DIR = """/tmp/ccache_store"""
-        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
-    }
    echo "Build Args: ${dockerArgs}"
    try{
@@ -169,7 +139,7 @@ def buildDocker(install_prefix){
        else{
            echo "Checking for image: ${image_name}"
            sh "docker manifest inspect --insecure ${image_name}"
-            echo "Image: ${image_name} found!! Skipping building image"
+            echo "Image: ${image_name} found! Skipping building image"
        }
    }
    catch(Exception ex){
@@ -210,19 +180,18 @@ def cmake_build(Map conf=[:]){
    } else{
        setup_args = ' -DBUILD_DEV=On' + setup_args
    }
+    if (params.DL_KERNELS){
+        setup_args = setup_args + " -DDL_KERNELS=ON "
+    }
    if(build_type_debug){
        setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args
    }else{
        setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
    }
-    if(env.CK_CCACHE)
-    {
-        setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
-    }
-    echo "ccache server: ${env.CK_CCACHE}"
    def pre_setup_cmd = """
+            #!/bin/bash
            echo \$HSA_ENABLE_SDMA
            ulimit -c unlimited
            rm -rf build
@@ -231,6 +200,60 @@ def cmake_build(Map conf=[:]){
            mkdir install
            cd build
        """
+    def invocation_tag=""
+    if (setup_args.contains("gfx11")){
+        invocation_tag="gfx11"
+    }
+    if (setup_args.contains("gfx10")){
+        invocation_tag="gfx10"
+    }
+    if (setup_args.contains("gfx90")){
+        invocation_tag="gfx90"
+    }
+    if (setup_args.contains("gfx94")){
+        invocation_tag="gfx94"
+    }
+    echo "invocation tag: ${invocation_tag}"
+    def redis_pre_setup_cmd = pre_setup_cmd
+    if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
+        redis_pre_setup_cmd = pre_setup_cmd + """
+            #!/bin/bash
+            export ROCM_PATH=/opt/rocm
+            export SCCACHE_ENABLED=true
+            export SCCACHE_LOG_LEVEL=debug
+            export SCCACHE_IDLE_TIMEOUT=14400
+            export COMPILERS_HASH_DIR=/tmp/.sccache
+            export SCCACHE_BIN=/usr/local/.cargo/bin/sccache
+            export SCCACHE_EXTRAFILES=/tmp/.sccache/rocm_compilers_hash_file
+            export SCCACHE_REDIS="redis://${env.CK_SCCACHE}"
+            echo "connect = ${env.CK_SCCACHE}" >> ../script/redis-cli.conf
+            export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
+            echo \$SCCACHE_C_CUSTOM_CACHE_BUSTER
+            stunnel ../script/redis-cli.conf
+            ../script/sccache_wrapper.sh --enforce_redis
+        """
+        try {
+            def cmd1 = conf.get("cmd1", """
+                    ${redis_pre_setup_cmd}
+                """)
+            sh cmd1
+            setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
+        }
+        catch(Exception err){
+            echo "could not connect to redis server: ${err.getMessage()}. will not use sccache."
+            def cmd2 = conf.get("cmd2", """
+                    ${pre_setup_cmd}
+                """)
+            sh cmd2
+        }
+    }
+    else{
+        def cmd3 = conf.get("cmd3",  """
+                ${pre_setup_cmd}
+            """)
+        sh cmd3
+    }
    def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args}   .. ")
    // reduce parallelism when compiling, clang uses too much memory
    def nt = nthreads()
@@ -238,17 +261,19 @@ def cmake_build(Map conf=[:]){
    def execute_cmd = conf.get("execute_cmd", "")
    def cmd = conf.get("cmd", """
-            ${pre_setup_cmd}
            ${setup_cmd}
            ${build_cmd}
            ${execute_cmd}
        """)
    echo cmd
-    sh cmd
+    dir("build"){
+        sh cmd
+    }
    // Only archive from master or develop
-    if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) {
+    if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
        archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
    }
 }
@@ -367,8 +392,6 @@ def runCKProfiler(Map conf=[:]){
            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
                timeout(time: 24, unit: 'HOURS')
                {
-                    //cmake_build(conf)
-                    //instead of building, just unstash the ckProfiler and install it
                    sh """
                        rm -rf build
                        mkdir build
@@ -525,6 +548,26 @@ def Build_CK(Map conf=[:]){
                           stash "ckprofiler_0.2.0_amd64.deb"
                        }
                    }
+                    if (params.hipTensor_test && navi_node == 0 ){
+                        //build and test hipTensor
+                        sh """#!/bin/bash
+                            rm -rf "${params.hipTensor_branch}".zip
+                            rm -rf hipTensor-"${params.hipTensor_branch}"
+                            wget https://github.com/ROCmSoftwarePlatform/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
+                            unzip -o "${params.hipTensor_branch}".zip
+                        """
+                        dir("hipTensor-${params.hipTensor_branch}"){
+                            sh """#!/bin/bash
+                                mkdir -p build
+                                ls -ltr
+                                CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="/opt/rocm;${env.WORKSPACE}/install"
+                                cmake --build build -- -j
+                            """
+                        }
+                        dir("hipTensor-${params.hipTensor_branch}/build"){
+                            sh 'ctest'
+                        }
+                    }
                }
            }
        }
@@ -612,9 +655,9 @@ def process_results(Map conf=[:]){
 }
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=rc1
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
-                                              0 21 * * * % ROCMVERSION=5.6;COMPILER_VERSION=;COMPILER_COMMIT=
+                                              0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
-                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
+                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : ""
 pipeline {
    agent none
@@ -631,8 +674,8 @@ pipeline {
            description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
        string(
            name: 'ROCMVERSION', 
-            defaultValue: '5.6', 
+            defaultValue: '5.7', 
-            description: 'Specify which ROCM version to use: 5.6 (default).')
+            description: 'Specify which ROCM version to use: 5.7 (default).')
        string(
            name: 'COMPILER_VERSION', 
            defaultValue: '', 
@@ -649,6 +692,22 @@ pipeline {
            name: "RUN_FULL_QA",
            defaultValue: false,
            description: "Select whether to run small set of performance tests (default) or full QA")
+        booleanParam(
+            name: "DL_KERNELS",
+            defaultValue: false,
+            description: "Select whether to build DL kernels (default: OFF)")
+        booleanParam(
+            name: "hipTensor_test",
+            defaultValue: true,
+            description: "Use the CK build to verify hipTensor build and tests (default: ON)")
+        string(
+            name: 'hipTensor_branch',
+            defaultValue: 'develop',
+            description: 'Specify which branch of hipTensor to use (default: develop)')
+        booleanParam(
+            name: "USE_SCCACHE",
+            defaultValue: true,
+            description: "Use the sccache for building CK (default: ON)")
    }
    environment{
        dbuser = "${dbuser}"
@@ -663,15 +722,12 @@ pipeline {
    }
    stages{
        stage("Build Docker"){
-            //when {
-            //    beforeAgent true
-            //    expression { params.BUILD_DOCKER.toBoolean() }
-            //}
            parallel{
                stage('Docker /opt/rocm'){
                    agent{ label rocmnode("nogpu") }
                    steps{
                        buildDocker('/opt/rocm')
+                        cleanWs()
                    }
                }
            }
@@ -693,6 +749,7 @@ pipeline {
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+                        cleanWs()
                    }
                }
            }
@@ -710,11 +767,12 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx908 || gfx90a") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        cleanWs()
                    }
                }
                stage("Build CK and run Tests on MI100/MI200")
@@ -730,6 +788,7 @@ pipeline {
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        cleanWs()
                    }
                }
                stage("Build CK and run Tests on Navi21")
@@ -742,10 +801,10 @@ pipeline {
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """ 
                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        cleanWs()
                    }
                }
                stage("Build CK and run Tests on Navi32")
@@ -756,12 +815,12 @@ pipeline {
                    }
                    agent{ label rocmnode("navi32") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DDTYPES="fp16;fp32;bf16" -DGPU_TARGETS="gfx1101" """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDTYPES="fp16;fp32;bf16" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        cleanWs()
                    }
                }
            }
@@ -784,6 +843,7 @@ pipeline {
                   }
                    steps{
                        runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
+                        cleanWs()
                    }
                }
                stage("Run ckProfiler: gfx90a")
@@ -799,6 +859,7 @@ pipeline {
                    }
                    steps{
                        runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
+                        cleanWs()
                    }
                }
            }
@@ -811,6 +872,7 @@ pipeline {
                    agent { label 'mici' }
                    steps{
                        process_results()
+                        cleanWs()
                    }
                }
            }

--- a/README.md
+++ b/README.md
 # Composable Kernel
-## Methodology
+The Composable Kernel (CK) library provides a programming model for writing performance-critical
+kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
+uses general purpose kernel languages, such as HIP C++.
-Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
+CK uses two concepts to achieve performance portability and code maintainability:
-CK utilizes two concepts to achieve performance portability and code maintainability:
 * A tile-based programming model
-* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
+* Algorithm complexity reduction for complex machine learning (ML) operators. This uses an innovative
+   technique called *Tensor Coordinate Transformation*.
 ![ALT](/docs/data/ck_component.png "CK Components")
-## Code Structure
+The current CK library is structured into four layers:
-Current CK library are structured into 4 layers:
+* Templated Tile Operators
-* "Templated Tile Operators" layer
+* Templated Kernel and Invoker
-* "Templated Kernel and Invoker" layer
+* Instantiated Kernel and Invoker
-* "Instantiated Kernel and Invoker" layer
+* Client API
-* "Client API" layer
 ![ALT](/docs/data/ck_layer.png "CK Layers")
-## Documentation
+## General information
-Run the steps below to build documentation locally.
+To build our documentation locally, use the following code:
-```
+``` bash
 cd docs
 pip3 install -r sphinx/requirements.txt
 python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
 ```
-## Contributors
+You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
-The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
-## Citation
+```note
+If you use CK, cite us as follows:
-If you use CK, please use following citations:
+* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
-* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
+  This paper will be available on arXiv soon.
 * [CITATION.cff](/CITATION.cff)
+```
-## License
+CK is released under the **[MIT license](/LICENSE)**.
-CK is released under the MIT license. [License File](/LICENSE)
+## Building CK
+We recommend building CK inside Docker containers, which include all necessary packages. Pre-built
+Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composable_kernel/tags).
-# Build CK
+1. To build a new Docker image, use the Dockerfile provided with the source code:
-## Build docker image
+    ```bash
+    DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
+    ```
-```bash
+2. Launch the Docker container:
-DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
-```
-Pre-built dockers are available from this public repo: 
-https://hub.docker.com/r/rocm/composable_kernel/tags
-## Launch docker
+    ```bash
+    docker run                                     \
+    -it                                            \
+    --privileged                                   \
+    --group-add sudo                               \
+    -w /root/workspace                             \
+    -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace  \
+    ck:latest                                      \
+    /bin/bash
+    ```
-```bash
+3. Clone CK source code from the GitHub repository and start the build:
-docker run                                     \
-it                                            \
--privileged                                   \
--group-add sudo                               \
-w /root/workspace                             \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace  \
-ck:latest                                      \
-/bin/bash
-```
-## Build CK
+    ```bash
+    git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \
+    cd composable_kernel && \
+    mkdir build && \
+    cd build
+    ```
-```bash
+    You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s) you want
-mkdir build && cd build
+    to run CK on. You can specify single or multiple architectures. If you specify multiple architectures,
+    use a semicolon between each; for example, `gfx908;gfx90a;gfx940`.
-# Need to specify target ID, example below is for gfx908 and gfx90a
+    ```bash
+    cmake                                                                                             \
+    -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
+    -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
+    -D CMAKE_BUILD_TYPE=Release                                                                       \
+    -D GPU_TARGETS="gfx908;gfx90a"                                                                    \
+    ..
+    ```
-cmake                                                                                             \
+    If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets
-D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
+    supported by the current compiler (this may take a long time).
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
-D CMAKE_BUILD_TYPE=Release                                                                       \
-D GPU_TARGETS="gfx908;gfx90a"                                                                    \
-..
-```
-If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the 
+4. Build the entire CK library:
-current compiler.
+    ```bash
+    make -j
+    ```
-Additional cmake flags can be used to significantly speed-up the build:
+5. Install CK:
-INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library
+    ```bash
-while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency.
+    make -j install
+    ```
-DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances 
+## Optional post-install steps
-of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works).
-DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl 
+* Build examples and tests:
-instances. Those instances are only needed for the NAVI2x platforms.
-### Build examples and tests
+    ```bash
+    make -j examples tests
+    ```
-```bash
+* Build and run all examples and tests:
- make -j examples tests
- make test
+    ```bash
-```
+    make -j check
+    ```
-Instructions for running each individual examples are under [example](/example)
+    You can find instructions for running each individual example in [example](/example).
+* Build ckProfiler:
-## Build ckProfiler
+    ```bash
+    make -j ckProfiler
+    ```
+    You can find instructions for running ckProfiler in [profiler](/profiler).
+Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
+Depending on the number of CPU cores and the amount of RAM on your system, you may want to
+limit the number of threads. For example, suppose you have a 128-core CPU and 64 GB of RAM:
+by default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
+crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
+Additional cmake flags can be used to significantly speed up the build:
+* `INSTANCES_ONLY` (default is OFF) must be set to ON in order to build only the instances and library
+  while skipping all tests, examples, and profiler. This is useful in cases when you plan to use CK as a
+  dependency and don't plan to run any examples or tests.
+* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
+  instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
+  other data types.
+* `DL_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dl` or
+  `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
+  other platforms have faster instances, such as `xdl` or `wmma`, available.
+## Using sccache for building
+The default CK Docker images come with a pre-installed version of sccache, which supports clang
+when it is used as the HIP compiler (`-x hip`). Using sccache can help reduce the time to rebuild code
+from hours to 1-2 minutes. To invoke sccache, first run:
 ```bash
- make -j ckProfiler
+ sccache --start-server
 ```
-Instructions for running ckProfiler are under [profiler](/profiler)
-## Install CK
+then add the following flags to the cmake command line:
 ```bash
-make install
+ -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
 ```
+You may need to clean up the build folder and repeat the cmake and make steps in order to take
+advantage of the sccache during subsequent builds.
 ## Using CK as pre-built kernel library
-Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
+You can find instructions for using CK as a pre-built kernel library in [client_example](/client_example).
-## Contributing
+## Contributing to CK
-When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
+When you contribute to CK, make sure you run `clang-format` on all changed files. We highly
+recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
 ```bash
 sudo script/install_precommit.sh
 ```
-This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
+With this approach, `pre-commit` adds the appropriate hooks to your local repository and
+automatically runs `clang-format` (and possibly additional checks) before any commit is created.
 If you need to uninstall hooks from the repository, you can do so by running the following command:
@@ -141,14 +191,5 @@ If you need to uninstall hooks from the repository, you can do so by running the
 script/uninstall_precommit.sh
 ```
-If for any reason, you need to temporarily disable precommit hooks, you can add the `--no-verify` option to the `git commit` command.
+If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
+`git commit` command.
-## Caveat
-### Kernel Timing and Verification
-CK's own kernel timer will warn up kernel once, and then run it multiple times
-to get average kernel time. For some kernels that use atomic add, this will cause
-output buffer to be accumulated multiple times, causing verification failure.
-To work around it, do not use CK's own timer and do verification at the same time.
-CK's own timer and verification in each example and ckProfiler can be enabled or
-disabled from command line.
--- a/client_example/05_layernorm/CMakeLists.txt
+++ b/client_example/05_layernorm/CMakeLists.txt
-add_executable(client_layernorm2d layernorm2d.cpp)
+add_executable(client_layernorm2d_fwd layernorm2d_fwd.cpp)
-target_link_libraries(client_layernorm2d PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_layernorm2d_fwd PRIVATE composable_kernel::device_operations)
+add_executable(client_layernorm4d_fwd layernorm4d_fwd.cpp)
+target_link_libraries(client_layernorm4d_fwd PRIVATE composable_kernel::device_operations)
--- a/client_example/05_layernorm/layernorm2d.cpp
+++ b/client_example/05_layernorm/layernorm2d.cpp
@@ -7,17 +7,19 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
-using XDataType       = ck::half_t;
+using XDataType              = ck::half_t;
-using GammaDataType   = ck::half_t;
+using GammaDataType          = ck::half_t;
-using BetaDataType    = ck::half_t;
+using BetaDataType           = ck::half_t;
-using YDataType       = ck::half_t;
+using YDataType              = ck::half_t;
-using ComputeDataType = float;
+using SaveMeanInvStdDataType = float;
-using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
+using PassThrough            = ck::tensor_operation::element_wise::PassThrough;
+#define SAVE_MEAN_INV_STD
 constexpr int Rank         = 2;
 constexpr int NumReduceDim = 1;
@@ -50,15 +52,19 @@ int main(int argc, char* argv[])
    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+#ifdef SAVE_MEAN_INV_STD
-    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+    SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * M);
-                                                                       GammaDataType,
+    SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M);
-                                                                       BetaDataType,
+#endif
-                                                                       ComputeDataType,
-                                                                       YDataType,
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
-                                                                       PassThrough,
+                                                                          GammaDataType,
-                                                                       Rank,
+                                                                          BetaDataType,
-                                                                       NumReduceDim>;
+                                                                          YDataType,
+                                                                          SaveMeanInvStdDataType,
+                                                                          PassThrough,
+                                                                          Rank,
+                                                                          NumReduceDim>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -84,14 +90,21 @@ int main(int argc, char* argv[])
                                                        {0, 1},      // gammaStrides
                                                        {0, 1},      // betaStrides
                                                        {Stride, 1}, // yStrides
+                                                        {1},         // save_mean Strides
+                                                        {1},         // save_inv_std Strides
                                                        {1},         // reduceDims
                                                        1e-4,
                                                        x_device_buf.GetDeviceBuffer(),
                                                        gamma_device_buf.GetDeviceBuffer(),
                                                        beta_device_buf.GetDeviceBuffer(),
                                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                                        save_mean_device_buf.GetDeviceBuffer(),
+                                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
                                                        nullptr,
                                                        nullptr,
+#endif
                                                        PassThrough{});
        auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -100,11 +113,19 @@ int main(int argc, char* argv[])
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
            std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
                                   sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;
+#ifdef SAVE_MEAN_INV_STD
+            num_byte += sizeof(SaveMeanInvStdDataType) * M * 2;
+#endif
            float gb_per_sec = num_byte / 1.E6 / ave_time;
            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
@@ -136,23 +157,34 @@ int main(int argc, char* argv[])
        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N},      // lengths
                                                        {Stride, 1}, // xStrides
-                                                        {1},         // gammaStrides
+                                                        {0, 1},      // gammaStrides
-                                                        {1},         // betaStrides
+                                                        {0, 1},      // betaStrides
                                                        {Stride, 1}, // yStrides
+                                                        {1},         // save_mean Strides
+                                                        {1},         // save_inv_std Strides
                                                        {1},         // reduceDims
                                                        1e-4,
                                                        x_device_buf.GetDeviceBuffer(),
                                                        gamma_device_buf.GetDeviceBuffer(),
                                                        beta_device_buf.GetDeviceBuffer(),
                                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                                        save_mean_device_buf.GetDeviceBuffer(),
+                                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
                                                        nullptr,
                                                        nullptr,
+#endif
                                                        PassThrough{});
        auto invoker_ptr = op_ptr->MakeInvokerPointer();
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

--- a/client_example/05_layernorm/layernorm4d_fwd.cpp
+++ b/client_example/05_layernorm/layernorm4d_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using PassThrough            = ck::tensor_operation::element_wise::PassThrough;
+#define SAVE_MEAN_INV_STD
+constexpr int Rank         = 4;
+constexpr int NumReduceDim = 3;
+// Minimal owner of a raw HIP device allocation used by this example.
+// The buffer is allocated in the constructor and released in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    // Requests mem_size bytes from hipMalloc. The returned status is deliberately
+    // discarded, so an allocation failure is not reported here.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    // Non-copyable: a copy would hold the same pointer and both destructors
+    // would pass it to hipFree.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Returns the raw device pointer; ownership stays with this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+// Client example: 4D layernorm forward on an N x H x W x C tensor, normalizing over
+// dimensions {1, 2, 3}. Every available device-op instance is timed, the fastest
+// supported one is remembered, and that instance is then run once more without timing.
+int main(int argc, char* argv[])
+{
+    ck::index_t N = 256;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t C = 8;
+    // Packed strides for x/y; gamma/beta use stride 0 along the first dimension so the
+    // same H*W*C values are reused for every index of that dimension.
+    std::vector<ck::index_t> strideXY             = {H * W * C, W * C, C, 1};
+    std::vector<ck::index_t> strideGammaBeta      = {0, W * C, C, 1};
+    std::vector<ck::index_t> strideSaveMeanInvStd = {1};
+    SimpleDeviceMem x_device_buf(sizeof(XDataType) * N * H * W * C);
+    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * H * W * C);
+    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * H * W * C);
+    SimpleDeviceMem y_device_buf(sizeof(YDataType) * N * H * W * C);
+#ifdef SAVE_MEAN_INV_STD
+    // One saved mean and one saved inverse-std value per index of the first dimension.
+    SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * N);
+    SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * N);
+#endif
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
+                                                                          GammaDataType,
+                                                                          BetaDataType,
+                                                                          YDataType,
+                                                                          SaveMeanInvStdDataType,
+                                                                          PassThrough,
+                                                                          Rank,
+                                                                          NumReduceDim>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    // Convert the size once so the loop compares int with int (best_op_id is an int).
+    const int num_instances = static_cast<int>(op_ptrs.size());
+    for(int i = 0; i < num_instances; ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer({N, H, W, C},         // lengths
+                                        strideXY,             // xStrides
+                                        strideGammaBeta,      // gammaStrides
+                                        strideGammaBeta,      // betaStrides
+                                        strideXY,             // yStrides
+                                        strideSaveMeanInvStd, // save_mean Strides
+                                        strideSaveMeanInvStd, // save_inv_std Strides
+                                        {1, 2, 3},            // reduceDims
+                                        1e-4,
+                                        x_device_buf.GetDeviceBuffer(),
+                                        gamma_device_buf.GetDeviceBuffer(),
+                                        beta_device_buf.GetDeviceBuffer(),
+                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                        save_mean_device_buf.GetDeviceBuffer(),
+                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
+                                        nullptr,
+                                        nullptr,
+#endif
+                                        PassThrough{});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // Give the instance the scratch buffer size it reports before running it.
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t num_byte =
+                sizeof(XDataType) * N * H * W * C + sizeof(GammaDataType) * H * W * C +
+                sizeof(BetaDataType) * H * W * C + sizeof(YDataType) * N * H * W * C;
+#ifdef SAVE_MEAN_INV_STD
+            num_byte += sizeof(SaveMeanInvStdDataType) * N * 2;
+#endif
+            float gb_per_sec = num_byte / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+    // run the best instance
+    // Only index op_ptrs when a supported instance was recorded; otherwise best_op_id
+    // is still -1 and the lookup would be out of bounds.
+    if(found)
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer({N, H, W, C},         // lengths
+                                        strideXY,             // xStrides
+                                        strideGammaBeta,      // gammaStrides
+                                        strideGammaBeta,      // betaStrides
+                                        strideXY,             // yStrides
+                                        strideSaveMeanInvStd, // save_mean Strides
+                                        strideSaveMeanInvStd, // save_inv_std Strides
+                                        {1, 2, 3},            // reduceDims
+                                        1e-4,
+                                        x_device_buf.GetDeviceBuffer(),
+                                        gamma_device_buf.GetDeviceBuffer(),
+                                        beta_device_buf.GetDeviceBuffer(),
+                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                        save_mean_device_buf.GetDeviceBuffer(),
+                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
+                                        nullptr,
+                                        nullptr,
+#endif
+                                        PassThrough{});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    else
+    {
+        std::cout << "No instance supports this problem; nothing to run" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
@@ -100,18 +100,18 @@ int main()
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Wo * K);
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
-                                                                                 InLayout,
+                                                                                   InLayout,
-                                                                                 WeiLayout,
+                                                                                   WeiLayout,
-                                                                                 ck::Tuple<>,
+                                                                                   ck::Tuple<>,
-                                                                                 OutLayout,
+                                                                                   OutLayout,
-                                                                                 InDataType,
+                                                                                   InDataType,
-                                                                                 WeiDataType,
+                                                                                   WeiDataType,
-                                                                                 ck::Tuple<>,
+                                                                                   ck::Tuple<>,
-                                                                                 OutDataType,
+                                                                                   OutDataType,
-                                                                                 PassThrough,
+                                                                                   PassThrough,
-                                                                                 PassThrough,
+                                                                                   PassThrough,
-                                                                                 PassThrough>;
+                                                                                   PassThrough>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<

--- a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
@@ -71,18 +71,18 @@ int main()
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
-                                                                                 InLayout,
+                                                                                   InLayout,
-                                                                                 WeiLayout,
+                                                                                   WeiLayout,
-                                                                                 ck::Tuple<>,
+                                                                                   ck::Tuple<>,
-                                                                                 OutLayout,
+                                                                                   OutLayout,
-                                                                                 InDataType,
+                                                                                   InDataType,
-                                                                                 WeiDataType,
+                                                                                   WeiDataType,
-                                                                                 ck::Tuple<>,
+                                                                                   ck::Tuple<>,
-                                                                                 OutDataType,
+                                                                                   OutDataType,
-                                                                                 PassThrough,
+                                                                                   PassThrough,
-                                                                                 PassThrough,
+                                                                                   PassThrough,
-                                                                                 PassThrough>;
+                                                                                   PassThrough>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<

--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
@@ -80,7 +80,7 @@ int main(int argc, char* argv[])
    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
        NumDimSpatial,
        InLayout,
        WeiLayout,

--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
@@ -78,18 +78,18 @@ int main(int argc, char* argv[])
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp =
-        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
-                                                                    InLayout,
+                                                                      InLayout,
-                                                                    WeiLayout,
+                                                                      WeiLayout,
-                                                                    ck::Tuple<BiasLayout>,
+                                                                      ck::Tuple<BiasLayout>,
-                                                                    OutLayout,
+                                                                      OutLayout,
-                                                                    InDataType,
+                                                                      InDataType,
-                                                                    WeiDataType,
+                                                                      WeiDataType,
-                                                                    ck::Tuple<BiasDataType>,
+                                                                      ck::Tuple<BiasDataType>,
-                                                                    OutDataType,
+                                                                      OutDataType,
-                                                                    PassThrough,
+                                                                      PassThrough,
-                                                                    PassThrough,
+                                                                      PassThrough,
-                                                                    OutElementOp>;
+                                                                      OutElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
@@ -83,7 +83,7 @@ int main(int argc, char* argv[])
    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
        NumDimSpatial,
        InLayout,
        WeiLayout,

--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
@@ -79,18 +79,18 @@ int main(int argc, char* argv[])
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp =
-        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
-                                                                    InLayout,
+                                                                      InLayout,
-                                                                    WeiLayout,
+                                                                      WeiLayout,
-                                                                    ck::Tuple<BiasLayout>,
+                                                                      ck::Tuple<BiasLayout>,
-                                                                    OutLayout,
+                                                                      OutLayout,
-                                                                    InDataType,
+                                                                      InDataType,
-                                                                    WeiDataType,
+                                                                      WeiDataType,
-                                                                    ck::Tuple<BiasDataType>,
+                                                                      ck::Tuple<BiasDataType>,
-                                                                    OutDataType,
+                                                                      OutDataType,
-                                                                    PassThrough,
+                                                                      PassThrough,
-                                                                    PassThrough,
+                                                                      PassThrough,
-                                                                    OutElementOp>;
+                                                                      OutElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
@@ -76,19 +76,19 @@ int main(int argc, char* argv[])
    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
-    using DeviceOp =
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
-        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+        NumDimSpatial,
-                                                                    InLayout,
+        InLayout,
-                                                                    WeiLayout,
+        WeiLayout,
-                                                                    ck::Tuple<RequantScaleLayout>,
+        ck::Tuple<RequantScaleLayout>,
-                                                                    OutLayout,
+        OutLayout,
-                                                                    InDataType,
+        InDataType,
-                                                                    WeiDataType,
+        WeiDataType,
-                                                                    ck::Tuple<RequantScaleDataType>,
+        ck::Tuple<RequantScaleDataType>,
-                                                                    OutDataType,
+        OutDataType,
-                                                                    PassThrough,
+        PassThrough,
-                                                                    PassThrough,
+        PassThrough,
-                                                                    OutElementOp>;
+        OutElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
@@ -72,18 +72,18 @@ int main(int argc, char* argv[])
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
-                                                                                 InLayout,
+                                                                                   InLayout,
-                                                                                 WeiLayout,
+                                                                                   WeiLayout,
-                                                                                 ck::Tuple<>,
+                                                                                   ck::Tuple<>,
-                                                                                 OutLayout,
+                                                                                   OutLayout,
-                                                                                 InDataType,
+                                                                                   InDataType,
-                                                                                 WeiDataType,
+                                                                                   WeiDataType,
-                                                                                 ck::Tuple<>,
+                                                                                   ck::Tuple<>,
-                                                                                 OutDataType,
+                                                                                   OutDataType,
-                                                                                 PassThrough,
+                                                                                   PassThrough,
-                                                                                 PassThrough,
+                                                                                   PassThrough,
-                                                                                 OutElementOp>;
+                                                                                   OutElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

--- a/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt
+++ b/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt
-add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
-target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_operations)
--- a/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
+++ b/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
+add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
+target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_operations)
+add_executable(client_grouped_conv3d_bwd_data grouped_conv3d_bwd_data.cpp)
+target_link_libraries(client_grouped_conv3d_bwd_data PRIVATE composable_kernel::device_operations)
+add_executable(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp)
+target_link_libraries(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 PRIVATE composable_kernel::device_operations)
--- a/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
+++ b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp