Merge branch 'develop' into avgpool_bwd

c26c154e · rocking · GitHub · 0ab4fa0f · 1ee99dca · c26c154e
Unverified Commit c26c154e authored Jul 14, 2023 by rocking Committed by GitHub Jul 14, 2023
20 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
+repos:
+-   repo: local
+    hooks:
+    -   id: clang-format
+        name: clang-format
+        entry: clang-format-10 -i --style=file
+        language: system
+        types_or: [c++, inc]
+    -   id: copyright-year-checker
+        name: copyright-year-checker
+        entry: script/check_copyright_year.sh
+        verbose: false
+        language: script
+        types: [c++]
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,24 +12,27 @@ RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
 RUN chmod 1777 /tmp
 RUN apt-get update
 RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
-RUN if [ "$ROCMVERSION" != "5.6" ]; then \
+
+ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
+RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+
+RUN wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/focal/amdgpu-install_5.6.50600-1_all.deb  --no-check-certificate
+RUN apt-get update && \
+DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    ./amdgpu-install_5.6.50600-1_all.deb
+
+RUN if [ "$ROCMVERSION" != "5.7" ]; then \
        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
-        sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
-    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "" ]; then \
-         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
-         apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
-         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
-         amdgpu-install -y --usecase=rocm --no-dkms; \
-    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "rc3" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
-         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.6-20.04-1_all.deb" && \
-         apt update && apt-get install -y ./amdgpu-install-internal_5.6-20.04-1_all.deb && \
-         sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.6 rel-45  > /etc/apt/sources.list.d/rocm-build.list' && \
-         amdgpu-repo --amdgpu-build=1602498 && amdgpu-install -y --usecase=rocm --no-dkms; \
+        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
+        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
+    elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
+         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
+         apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
+         amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
    fi

-RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
-RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+RUN amdgpu-install -y --usecase=rocm --no-dkms

 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -11,6 +11,20 @@ def show_node_info() {
    """
 }

+def nthreads() {
+    def nproc = sh(returnStdout: true, script: 'nproc')
+    echo "Number of cores: ${nproc}"
+    def n = nproc.toInteger()
+    if (n > 32){
+        n /= 2
+    }
+    if (n > 64){
+        n = 64
+    }
+    echo "Number of threads used for building: ${n}"
+    return n
+}
+
 def runShell(String command){
    def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
    def output = readFile(file: "tmp.txt")
@@ -19,7 +33,7 @@ def runShell(String command){

 def getDockerImageName(){
    def img
-    if (params.ROCMVERSION != "5.6"){
+    if (params.ROCMVERSION != "5.7"){
       if (params.COMPILER_VERSION == "") {
           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
       }
@@ -219,7 +233,8 @@ def cmake_build(Map conf=[:]){
        """
    def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args}   .. ")
    // reduce parallelism when compiling, clang uses too much memory
-    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make  -j\$(( \$(nproc) / 2 )) ${config_targets}")
+    def nt = nthreads()
+    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make  -j${nt} ${config_targets}")
    def execute_cmd = conf.get("execute_cmd", "")

    def cmd = conf.get("cmd", """
@@ -461,7 +476,7 @@ def Build_CK(Map conf=[:]){
                        else{
                            echo "GPU is OK"
                        }
-                        if ( runShell('grep -n "gfx1030" clinfo.log') ){
+                        if ( runShell('grep -n "gfx1030" clinfo.log') || runShell('grep -n "gfx1101" clinfo.log') ){
                            navi_node = 1
                        }
                    }
@@ -482,7 +497,7 @@ def Build_CK(Map conf=[:]){
                        else{
                            echo "GPU is OK"
                        }
-                        if ( runShell('grep -n "gfx1030" clinfo.log') ){
+                        if ( runShell('grep -n "gfx1030" clinfo.log') || runShell('grep -n "gfx1101" clinfo.log') ){
                            navi_node = 1
                        }
                    }
@@ -493,8 +508,9 @@ def Build_CK(Map conf=[:]){
                {
                    cmake_build(conf)
                    dir("build"){
-                        //run tests and examples 	
-                        sh 'make -j\$(( \$(nproc) / 2 )) check'
+                        //run tests and examples
+                        def nt = nthreads()	
+                        sh 'make -j${nt} check'
                        if (navi_node == 0 ){
                            //we only need the ckProfiler to run the performance tests, so we pack and stash it
                            //do not stash profiler on Navi nodes
@@ -598,7 +614,7 @@ def process_results(Map conf=[:]){

 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
-                                              0 21 * * * % ROCMVERSION=5.5;COMPILER_VERSION=release;COMPILER_COMMIT=
+                                              0 21 * * * % ROCMVERSION=5.6;COMPILER_VERSION=;COMPILER_COMMIT=
                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""

 pipeline {
@@ -696,7 +712,7 @@ pipeline {
                    agent{ label rocmnode("gfx908 || gfx90a") }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -717,7 +733,7 @@ pipeline {
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                    }
                }
-                stage("Build CK and run Tests on Navi")
+                stage("Build CK and run Tests on Navi21")
                {
                    when {
                        beforeAgent true

--- a/README.md
+++ b/README.md
@@ -109,6 +109,24 @@ make install

 Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)

+## Contributing
+
+When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
+
+```bash
+sudo script/install_precommit.sh
+```
+
+This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
+
+If you need to uninstall hooks from the repository, you can do so by running the following command:
+
+```bash
+script/uninstall_precommit.sh
+```
+
+If for any reason, you need to temporarily disable precommit hooks, you can add the `--no-verify` option to the `git commit` command.
+
 ## Caveat
 ### Kernel Timing and Verification


--- a/client_example/11_grouped_conv_bwd_weight/common.hpp
+++ b/client_example/11_grouped_conv_bwd_weight/common.hpp
@@ -101,13 +101,15 @@ template <ck::index_t NumDimSpatial,
          typename WeiLayout,
          typename OutLayout>
 bool run_grouped_conv_bwd_weight(
-    ck::index_t G,
-    ck::index_t N,
-    ck::index_t K,
-    ck::index_t C,
+    const ck::index_t G,
+    const ck::index_t N,
+    const ck::index_t K,
+    const ck::index_t C,
    const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths,
    const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths,
    const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
+    const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
+    const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
    const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
    const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
    const std::array<ck::index_t, NumDimSpatial>& input_left_pads,
@@ -157,6 +159,8 @@ bool run_grouped_conv_bwd_weight(
                                                        input_spatial_lengths,
                                                        filter_spatial_lengths,
                                                        output_spatial_lengths,
+                                                        input_strides,
+                                                        output_strides,
                                                        conv_filter_strides,
                                                        conv_filter_dilations,
                                                        input_left_pads,
@@ -224,6 +228,8 @@ bool run_grouped_conv_bwd_weight(
                                                        input_spatial_lengths,
                                                        filter_spatial_lengths,
                                                        output_spatial_lengths,
+                                                        input_strides,
+                                                        output_strides,
                                                        conv_filter_strides,
                                                        conv_filter_dilations,
                                                        input_left_pads,

--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp
@@ -22,6 +22,15 @@ static constexpr ck::index_t C             = 192;
 static constexpr ck::index_t X             = 3;
 static constexpr ck::index_t Wi            = 28;
 static constexpr ck::index_t Wo            = 28;
+static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{X};
+static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, C, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1};

 int main()
 {
@@ -31,7 +40,19 @@ int main()
                                       OutDataType,
                                       InLayout,
                                       WeiLayout,
-                                       OutLayout>(G, N, K, C, {Wi}, {X}, {Wo}, {1}, {1}, {1}, {1})
+                                       OutLayout>(G,
+                                                  N,
+                                                  K,
+                                                  C,
+                                                  input_spatial_lengths,
+                                                  filter_spatial_lengths,
+                                                  output_spatial_lengths,
+                                                  input_strides,
+                                                  output_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
 }
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp
@@ -25,6 +25,17 @@ static constexpr ck::index_t Hi            = 28;
 static constexpr ck::index_t Wi            = 28;
 static constexpr ck::index_t Ho            = 28;
 static constexpr ck::index_t Wo            = 28;
+static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
+    N * Hi * Wi * C, Hi* Wi* C, Wi* C, C, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
+    N * Ho * Wo * K, Ho* Wo* K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

 int main()
 {
@@ -34,8 +45,19 @@ int main()
                                       OutDataType,
                                       InLayout,
                                       WeiLayout,
-                                       OutLayout>(
-               G, N, K, C, {Hi, Wi}, {Y, X}, {Ho, Wo}, {1, 1}, {1, 1}, {1, 1}, {1, 1})
+                                       OutLayout>(G,
+                                                  N,
+                                                  K,
+                                                  C,
+                                                  input_spatial_lengths,
+                                                  filter_spatial_lengths,
+                                                  output_spatial_lengths,
+                                                  input_strides,
+                                                  output_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
 }
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp
@@ -28,6 +28,17 @@ static constexpr ck::index_t Wi            = 3;
 static constexpr ck::index_t Do            = 28;
 static constexpr ck::index_t Ho            = 28;
 static constexpr ck::index_t Wo            = 3;
+static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
+    N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
+    N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};

 int main()
 {
@@ -41,13 +52,15 @@ int main()
                                                  N,
                                                  K,
                                                  C,
-                                                  {Di, Hi, Wi},
-                                                  {Z, Y, X},
-                                                  {Do, Ho, Wo},
-                                                  {1, 1, 1},
-                                                  {1, 1, 1},
-                                                  {1, 1, 1},
-                                                  {1, 1, 1})
+                                                  input_spatial_lengths,
+                                                  filter_spatial_lengths,
+                                                  output_spatial_lengths,
+                                                  input_strides,
+                                                  output_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
 }
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp
@@ -28,6 +28,17 @@ static constexpr ck::index_t Wi            = 3;
 static constexpr ck::index_t Do            = 28;
 static constexpr ck::index_t Ho            = 28;
 static constexpr ck::index_t Wo            = 3;
+static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
+    N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
+    N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};

 int main()
 {
@@ -37,17 +48,20 @@ int main()
                                       OutDataType,
                                       InLayout,
                                       WeiLayout,
-                                       OutLayout>(G,
-                                                  N,
-                                                  K,
-                                                  C,
-                                                  {Di, Hi, Wi},
-                                                  {Z, Y, X},
-                                                  {Do, Ho, Wo},
-                                                  {1, 1, 1},
-                                                  {1, 1, 1},
-                                                  {1, 1, 1},
-                                                  {1, 1, 1})
+                                       OutLayout>(
+               G,
+               N,
+               K,
+               C,
+               {Di, Hi, Wi},
+               {Z, Y, X},
+               {Do, Ho, Wo},
+               {N * Di * Hi * Wi * C, Di * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1},
+               {N * Do * Ho * Wo * K, Do * Ho * Wo * K, Ho * Wo * K, Wo * K, K, 1},
+               {1, 1, 1},
+               {1, 1, 1},
+               {1, 1, 1},
+               {1, 1, 1})
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
 }
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -44,3 +44,7 @@ if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS
  add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
 endif()

+if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942")
+  add_example_executable(example_gemm_xdl_f8 gemm_xdl_f8.cpp)
+  add_dependencies(example_gemm_xdl example_gemm_xdl_f8)
+endif()
--- a/example/01_gemm/gemm_xdl_f8.cpp
+++ b/example/01_gemm/gemm_xdl_f8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+
+using ADataType        = ck::f8_t;
+using BDataType        = ck::f8_t;
+using CDataType        = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::f8_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
+// ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+// ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+
+#include "run_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
@@ -3,7 +3,7 @@

 #include "common.hpp"

-#include "ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp"

 using ADataType        = ck::half_t;
 using BDataType        = ck::half_t;

--- a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
@@ -3,7 +3,7 @@

 #include "convnd_fwd_dl_common.hpp"

-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"


--- a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
@@ -3,7 +3,7 @@

 #include "convnd_fwd_dl_common.hpp"

-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"


--- a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
@@ -3,7 +3,7 @@

 #include "convnd_fwd_dl_common.hpp"

-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"


--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
@@ -3,7 +3,7 @@

 #include "common.hpp"

-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"

 using InDataType = BF16;
 // bf16 kernel use fp32 atomic add to accumulate Weight tensor into global memory
@@ -17,8 +17,20 @@ using OutElementOp = PassThrough;

 template <ck::index_t NDimSpatial>
 using DeviceConvBwdWeightInstance =
-    ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<
-        NDimSpatial,          // NDimSpatial
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
+        NDimSpatial,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                      ck::tensor_layout::convolution::GNHWC,
+                                      ck::tensor_layout::convolution::GNDHWC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                      ck::tensor_layout::convolution::GKYXC,
+                                      ck::tensor_layout::convolution::GKZYXC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                      ck::tensor_layout::convolution::GNHWK,
+                                      ck::tensor_layout::convolution::GNDHWK>>,
        InDataType,           // InDataType
        WeiDataType,          // WeiDataType
        OutDataType,          // OutDataType

--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
@@ -3,7 +3,7 @@

 #include "common.hpp"

-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"

 using InDataType  = F16;
 using WeiDataType = F16;
@@ -16,8 +16,20 @@ using OutElementOp = PassThrough;

 template <ck::index_t NDimSpatial>
 using DeviceConvBwdWeightInstance =
-    ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<
-        NDimSpatial,          // NDimSpatial
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
+        NDimSpatial,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                      ck::tensor_layout::convolution::GNHWC,
+                                      ck::tensor_layout::convolution::GNDHWC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                      ck::tensor_layout::convolution::GKYXC,
+                                      ck::tensor_layout::convolution::GKZYXC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                      ck::tensor_layout::convolution::GNHWK,
+                                      ck::tensor_layout::convolution::GNDHWK>>,
        InDataType,           // InDataType
        WeiDataType,          // WeiDataType
        OutDataType,          // OutDataType

--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -75,6 +75,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> input_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> output_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
@@ -85,6 +87,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
    range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
    range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
    range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
+    range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides));
+    range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides));
    range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
    range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
@@ -103,6 +107,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
                                      input_spatial_lengths,
                                      filter_spatial_lengths,
                                      output_spatial_lengths,
+                                      input_strides,
+                                      output_strides,
                                      conv_filter_strides,
                                      conv_filter_dilations,
                                      input_left_pads,

--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -17,7 +17,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"

--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -17,7 +17,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"