Commit ad2fddf4 authored by Jing Zhang

Merge remote-tracking branch 'origin/develop' into grouped_gemm_args_simplify

parents fa649421 1ee99dca
repos:
- repo: local
  hooks:
  - id: clang-format
    name: clang-format
    entry: clang-format-10 -i --style=file
    language: system
    types_or: [c++, inc]
  - id: copyright-year-checker
    name: copyright-year-checker
    entry: script/check_copyright_year.sh
    verbose: false
    language: script
    types: [c++]
@@ -11,6 +11,20 @@ def show_node_info() {
"""
}
def nthreads() {
    def nproc = sh(returnStdout: true, script: 'nproc').trim()
    echo "Number of cores: ${nproc}"
    def n = nproc.toInteger()
    // Compilation is memory-bound: above 32 cores, use only half of them.
    if (n > 32){
        n = n.intdiv(2)
    }
    // Never use more than 64 build threads, however many cores the node has.
    if (n > 64){
        n = 64
    }
    echo "Number of threads used for building: ${n}"
    return n
}
def runShell(String command){
def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
def output = readFile(file: "tmp.txt")
@@ -219,7 +233,8 @@ def cmake_build(Map conf=[:]){
"""
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
// reduce parallelism when compiling, clang uses too much memory
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
def nt = nthreads()
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j${nt} ${config_targets}")
def execute_cmd = conf.get("execute_cmd", "")
def cmd = conf.get("cmd", """
@@ -461,7 +476,7 @@ def Build_CK(Map conf=[:]){
else{
echo "GPU is OK"
}
if ( runShell('grep -n "gfx1030" clinfo.log') ){
if ( runShell('grep -n "gfx1030" clinfo.log') || runShell('grep -n "gfx1101" clinfo.log') ){
navi_node = 1
}
}
@@ -482,7 +497,7 @@ def Build_CK(Map conf=[:]){
else{
echo "GPU is OK"
}
if ( runShell('grep -n "gfx1030" clinfo.log') ){
if ( runShell('grep -n "gfx1030" clinfo.log') || runShell('grep -n "gfx1101" clinfo.log') ){
navi_node = 1
}
}
@@ -493,8 +508,9 @@ def Build_CK(Map conf=[:]){
{
cmake_build(conf)
dir("build"){
//run tests and examples
sh 'make -j\$(( \$(nproc) / 2 )) check'
//run tests and examples
def nt = nthreads()
sh "make -j${nt} check"
if (navi_node == 0 ){
//we only need the ckProfiler to run the performance tests, so we pack and stash it
//do not stash profiler on Navi nodes
@@ -717,7 +733,7 @@ pipeline {
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
stage("Build CK and run Tests on Navi")
stage("Build CK and run Tests on Navi21")
{
when {
beforeAgent true
......
@@ -109,6 +109,24 @@ make install
Instructions for using CK as a pre-built kernel library are under [client_example](/client_example).
## Contributing
When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
```bash
sudo script/install_precommit.sh
```
This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
If you need to uninstall the hooks from the repository, run the following command:
```bash
script/uninstall_precommit.sh
```
If, for any reason, you need to temporarily bypass the pre-commit hooks, add the `--no-verify` option to the `git commit` command.
## Caveat
### Kernel Timing and Verification
......
@@ -101,13 +101,15 @@ template <ck::index_t NumDimSpatial,
typename WeiLayout,
typename OutLayout>
bool run_grouped_conv_bwd_weight(
ck::index_t G,
ck::index_t N,
ck::index_t K,
ck::index_t C,
const ck::index_t G,
const ck::index_t N,
const ck::index_t K,
const ck::index_t C,
const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NumDimSpatial>& input_left_pads,
@@ -157,6 +159,8 @@ bool run_grouped_conv_bwd_weight(
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
@@ -224,6 +228,8 @@ bool run_grouped_conv_bwd_weight(
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
......
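The hunks above extend `run_grouped_conv_bwd_weight` with explicit `input_strides` and `output_strides` arguments of length `NumDimSpatial + 3`, ordered as (G, N, spatial..., C). For the packed layouts used in the examples below, these strides follow directly from the dimension lengths. Here is a minimal sketch of that computation, assuming a hypothetical `packed_gn_spatial_c_strides` helper (not part of CK):

```cpp
#include <array>
#include <cstddef>

// Hypothetical helper (not part of CK): packed strides for a
// (G, N, spatial..., C) tensor, reproducing the literal initializers
// in the examples below, e.g. {N * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1}.
template <std::size_t NumDimSpatial>
constexpr std::array<int, NumDimSpatial + 3>
packed_gn_spatial_c_strides(int N, int C, std::array<int, NumDimSpatial> spatial)
{
    static_assert(NumDimSpatial >= 1, "need at least one spatial dimension");
    // Index map: 0 = G, 1 = N, 2 .. NumDimSpatial + 1 = spatial dims, last = C.
    std::array<int, NumDimSpatial + 3> strides{};
    strides[NumDimSpatial + 2] = 1; // C (or K) is the fastest-varying dimension
    strides[NumDimSpatial + 1] = C; // innermost spatial dimension steps over C
    for(std::size_t i = NumDimSpatial; i >= 2; --i)
        strides[i] = strides[i + 1] * spatial[i - 1]; // fold lengths outward
    strides[1] = strides[2] * spatial[0]; // N stride spans one full image
    strides[0] = strides[1] * N;          // G stride spans one full batch
    return strides;
}
```

With this helper, the 2D example's `input_strides` below could equivalently be written as `packed_gn_spatial_c_strides<2>(N, C, {Hi, Wi})`.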
@@ -22,6 +22,15 @@ static constexpr ck::index_t C = 192;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi * C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo * K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1};
int main()
{
@@ -31,7 +40,19 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(G, N, K, C, {Wi}, {X}, {Wo}, {1}, {1}, {1}, {1})
OutLayout>(G,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
@@ -25,6 +25,17 @@ static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
    N * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
    N * Ho * Wo * K, Ho * Wo * K, Wo * K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
int main()
{
@@ -34,8 +45,19 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(
G, N, K, C, {Hi, Wi}, {Y, X}, {Ho, Wo}, {1, 1}, {1, 1}, {1, 1}, {1, 1})
OutLayout>(G,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
@@ -28,6 +28,17 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
    N * Di * Hi * Wi * C, Di * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
    N * Do * Ho * Wo * K, Do * Ho * Wo * K, Ho * Wo * K, Wo * K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
int main()
{
@@ -41,13 +52,15 @@ int main()
N,
K,
C,
{Di, Hi, Wi},
{Z, Y, X},
{Do, Ho, Wo},
{1, 1, 1},
{1, 1, 1},
{1, 1, 1},
{1, 1, 1})
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
@@ -28,6 +28,17 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
    N * Di * Hi * Wi * C, Di * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
    N * Do * Ho * Wo * K, Do * Ho * Wo * K, Ho * Wo * K, Wo * K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
int main()
{
@@ -37,17 +48,20 @@ int main()
OutDataType,
InLayout,
WeiLayout,
OutLayout>(G,
N,
K,
C,
{Di, Hi, Wi},
{Z, Y, X},
{Do, Ho, Wo},
{1, 1, 1},
{1, 1, 1},
{1, 1, 1},
{1, 1, 1})
OutLayout>(G,
           N,
           K,
           C,
           input_spatial_lengths,
           filter_spatial_lengths,
           output_spatial_lengths,
           input_strides,
           output_strides,
           conv_filter_strides,
           conv_filter_dilations,
           input_left_pads,
           input_right_pads)
? EXIT_SUCCESS
: EXIT_FAILURE;
}
@@ -44,3 +44,7 @@ if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS
add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
endif()
if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942")
add_example_executable(example_gemm_xdl_f8 gemm_xdl_f8.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_f8)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using ADataType = ck::f8_t;
using BDataType = ck::f8_t;
using CDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = ck::f8_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
#include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
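For context, `run_gemm_example.inc` drives the instance defined above through CK's usual device-op flow. The sketch below condenses that flow; it assumes device-resident buffers and mirrors the `MakeArgument`/`MakeInvoker` pattern seen in other CK GEMM examples rather than quoting the `.inc` file itself:

```cpp
#include <iostream>

// Condensed sketch (not the literal contents of run_gemm_example.inc) of how
// DeviceGemmInstance is typically driven. p_a/p_b/p_c are assumed to point at
// device allocations of the right size.
bool run_f8_gemm_sketch(const ADataType* p_a, const BDataType* p_b, CDataType* p_c,
                        ck::index_t M, ck::index_t N, ck::index_t K,
                        ck::index_t StrideA, ck::index_t StrideB, ck::index_t StrideC)
{
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(p_a, p_b, p_c,
                                      M, N, K,
                                      StrideA, StrideB, StrideC,
                                      AElementOp{}, BElementOp{}, CElementOp{});

    if(!gemm.IsSupportedArgument(argument))
    {
        std::cerr << "The selected instance does not support this GEMM problem"
                  << std::endl;
        return false;
    }

    // Run with timing enabled and report the average kernel time.
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
    std::cout << "Perf: " << ave_time << " ms" << std::endl;
    return true;
}
```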
@@ -3,7 +3,7 @@
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
......
@@ -3,7 +3,7 @@
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......
@@ -3,7 +3,7 @@
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......
@@ -3,7 +3,7 @@
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......
@@ -3,7 +3,7 @@
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
using InDataType = BF16;
// bf16 kernels use fp32 atomic add to accumulate the weight tensor in global memory
@@ -17,8 +17,20 @@ using OutElementOp = PassThrough;
template <ck::index_t NDimSpatial>
using DeviceConvBwdWeightInstance =
ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<
NDimSpatial, // NDimSpatial
ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
NDimSpatial,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GNWC,
ck::tensor_layout::convolution::GNHWC,
ck::tensor_layout::convolution::GNDHWC>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GKXC,
ck::tensor_layout::convolution::GKYXC,
ck::tensor_layout::convolution::GKZYXC>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GNWK,
ck::tensor_layout::convolution::GNHWK,
ck::tensor_layout::convolution::GNDHWK>>,
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
......
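Replacing the GNWC-specific instance with `DeviceGroupedConvBwdWeight_Xdl_CShuffle` moves the layout choice into template parameters: `ck::tuple_element_t` indexes a `ck::Tuple` of layout tags by `NDimSpatial - 1`. Here is a minimal sketch of the same selection idiom using the standard library's analogous `std::tuple_element_t` (the layout tags are stand-in empty structs, not CK's):

```cpp
#include <tuple>
#include <type_traits>

// Stand-in layout tags; the real ones live in ck::tensor_layout::convolution.
struct GNWC {};   // 1D input layout
struct GNHWC {};  // 2D input layout
struct GNDHWC {}; // 3D input layout

// Pick the input layout from the number of spatial dimensions,
// mirroring the ck::tuple_element_t / ck::Tuple idiom in the diff above.
template <int NDimSpatial>
using InLayout =
    std::tuple_element_t<NDimSpatial - 1, std::tuple<GNWC, GNHWC, GNDHWC>>;

static_assert(std::is_same_v<InLayout<1>, GNWC>);
static_assert(std::is_same_v<InLayout<2>, GNHWC>);
static_assert(std::is_same_v<InLayout<3>, GNDHWC>);
```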
@@ -3,7 +3,7 @@
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
using InDataType = F16;
using WeiDataType = F16;
@@ -16,8 +16,20 @@ using OutElementOp = PassThrough;
template <ck::index_t NDimSpatial>
using DeviceConvBwdWeightInstance =
ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<
NDimSpatial, // NDimSpatial
ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
NDimSpatial,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GNWC,
ck::tensor_layout::convolution::GNHWC,
ck::tensor_layout::convolution::GNDHWC>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GKXC,
ck::tensor_layout::convolution::GKYXC,
ck::tensor_layout::convolution::GKZYXC>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GNWK,
ck::tensor_layout::convolution::GNHWK,
ck::tensor_layout::convolution::GNDHWK>>,
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
......
@@ -75,6 +75,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_strides{};
std::array<ck::index_t, NDimSpatial + 3> output_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
@@ -85,6 +87,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides));
range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides));
range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
range_copy(conv_param.input_left_pads_, begin(input_left_pads));
@@ -103,6 +107,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
......
@@ -17,7 +17,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
......
@@ -17,7 +17,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
......
add_example_executable(example_batchnorm_forward_training batchnorm_forward_training_nhwc.cpp)
add_example_executable(example_batchnorm_forward_training_obsolete batchnorm_forward_training_nhwc_obsolete.cpp)
add_example_executable(example_batchnorm_forward_inferring batchnorm_forward_inferring_nhwc.cpp)
add_example_executable(example_batchnorm_backward batchnorm_backward_nhwc.cpp)