Merge branch 'develop' into lib_gemm_softmax_gemm_type

185af92b · ltqin · 5f4a0f73 · 8bb2bb4a · 185af92b · 185af92b
Commit 185af92b authored Apr 26, 2023 by ltqin
20 changed files
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/docs/.sphinx" # Location of package manifests
+    open-pull-requests-limit: 10
+    schedule:
+      interval: "daily"
--- a/Dockerfile
+++ b/Dockerfile
 FROM ubuntu:20.04

-ARG ROCMVERSION=5.3
-ARG compiler_version="release"
+ARG ROCMVERSION=5.6
+ARG compiler_version=""
 ARG compiler_commit=""

 RUN set -xe

 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
-RUN useradd -rm -d /home/manitera -s /bin/bash -u 1002 manitera
 # Add rocm repository
 RUN apt-get update
-RUN apt-get install -y wget gnupg
-RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
+RUN apt-get install -y wget gnupg curl
+# Register the ROCm package source: the public repo.radeon.com apt repository for every ROCm version except 5.6, which is installed with amdgpu-repo/amdgpu-install using the builds pinned below.
+RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6" ]; then \
+        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+        sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
+    else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
+         apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
+         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
+         DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
+    fi
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
+RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg

 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    apt-utils \
    build-essential \
    ccache \
-    cmake-data \
    cmake \
-    curl \
    git \
    hip-rocclr \
    jq \
@@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    rocm-device-libs \
    rocm-cmake \
    vim \
+    nano \
    zlib1g-dev \
    openssh-server \
    clang-format-10 \
@@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

+#Install latest version of cmake
+RUN apt purge --auto-remove -y cmake
+RUN apt update
+RUN apt install -y software-properties-common lsb-release
+RUN apt clean all
+RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
+RUN apt install -y kitware-archive-keyring
+RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
+RUN apt install -y cmake
+
 # Setup ubsan environment to printstacktrace
 RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
 ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit
 RUN sh -c "echo compiler version = '$compiler_version'"
 RUN sh -c "echo compiler commit = '$compiler_commit'"

-RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \
-        sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\    chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \
-        sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\    chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \
-    fi
-
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
        git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
        cd llvm-project && mkdir build && cd build && \
        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
@@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com
    else echo "using the release compiler"; \
    fi

-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
        git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -19,6 +19,11 @@ def runShell(String command){

 def getDockerImageName(){
    def img
+    if (params.ROCMVERSION != "5.5" && params.ROCMVERSION != "5.6"){
+       if (params.COMPILER_VERSION == "") {
+           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
+       }
+       else{
          if (params.COMPILER_COMMIT == ""){
             img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
          }
@@ -26,6 +31,22 @@ def getDockerImageName(){
             def commit = "${params.COMPILER_COMMIT}"[0..6]
             img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
          }
+       }
+    }
+    else{
+       if (params.COMPILER_VERSION == "") {
+           img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+       }
+       else{
+          if (params.COMPILER_COMMIT == ""){
+             img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+          }
+          else{
+             def commit = "${params.COMPILER_COMMIT}"[0..6]
+             img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+          }
+       }
+    }
    return img
 }

@@ -49,11 +70,11 @@ def build_compiler(){
        compiler = '/opt/rocm/bin/hipcc'
    }
    else{
-        if (params.COMPILER_VERSION == "release"){
-            compiler = "/opt/rocm/llvm/bin/clang++"
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
+            compiler = "/llvm-project/build/bin/clang++"
        }
        else{
-            compiler = "/llvm-project/build/bin/clang++"
+            compiler = "/opt/rocm/llvm/bin/clang++"
        }        
    }
    return compiler
@@ -232,7 +253,7 @@ def buildHipClangJob(Map conf=[:]){
            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
        }
        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION != "release"){
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
        }

@@ -287,7 +308,7 @@ def runCKProfiler(Map conf=[:]){
            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
        }
        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION != "release"){
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
        }

@@ -420,7 +441,7 @@ def Build_CK(Map conf=[:]){
            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
        }
        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION != "release"){
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
        }

@@ -576,7 +597,7 @@ def process_results(Map conf=[:]){

 //launch develop branch daily at 23:00 UT in FULL_QA mode, at 21:00 UT with the release compiler on ROCm 5.4.3, and at 19:00 UT with the latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
-                                              0 21 * * * % COMPILER_VERSION=release;COMPILER_COMMIT=
+                                              0 21 * * * % ROCMVERSION=5.4.3;COMPILER_VERSION=release;COMPILER_COMMIT=
                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""

 pipeline {
@@ -594,16 +615,16 @@ pipeline {
            description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
        string(
            name: 'ROCMVERSION', 
-            defaultValue: '5.4.3', 
-            description: 'Specify which ROCM version to use: 5.4.3 (default).')
+            defaultValue: '5.6', 
+            description: 'Specify which ROCM version to use: 5.6 (default).')
        string(
            name: 'COMPILER_VERSION', 
-            defaultValue: 'amd-stg-open', 
-            description: 'Specify which version of compiler to use: ck-9110, release, or amd-stg-open (default).')
+            defaultValue: '', 
+            description: 'Specify which version of compiler to use: release, amd-stg-open, or leave blank (default).')
        string(
            name: 'COMPILER_COMMIT', 
-            defaultValue: '5541927df00eabd6a110180170eca7785d436ee3', 
-            description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
+            defaultValue: '', 
+            description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
        string(
            name: 'BUILD_COMPILER', 
            defaultValue: 'hipcc', 

--- a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
@@ -17,22 +17,22 @@ using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 using OutDataType = ck::half_t;

-using InLayout    = ck::tensor_layout::convolution::GNHWC;
+using InLayout    = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout   = ck::tensor_layout::convolution::GKYXC;
-using OutLayout   = ck::tensor_layout::convolution::GNHWK;
+using OutLayout   = ck::tensor_layout::convolution::NHWGK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr ck::index_t NumDimSpatial = 2;
 static constexpr ck::index_t G             = 32;
-static constexpr ck::index_t N             = 256;
-static constexpr ck::index_t K             = 192;
-static constexpr ck::index_t C             = 192;
-static constexpr ck::index_t Y             = 3;
-static constexpr ck::index_t X             = 3;
-static constexpr ck::index_t Hi            = 28;
-static constexpr ck::index_t Wi            = 28;
-static constexpr ck::index_t Ho            = 28;
-static constexpr ck::index_t Wo            = 28;
+static constexpr ck::index_t N             = 256; // batch size
+static constexpr ck::index_t K             = 64;  // output channel
+static constexpr ck::index_t C             = 32;  // input channel (per group)
+static constexpr ck::index_t Y             = 3;   // filter H
+static constexpr ck::index_t X             = 3;   // filter W
+static constexpr ck::index_t Hi            = 28;  // input H
+static constexpr ck::index_t Wi            = 28;  // input W
+static constexpr ck::index_t Ho            = 28;  // output H
+static constexpr ck::index_t Wo            = 28;  // output W

 struct SimpleDeviceMem
 {
@@ -52,50 +52,24 @@ struct SimpleDeviceMem

 int main()
 {
-    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
-    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
-
-    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
-    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
-
-    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
-    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
-
-    std::partial_sum(rbegin(in_lengths),
-                     std::prev(rend(in_lengths)),
-                     std::next(rbegin(in_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(wei_lengths),
-                     std::prev(rend(wei_lengths)),
-                     std::next(rbegin(wei_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(out_lengths),
-                     std::prev(rend(out_lengths)),
-                     std::next(rbegin(out_strides)),
-                     std::multiplies<>{});
-
-    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
-    std::rotate(
-        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
-    std::rotate(
-        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
-    std::rotate(
-        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
-    std::rotate(
-        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
-    std::rotate(
-        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
-    std::rotate(
-        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
+    // The tensors live in memory as NHWGC/GKYXC/NHWGK (x, weight, y).
+    // However, CK's API only accepts lengths and strides in the order GNCHW/GKCYX/GNKHW,
+    // hence the strides below are permuted; the output (y) strides are built from K.
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 5> wei_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> wei_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,
@@ -155,9 +129,9 @@ int main()
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
-            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
                                    sizeof(WeiDataType) * G * K * Y * X * C +
-                                    sizeof(OutDataType) * G * N * Ho * Wo * K;
+                                    sizeof(OutDataType) * N * Ho * Wo * G * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;

--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
@@ -17,20 +17,20 @@ using BiasDataType         = int32_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;

-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout         = ck::tensor_layout::convolution::G_K;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = ck::tensor_operation::element_wise::Relu;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;  // batch size
-static constexpr ck::index_t K             = 64;  // output channel
-static constexpr ck::index_t C             = 192; // input channel
+static constexpr ck::index_t K             = 32; // output channel
+static constexpr ck::index_t C             = 64; // input channel (per group)
 static constexpr ck::index_t Y             = 3;  // filter H
 static constexpr ck::index_t X             = 3;  // filter W
 static constexpr ck::index_t Hi            = 71; // input H
@@ -55,8 +55,11 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
+    // However, CK's API only accepts lengths and strides in the order GNCHW/GKCYX/GNKHW.
+    // Hence, we need to adjust the order of the strides.
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -64,17 +67,18 @@ int main(int argc, char* argv[])
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K}; // NHWGK
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
@@ -16,19 +16,19 @@ using WeiDataType  = int8_t;
 using BiasDataType = int32_t;
 using OutDataType  = int8_t;

-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout   = ck::tensor_layout::convolution::G_K;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = ck::tensor_operation::element_wise::Relu;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -55,23 +55,27 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
+    // However, CK's API only accepts lengths and strides in the order GNCHW/GKCYX/GNKHW.
+    // Hence, we need to adjust the order of the strides.
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K}; // NHWGK
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
@@ -17,21 +17,21 @@ using BiasDataType         = int32_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;

-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout         = ck::tensor_layout::convolution::G_K;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = ck::tensor_operation::element_wise::TanH;
 using OutElementOp =
    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -58,8 +58,11 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
+    // However, CK's API only accepts lengths and strides in the order GNCHW/GKCYX/GNKHW.
+    // Hence, we need to adjust the order of the strides.
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -67,17 +70,18 @@ int main(int argc, char* argv[])
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K}; // NHWGK
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
@@ -16,19 +16,19 @@ using WeiDataType  = int8_t;
 using BiasDataType = int32_t;
 using OutDataType  = int8_t;

-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout   = ck::tensor_layout::convolution::G_K;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = ck::tensor_operation::element_wise::TanH;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -56,23 +56,27 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
+    // However, CK's API only accepts lengths and strides in the order GNCHW/GKCYX/GNKHW.
+    // Hence, we need to adjust the order of the strides.
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K}; // NHWGK
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
@@ -16,19 +16,19 @@ using WeiDataType          = int8_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;

-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = PassThrough;
 using OutElementOp       = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;  // batch size
-static constexpr ck::index_t K             = 64;  // output channel
-static constexpr ck::index_t C             = 192; // input channel
+static constexpr ck::index_t K             = 32; // output channel (per group)
+static constexpr ck::index_t C             = 64; // input channel (per group)
 static constexpr ck::index_t Y             = 3;  // filter H
 static constexpr ck::index_t X             = 3;  // filter W
 static constexpr ck::index_t Hi            = 71; // input H
@@ -54,23 +54,27 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accepts lengths and strides in the order of GNCHW/GKCYX/GNKHW
+    // Hence, we need to adjust the order of the strides (the output strides are built from K)
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
@@ -15,18 +15,18 @@ using InDataType  = int8_t;
 using WeiDataType = int8_t;
 using OutDataType = int8_t;

-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel (per group)
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -53,20 +53,24 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accepts lengths and strides in the order of GNCHW/GKCYX/GNKHW
+    // Hence, we need to adjust the order of the strides (the output strides are built from K)
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,

--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -13,8 +13,8 @@
 #include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"

 using XDataType       = ck::half_t;
-using GammaDataType   = ck::half_t;
-using BetaDataType    = ck::half_t;
+using GammaDataType   = float;
+using BetaDataType    = float;
 using YDataType       = ck::half_t;
 using ComputeDataType = float;
 using Swish           = ck::tensor_operation::element_wise::Swish;

--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -21,6 +21,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
     -Wno-comma
     -Wno-old-style-cast
     -Wno-deprecated
+     -Wno-unsafe-buffer-usage
 )
 message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")


--- a/docs/.sphinx/requirements.in
+++ b/docs/.sphinx/requirements.in
-git+https://github.com/RadeonOpenCompute/rocm-docs-core.git
+rocm-docs-core==0.2.0
 sphinxcontrib-bibtex==2.5.0
--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -2,9 +2,9 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile requirements.in
+#    pip-compile .sphinx/requirements.in
 #
-accessible-pygments==0.0.4
+accessible-pygments==0.0.3
    # via pydata-sphinx-theme
 alabaster==0.7.13
    # via sphinx
@@ -20,7 +20,7 @@ babel==2.12.1
    #   sphinx
 backcall==0.2.0
    # via ipython
-beautifulsoup4==4.12.0
+beautifulsoup4==4.11.2
    # via pydata-sphinx-theme
 breathe==4.34.0
    # via rocm-docs-core
@@ -34,7 +34,7 @@ click==8.1.3
    # via
    #   jupyter-cache
    #   sphinx-external-toc
-comm==0.1.3
+comm==0.1.2
    # via ipykernel
 debugpy==1.6.6
    # via ipykernel
@@ -65,13 +65,11 @@ idna==3.4
    # via requests
 imagesize==1.4.1
    # via sphinx
-importlib-metadata==6.1.0
+importlib-metadata==6.0.0
    # via
    #   jupyter-cache
    #   myst-nb
-importlib-resources==5.10.4
-    # via rocm-docs-core
-ipykernel==6.22.0
+ipykernel==6.21.3
    # via myst-nb
 ipython==8.11.0
    # via
@@ -87,7 +85,7 @@ jsonschema==4.17.3
    # via nbformat
 jupyter-cache==0.5.0
    # via myst-nb
-jupyter-client==8.1.0
+jupyter-client==8.0.3
    # via
    #   ipykernel
    #   nbclient
@@ -124,7 +122,7 @@ nbclient==0.5.13
    # via
    #   jupyter-cache
    #   myst-nb
-nbformat==5.8.0
+nbformat==5.7.3
    # via
    #   jupyter-cache
    #   myst-nb
@@ -187,7 +185,7 @@ pyyaml==6.0
    #   myst-parser
    #   pybtex
    #   sphinx-external-toc
-pyzmq==25.0.2
+pyzmq==25.0.1
    # via
    #   ipykernel
    #   jupyter-client
@@ -195,8 +193,8 @@ requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core @ git+https://github.com/RadeonOpenCompute/rocm-docs-core.git
-    # via -r requirements.in
+rocm-docs-core==0.2.0
+    # via -r .sphinx/requirements.in
 six==1.16.0
    # via
    #   asttokens
@@ -235,9 +233,7 @@ sphinx-notfound-page==0.8.3
 sphinxcontrib-applehelp==1.0.4
    # via sphinx
 sphinxcontrib-bibtex==2.5.0
-    # via
-    #   -r requirements.in
-    #   rocm-docs-core
+    # via -r .sphinx/requirements.in
 sphinxcontrib-devhelp==1.0.2
    # via sphinx
 sphinxcontrib-htmlhelp==2.0.1
@@ -248,7 +244,7 @@ sphinxcontrib-qthelp==1.0.3
    # via sphinx
 sphinxcontrib-serializinghtml==1.1.5
    # via sphinx
-sqlalchemy==1.4.47
+sqlalchemy==1.4.46
    # via jupyter-cache
 stack-data==0.6.2
    # via ipython

--- a/example/15_grouped_gemm/CMakeLists.txt
+++ b/example/15_grouped_gemm/CMakeLists.txt
@@ -5,6 +5,7 @@ add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
 add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp)
 add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp)
 add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp)
+add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp)


 add_dependencies(example_grouped_gemm_xdl
@@ -12,7 +13,8 @@ add_dependencies(example_grouped_gemm_xdl
                 example_grouped_gemm_xdl_fp16
                 example_grouped_gemm_xdl_bfp16
                 example_grouped_gemm_xdl_int8
-                 example_grouped_gemm_multiple_d_dl_fp16)
+                 example_grouped_gemm_multiple_d_dl_fp16
+                 example_grouped_gemm_xdl_splitk_fp16)

 if(USE_BITINT_EXTENSION_INT4)
  add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)

--- a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdlSplitKCShuffle
+    // clang-format off
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+// clang-format on
+
+#include "run_grouped_gemm_example.inc"
+
+// Example entry point: describes 16 GEMM groups (per-group M listed below, N = 768,
+// K = 4608 for every group), reads the three run options from the command line and
+// returns 0 when run_grouped_gemm() reports true.
+int main(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    problem_size.group_count = 16;
+
+    // One M per group; the list length matches group_count.
+    problem_size.Ms = {
+        167, 183, 177, 181, 153, 139, 156, 173, 163, 150, 204, 184, 168, 156, 168, 148};
+
+    for(int i = 0; i < problem_size.group_count; i++)
+    {
+        problem_size.Ns.push_back(768);
+        problem_size.Ks.push_back(4608);
+
+        // A and B use K as their stride, C uses N.
+        problem_size.stride_As.push_back(problem_size.Ks[i]);
+        problem_size.stride_Bs.push_back(problem_size.Ks[i]);
+        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
+    }
+
+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        // Any other argument count: print the usage text and exit without running.
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
+    // The process exit code is the negation of the run result.
+    return !run_grouped_gemm(problem_size, config);
+}
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -147,6 +147,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 #else
        a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
        b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
+        c_tensors_device[i]->SetZero();
 #endif

        p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());

--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
@@ -190,11 +190,11 @@ int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};

-    using InLayout           = ck::tensor_layout::convolution::GNHWC;
-    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+    using InLayout           = ck::tensor_layout::convolution::NHWGC;
+    using WeiLayout          = ck::tensor_layout::convolution::KYXGC;
    using BiasLayout         = ck::tensor_layout::convolution::G_K;
    using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-    using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+    using OutLayout          = ck::tensor_layout::convolution::NHWGK;

    const auto in_g_n_c_wis_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);

--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
@@ -178,10 +178,10 @@ int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_el
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};

-    using InLayout   = ck::tensor_layout::convolution::GNHWC;
-    using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
+    using InLayout   = ck::tensor_layout::convolution::NHWGC;
+    using WeiLayout  = ck::tensor_layout::convolution::KYXGC;
    using BiasLayout = ck::tensor_layout::convolution::G_K;
-    using OutLayout  = ck::tensor_layout::convolution::GNHWK;
+    using OutLayout  = ck::tensor_layout::convolution::NHWGK;

    const auto in_g_n_c_wis_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);

--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
@@ -180,10 +180,10 @@ int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_eleme
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};

-    using InLayout           = ck::tensor_layout::convolution::GNHWC;
-    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+    using InLayout           = ck::tensor_layout::convolution::NHWGC;
+    using WeiLayout          = ck::tensor_layout::convolution::KYXGC;
    using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-    using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+    using OutLayout          = ck::tensor_layout::convolution::NHWGK;

    const auto in_g_n_c_wis_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);