Commit 8820cf9f authored by Po-Yen, Chen

Merge branch 'develop' into feature/integrage-karg-simplification-pr

parents cb46ef7a 4feebedd
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
  - package-ecosystem: "pip" # See documentation for possible values
    directory: "/docs/.sphinx" # Location of package manifests
    open-pull-requests-limit: 10
    schedule:
      interval: "daily"
FROM ubuntu:20.04
-ARG ROCMVERSION=5.3
+ARG ROCMVERSION=5.6
-ARG compiler_version="release"
+ARG compiler_version=""
ARG compiler_commit=""
RUN set -xe
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
RUN useradd -rm -d /home/manitera -s /bin/bash -u 1002 manitera
# Add rocm repository
RUN apt-get update
-RUN apt-get install -y wget gnupg
+RUN apt-get install -y wget gnupg curl
-RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
+RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6"]; then \
+        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+        sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
+    else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
+        apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
+        amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
+        DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
+    fi
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
+RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    apt-utils \
    build-essential \
    ccache \
+    cmake-data \
    cmake \
+    curl \
    git \
    hip-rocclr \
    jq \
@@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    rocm-device-libs \
    rocm-cmake \
    vim \
+    nano \
    zlib1g-dev \
    openssh-server \
    clang-format-10 \
@@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
+#Install latest version of cmake
+RUN apt purge --auto-remove -y cmake
+RUN apt update
+RUN apt install -y software-properties-common lsb-release
+RUN apt clean all
+RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
+RUN apt install -y kitware-archive-keyring
+RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
+RUN apt install -y cmake
# Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'"
RUN sh -c "echo compiler commit = '$compiler_commit'"
-RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \
-        sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \
-        sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \
-    fi
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
    git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
    cd llvm-project && mkdir build && cd build && \
    cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
@@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com
    else echo "using the release compiler"; \
fi
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
    git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
    cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
    cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
...
@@ -19,12 +19,33 @@ def runShell(String command){
def getDockerImageName(){
    def img
-    if (params.COMPILER_COMMIT == ""){
-        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+    if (params.ROCMVERSION != "5.5" && params.ROCMVERSION != "5.6"){
+        if (params.COMPILER_VERSION == "") {
+            img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
+        }
+        else{
+            if (params.COMPILER_COMMIT == ""){
+                img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+            }
+            else{
+                def commit = "${params.COMPILER_COMMIT}"[0..6]
+                img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+            }
+        }
    }
    else{
-        def commit = "${params.COMPILER_COMMIT}"[0..6]
-        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+        if (params.COMPILER_VERSION == "") {
+            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+        }
+        else{
+            if (params.COMPILER_COMMIT == ""){
+                img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+            }
+            else{
+                def commit = "${params.COMPILER_COMMIT}"[0..6]
+                img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+            }
+        }
    }
    return img
}
@@ -49,11 +70,11 @@ def build_compiler(){
        compiler = '/opt/rocm/bin/hipcc'
    }
    else{
-        if (params.COMPILER_VERSION == "release"){
-            compiler = "/opt/rocm/llvm/bin/clang++"
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
+            compiler = "/llvm-project/build/bin/clang++"
        }
        else{
-            compiler = "/llvm-project/build/bin/clang++"
+            compiler = "/opt/rocm/llvm/bin/clang++"
        }
    }
    return compiler
@@ -232,7 +253,7 @@ def buildHipClangJob(Map conf=[:]){
        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
    }
    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if (params.COMPILER_VERSION != "release"){
+    if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
    }
@@ -287,7 +308,7 @@ def runCKProfiler(Map conf=[:]){
        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
    }
    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if (params.COMPILER_VERSION != "release"){
+    if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
    }
@@ -420,7 +441,7 @@ def Build_CK(Map conf=[:]){
        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
    }
    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if (params.COMPILER_VERSION != "release"){
+    if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
    }
@@ -576,7 +597,7 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
-0 21 * * * % COMPILER_VERSION=release;COMPILER_COMMIT=
+0 21 * * * % ROCMVERSION=5.4.3;COMPILER_VERSION=release;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
pipeline {
@@ -594,16 +615,16 @@ pipeline {
            description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
        string(
            name: 'ROCMVERSION',
-            defaultValue: '5.4.3',
-            description: 'Specify which ROCM version to use: 5.4.3 (default).')
+            defaultValue: '5.6',
+            description: 'Specify which ROCM version to use: 5.6 (default).')
        string(
            name: 'COMPILER_VERSION',
-            defaultValue: 'amd-stg-open',
-            description: 'Specify which version of compiler to use: ck-9110, release, or amd-stg-open (default).')
+            defaultValue: '',
+            description: 'Specify which version of compiler to use: release, amd-stg-open, or leave blank (default).')
        string(
            name: 'COMPILER_COMMIT',
-            defaultValue: '5541927df00eabd6a110180170eca7785d436ee3',
-            description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
+            defaultValue: '',
+            description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
        string(
            name: 'BUILD_COMPILER',
            defaultValue: 'hipcc',
...
-add_executable(client_contraction_scale contraction_scale.cpp)
-target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations)
-add_executable(client_contraction_bilinear contraction_bilinear.cpp)
-target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_scale_fp32 contraction_scale_fp32.cpp)
+target_link_libraries(client_contraction_scale_fp32 PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_bilinear_fp32 contraction_bilinear_fp32.cpp)
+target_link_libraries(client_contraction_bilinear_fp32 PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_scale_fp64 contraction_scale_fp64.cpp)
+target_link_libraries(client_contraction_scale_fp64 PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_bilinear_fp64 contraction_bilinear_fp64.cpp)
+target_link_libraries(client_contraction_bilinear_fp64 PRIVATE composable_kernel::device_operations)
add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp)
target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
#include "ck/library/utility/numeric.hpp"
using F64 = double;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Bilinear;
using ADataType = F64;
using BDataType = F64;
using AccDataType = F64;
using CShuffleDataType = F64;
using DDataType = F64;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F64;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 2;
static constexpr ck::index_t NumDimK = 2;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// kknn
#if 1
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// knnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mknn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mnnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
#endif
float alpha = 1.f;
float beta = 1.f;
if(argc == 1)
{
// use default case
}
else if(argc == 25)
{
const ck::index_t M0 = std::stoi(argv[1]);
const ck::index_t M1 = std::stoi(argv[2]);
const ck::index_t N0 = std::stoi(argv[3]);
const ck::index_t N1 = std::stoi(argv[4]);
const ck::index_t K0 = std::stoi(argv[5]);
const ck::index_t K1 = std::stoi(argv[6]);
a_ms_ks_lengths = {M0, M1, K0, K1};
a_ms_ks_strides = {
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
b_ns_ks_lengths = {N0, N1, K0, K1};
b_ns_ks_strides = {
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
d_ms_ns_lengths = {M0, M1, N0, N1};
d_ms_ns_strides = {
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
e_ms_ns_lengths = {M0, M1, N0, N1};
e_ms_ns_strides = {
std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
alpha = std::stof(argv[23]);
beta = std::stof(argv[24]);
}
else
{
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
printf("arg23 to 24: alpha, beta\n");
exit(0);
}
auto f_tensor_space_size = [](auto lengths, auto strides) {
std::size_t space_size = 1;
for(std::size_t i = 0; i < lengths.size(); ++i)
{
space_size += (lengths[i] - 1) * strides[i];
}
return space_size;
};
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
SimpleDeviceMem d_device_buf(sizeof(DDataType) *
f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
ck::Tuple<DDataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::Bilinear>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{alpha, beta};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr =
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
ck::index_t M = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
ck::index_t N = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
ck::index_t K = ck::accumulate_n<ck::index_t>(
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return 0;
}
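For reference, a minimal standalone sketch (not the committed code) of the arithmetic behind the TFlops and GB/s numbers reported above, using the kknn shape hard-coded in this example (M = 30 x 128, N = 32 x 64, K = 32 x 64; every tensor is FP64):

// Illustrative only: reproduces the FLOP and byte counts of the timing loop above.
#include <cstddef>
#include <cstdio>
int main()
{
    const std::size_t M = 30 * 128; // product of the two M dimensions of E
    const std::size_t N = 32 * 64;  // product of the two N dimensions of E
    const std::size_t K = 32 * 64;  // product of the two K dimensions of A
    const std::size_t flop  = 2 * M * N * K;                       // one FMA = 2 flops per accumulation
    const std::size_t bytes = 8 * (M * K + K * N + M * N + M * N); // A + B + D + E, 8 bytes each
    std::printf("flop = %zu, bytes = %zu\n", flop, bytes);
    // The example reports flop / 1e9 / ave_time_ms as "TFlops" and bytes / 1e6 / ave_time_ms as GB/s;
    // at 1 ms per run that is roughly 32.2 TFlops and 222 GB/s.
    std::printf("at 1 ms: %.1f TFlops, %.1f GB/s\n", flop / 1.e9, bytes / 1.e6);
}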
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
#include "ck/library/utility/numeric.hpp"
using F64 = double;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Scale;
using ADataType = F64;
using BDataType = F64;
using AccDataType = F64;
using CShuffleDataType = F64;
using DsDataType = ck::Tuple<>;
using EDataType = F64;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 2;
static constexpr ck::index_t NumDimK = 2;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// kkn
#if 1
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// knn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mkn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
#endif
float scale = 1.f;
if(argc == 1)
{
// use default case
}
else if(argc == 20)
{
const ck::index_t M0 = std::stoi(argv[1]);
const ck::index_t M1 = std::stoi(argv[2]);
const ck::index_t N0 = std::stoi(argv[3]);
const ck::index_t N1 = std::stoi(argv[4]);
const ck::index_t K0 = std::stoi(argv[5]);
const ck::index_t K1 = std::stoi(argv[6]);
a_ms_ks_lengths = {M0, M1, K0, K1};
a_ms_ks_strides = {
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
b_ns_ks_lengths = {N0, N1, K0, K1};
b_ns_ks_strides = {
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
e_ms_ns_lengths = {M0, M1, N0, N1};
e_ms_ns_strides = {
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
scale = std::stof(argv[19]);
}
else
{
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
printf("arg19: scale\n");
exit(0);
}
auto f_tensor_space_size = [](auto lengths, auto strides) {
std::size_t space_size = 1;
for(std::size_t i = 0; i < lengths.size(); ++i)
{
space_size += (lengths[i] - 1) * strides[i];
}
return space_size;
};
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
ck::Tuple<>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::Scale>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{scale};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
e_device_buf.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 0>{},
std::array<std::vector<ck::index_t>, 0>{},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
ck::index_t M = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
ck::index_t N = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
ck::index_t K = ck::accumulate_n<ck::index_t>(
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return 0;
}
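The f_tensor_space_size helper above sizes each allocation from lengths and strides rather than from the raw element count, so non-compact layouts are covered as well; a small illustrative sketch (not part of the commit) applied to the default A tensor of this example:

// Illustrative only: the footprint formula of f_tensor_space_size, applied to
// lengths {30, 128, 32, 64} and strides {524288, 4096, 128, 1}.
#include <array>
#include <cstddef>
#include <cstdio>
int main()
{
    const std::array<std::size_t, 4> lengths{30, 128, 32, 64};
    const std::array<std::size_t, 4> strides{524288, 4096, 128, 1};
    std::size_t space_size = 1; // highest addressable element + 1
    for(std::size_t i = 0; i < lengths.size(); ++i)
        space_size += (lengths[i] - 1) * strides[i];
    // 15728576 elements; times sizeof(double) this is ~125.8 MB for the A buffer.
    std::printf("%zu elements, %zu bytes\n", space_size, space_size * sizeof(double));
}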
@@ -17,22 +17,22 @@ using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 32;
-static constexpr ck::index_t N = 256;
-static constexpr ck::index_t K = 192;
-static constexpr ck::index_t C = 192;
-static constexpr ck::index_t Y = 3;
-static constexpr ck::index_t X = 3;
-static constexpr ck::index_t Hi = 28;
-static constexpr ck::index_t Wi = 28;
-static constexpr ck::index_t Ho = 28;
-static constexpr ck::index_t Wo = 28;
+static constexpr ck::index_t N = 256; // batch size
+static constexpr ck::index_t K = 64;  // output channel
+static constexpr ck::index_t C = 32;  // input channel (per group)
+static constexpr ck::index_t Y = 3;   // filter H
+static constexpr ck::index_t X = 3;   // filter W
+static constexpr ck::index_t Hi = 28; // input H
+static constexpr ck::index_t Wi = 28; // input W
+static constexpr ck::index_t Ho = 28; // output H
+static constexpr ck::index_t Wo = 28; // output W
struct SimpleDeviceMem
{
@@ -52,50 +52,24 @@ struct SimpleDeviceMem
int main()
{
-    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
-    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
-    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
-    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
-    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
-    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
-    std::partial_sum(rbegin(in_lengths),
-                     std::prev(rend(in_lengths)),
-                     std::next(rbegin(in_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(wei_lengths),
-                     std::prev(rend(wei_lengths)),
-                     std::next(rbegin(wei_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(out_lengths),
-                     std::prev(rend(out_lengths)),
-                     std::next(rbegin(out_strides)),
-                     std::multiplies<>{});
-    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
-    std::rotate(
-        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
-    std::rotate(
-        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
-    std::rotate(
-        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
-    std::rotate(
-        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
-    std::rotate(
-        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
-    std::rotate(
-        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 5> wei_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> wei_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,
@@ -155,9 +129,9 @@ int main()
        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
        std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
-        std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+        std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
                                sizeof(WeiDataType) * G * K * Y * X * C +
-                                sizeof(OutDataType) * G * N * Ho * Wo * K;
+                                sizeof(OutDataType) * N * Ho * Wo * G * K;
        float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
        float gb_per_sec = num_bytes / 1.E6 / avg_time;
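The NHWGC/GKYXC/NHWGK stride literals introduced by this change follow mechanically from the memory layout once the lengths are listed in the GNCHW/GKCYX/GNCHW order the API expects: the stride of a logical dimension is the product of the extents of everything to its right in memory. A short illustrative sketch (not part of the commit) for the input tensor:

// Illustrative only: derive GNCHW-ordered strides for an NHWGC input tensor,
// matching in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C} above.
#include <array>
#include <cstdio>
int main()
{
    const int G = 32, C = 32, Hi = 28, Wi = 28;
    const std::array<int, 5> in_strides_gnchw{
        C,               // G: stepping one group skips that group's C elements
        Hi * Wi * G * C, // N: one image is H * W * G * C elements
        1,               // C: contiguous innermost dimension
        Wi * G * C,      // H: one row is W * G * C elements
        G * C            // W: one pixel is G * C elements
    };
    for(int s : in_strides_gnchw)
        std::printf("%d ", s); // prints: 32 802816 1 28672 1024
    std::printf("\n");
}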
...
@@ -17,26 +17,26 @@ using BiasDataType = int32_t;
using RequantScaleDataType = float;
using OutDataType = int8_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu;
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
+static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4;   // batch size
-static constexpr ck::index_t K = 64;  // output channel
-static constexpr ck::index_t C = 192; // input channel
+static constexpr ck::index_t K = 32;  // output channel
+static constexpr ck::index_t C = 64;  // input channel (per group)
static constexpr ck::index_t Y = 3;   // filter H
static constexpr ck::index_t X = 3;   // filter W
static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;
@@ -55,8 +55,11 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -64,17 +67,18 @@ int main(int argc, char* argv[])
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,
@@ -203,4 +207,4 @@ int main(int argc, char* argv[])
    }
    return 0;
}
\ No newline at end of file
@@ -16,19 +16,19 @@ using WeiDataType = int8_t;
using BiasDataType = int32_t;
using OutDataType = int8_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu;
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
+static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4;   // batch size
-static constexpr ck::index_t K = 64;  // output channel
-static constexpr ck::index_t C = 192; // input channel
+static constexpr ck::index_t K = 32;  // output channel
+static constexpr ck::index_t C = 64;  // input channel (per group)
static constexpr ck::index_t Y = 3;   // filter H
static constexpr ck::index_t X = 3;   // filter W
static constexpr ck::index_t Hi = 71; // input H
@@ -55,23 +55,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
...
@@ -17,21 +17,21 @@ using BiasDataType = int32_t;
using RequantScaleDataType = float;
using OutDataType = int8_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp =
    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
+static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4;   // batch size
-static constexpr ck::index_t K = 64;  // output channel
-static constexpr ck::index_t C = 192; // input channel
+static constexpr ck::index_t K = 32;  // output channel
+static constexpr ck::index_t C = 64;  // input channel (per group)
static constexpr ck::index_t Y = 3;   // filter H
static constexpr ck::index_t X = 3;   // filter W
static constexpr ck::index_t Hi = 71; // input H
@@ -58,8 +58,11 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -67,17 +70,18 @@ int main(int argc, char* argv[])
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,
@@ -206,4 +210,4 @@ int main(int argc, char* argv[])
    }
    return 0;
}
\ No newline at end of file
...@@ -16,19 +16,19 @@ using WeiDataType = int8_t; ...@@ -16,19 +16,19 @@ using WeiDataType = int8_t;
using BiasDataType = int32_t; using BiasDataType = int32_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K; using BiasLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH; using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
...@@ -56,23 +56,27 @@ struct SimpleDeviceMem ...@@ -56,23 +56,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
......
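The new in_strides array above follows directly from keeping the buffer in NHWGC order while passing lengths and strides in GNCHW order: in a packed NHWGC tensor, stride(C) = 1, stride(G) = C, stride(W) = G * C, stride(H) = Wi * G * C and stride(N) = Hi * Wi * G * C. A minimal standalone sketch of that derivation (the helper name and the plain index_t alias are illustrative, not part of the change):

#include <array>
#include <cstdint>

using index_t = std::int32_t; // stand-in for ck::index_t in this sketch

// Strides of a packed NHWGC tensor, reported in the GNCHW order the API expects.
std::array<index_t, 5> nhwgc_strides_in_gnchw_order(index_t G, index_t C, index_t Hi, index_t Wi)
{
    const index_t sC = 1;               // C varies fastest in memory
    const index_t sG = C;
    const index_t sW = G * C;
    const index_t sH = Wi * G * C;
    const index_t sN = Hi * Wi * G * C; // N varies slowest
    return {sG, sN, sC, sH, sW};        // reordered to G, N, C, H, W
}

For a packed NHWGK output the analogous pattern uses K, Ho and Wo in place of C, Hi and Wi.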
...@@ -16,25 +16,25 @@ using WeiDataType = int8_t; ...@@ -16,25 +16,25 @@ using WeiDataType = int8_t;
using RequantScaleDataType = float; using RequantScaleDataType = float;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K; using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough; using ActivationOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W static constexpr ck::index_t Wo = 36; // output W
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -54,23 +54,27 @@ struct SimpleDeviceMem ...@@ -54,23 +54,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) layouts in memory
// However, CK's API only accepts lengths and strides given in GNCHW/GKCYX/GNKHW order
// Hence, the stride entries have to be reordered accordingly
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
...@@ -196,4 +200,4 @@ int main(int argc, char* argv[]) ...@@ -196,4 +200,4 @@ int main(int argc, char* argv[])
} }
return 0; return 0;
} }
\ No newline at end of file
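A note on the bias and requantization-scale tensors in the two hunks above: both hold one value per (group, output channel), i.e. G * K elements, so with lengths {G, N, K, Ho, Wo} the stride array {K, 0, 1, 0, 0} addresses element g * K + k and the zero strides broadcast it over N, Ho and Wo. A quick sketch of the addressing (comments only, no CK calls assumed):

// offset of the bias / scale value applied to output element (g, n, k, ho, wo):
//   offset = g * K + n * 0 + k * 1 + ho * 0 + wo * 0 = g * K + k
// i.e. one value per (group, output channel), reused for every batch and spatial position,
// which is also why the corrected allocations are sizeof(T) * G * K.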
...@@ -15,18 +15,18 @@ using InDataType = int8_t; ...@@ -15,18 +15,18 @@ using InDataType = int8_t;
using WeiDataType = int8_t; using WeiDataType = int8_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough; using ActivationOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
...@@ -53,20 +53,24 @@ struct SimpleDeviceMem ...@@ -53,20 +53,24 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) layouts in memory
// However, CK's API only accepts lengths and strides given in GNCHW/GKCYX/GNKHW order
// Hence, the stride entries have to be reordered accordingly
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout, InLayout,
......
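A note on the element-wise epilogues these int8 examples use: reading the operator names left to right, Activation_Mul_Clamp applies the activation to the int32 accumulator, multiplies by a requantization scale and saturates to int8, while Add_Mul_Activation_Mul_Clamp additionally adds the bias and applies a second scale. That reading is inferred from the names rather than stated in the diff; a rough standalone sketch of the simplest variant:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hedged sketch of an Activation_Mul_Clamp-style int8 requantization
// (an interpretation of the operator name, not CK's exact implementation).
std::int8_t requantize(std::int32_t acc, float scale)
{
    float v = static_cast<float>(acc);  // ActivationOp is PassThrough in this example
    v *= scale;                         // per-channel requantization scale
    v = std::clamp(v, -128.0f, 127.0f); // saturate to the int8 range
    return static_cast<std::int8_t>(std::lround(v));
}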
add_executable(client_groupnorm_swish groupnorm_swish.cpp)
target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"
using XDataType = ck::half_t;
using GammaDataType = float;
using BetaDataType = float;
using YDataType = ck::half_t;
using ComputeDataType = float;
using Swish = ck::tensor_operation::element_wise::Swish;
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
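// Thin RAII wrapper around hipMalloc/hipFree; return codes are deliberately ignored in this example.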
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
ck::index_t N = 32;
ck::index_t H = 16;
ck::index_t W = 16;
ck::index_t G = 64;
ck::index_t C = 128;
std::size_t xy_size = N * H * W * G * C;
std::size_t gamma_beta_size = G * C;
std::vector<ck::index_t> xy_strides = {H * W * G * C, W * G * C, G * C, C, 1};
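// gamma and beta hold G * C elements; the zero strides broadcast them over N, H and W.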
std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
Swish,
Rank,
NumReduceDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
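// lengths are {N, H, W, G, C}, so reduceDims {1, 2, 4} normalizes over (H, W, C) within each (n, g) slice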
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
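// bytes moved: one read of x, gamma and beta plus one write of y; ave_time is in ms, so bytes / 1e6 / ms yields GB/s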
std::size_t num_byte =
sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
// run the best instance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
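For reference, a sketch of what this new client example computes, assuming standard group-normalization semantics and CK's default Swish parameter (this summary is not part of the diff):

    mu, var : mean and variance over the (H, W, C) extent of each (n, g) slice
    y = swish(gamma * (x - mu) / sqrt(var + 1e-6) + beta),  with swish(t) = t * sigmoid(t)

gamma and beta are broadcast from their (G, C) shape, matching the zero strides used above.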
...@@ -92,6 +92,7 @@ else() ...@@ -92,6 +92,7 @@ else()
-Wno-unused-command-line-argument -Wno-unused-command-line-argument
-Wno-weak-vtables -Wno-weak-vtables
-Wno-covered-switch-default -Wno-covered-switch-default
-Wno-unsafe-buffer-usage
) )
else() else()
if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX") if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX")
......
...@@ -21,6 +21,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS ...@@ -21,6 +21,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-comma -Wno-comma
-Wno-old-style-cast -Wno-old-style-cast
-Wno-deprecated -Wno-deprecated
-Wno-unsafe-buffer-usage
) )
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}") message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
......
git+https://github.com/RadeonOpenCompute/rocm-docs-core.git rocm-docs-core==0.2.0
sphinxcontrib-bibtex==2.5.0 sphinxcontrib-bibtex==2.5.0