Merge pull request #13 from ROCmSoftwarePlatform/merge_from-_public_repo

Merge from public repo

Merge pull request #13 from ROCmSoftwarePlatform/merge_from-_public_repo
Merge from public repo
984889fb · Illia Silin · GitHub · a73ab0d8 · 4a106f7d · 984889fb
Unverified Commit 984889fb authored Nov 01, 2023 by Illia Silin Committed by GitHub Nov 01, 2023
20 changed files
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <vector>
@@ -172,18 +172,19 @@ int main()
            BLayout,
            CLayout>();

-    const auto normalize_ptrs =
-        ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
-            CDataType,
-            ReduceDataType,
-            ReduceDataType,
-            GammaDataType,
-            BetaDataType,
-            LayerNormOutDataType>();
-
    std::cout << "found " << gemm_reduce_ptrs.size()
              << " gemm_reduceMean_reduceSquareMean instances" << std::endl;

+    using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
+        ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
+        ck::Tuple<LayerNormOutDataType>,
+        ck::tensor_operation::element_wise::Normalize,
+        2>;
+
+    const auto normalize_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            NormalizeDeviceOp>::GetInstances();
+
    std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;

    auto f_matrix_space_size =

--- a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>

--- a/client_example/04_contraction/contraction_bilinear_fp32.cpp
+++ b/client_example/04_contraction/contraction_bilinear_fp32.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <numeric>

--- a/client_example/04_contraction/contraction_bilinear_fp64.cpp
+++ b/client_example/04_contraction/contraction_bilinear_fp64.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <numeric>

--- a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
+++ b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <numeric>

--- a/client_example/04_contraction/contraction_scale_fp32.cpp
+++ b/client_example/04_contraction/contraction_scale_fp32.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <numeric>

--- a/client_example/04_contraction/contraction_scale_fp64.cpp
+++ b/client_example/04_contraction/contraction_scale_fp64.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <numeric>

--- a/client_example/05_layernorm/layernorm2d.cpp
+++ b/client_example/05_layernorm/layernorm2d.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <vector>
@@ -16,9 +16,11 @@ using XDataType       = ck::half_t;
 using GammaDataType          = ck::half_t;
 using BetaDataType           = ck::half_t;
 using YDataType              = ck::half_t;
-using ComputeDataType = float;
+using SaveMeanInvStdDataType = float;
 using PassThrough            = ck::tensor_operation::element_wise::PassThrough;

+#define SAVE_MEAN_INV_STD
+
 constexpr int Rank         = 2;
 constexpr int NumReduceDim = 1;

@@ -50,12 +52,16 @@ int main(int argc, char* argv[])
    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+#ifdef SAVE_MEAN_INV_STD
+    SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * M);
+    SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M);
+#endif

    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
                                                                       GammaDataType,
                                                                       BetaDataType,
-                                                                       ComputeDataType,
                                                                       YDataType,
+                                                                       SaveMeanInvStdDataType,
                                                                       PassThrough,
                                                                       Rank,
                                                                       NumReduceDim>;
@@ -84,14 +90,21 @@ int main(int argc, char* argv[])
                                                        {0, 1},      // gammaStrides
                                                        {0, 1},      // betaStrides
                                                        {Stride, 1}, // yStrides
+                                                        {1},         // save_mean Strides
+                                                        {1},         // save_inv_std Strides
                                                        {1},         // reduceDims
                                                        1e-4,
                                                        x_device_buf.GetDeviceBuffer(),
                                                        gamma_device_buf.GetDeviceBuffer(),
                                                        beta_device_buf.GetDeviceBuffer(),
                                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                                        save_mean_device_buf.GetDeviceBuffer(),
+                                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
                                                        nullptr,
                                                        nullptr,
+#endif
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -100,11 +113,19 @@ int main(int argc, char* argv[])

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
                                   sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;

+#ifdef SAVE_MEAN_INV_STD
+            num_byte += sizeof(SaveMeanInvStdDataType) * M * 2;
+#endif
+
            float gb_per_sec = num_byte / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
@@ -136,23 +157,34 @@ int main(int argc, char* argv[])

        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N},      // lengths
                                                        {Stride, 1}, // xStrides
-                                                        {1},         // gammaStrides
-                                                        {1},         // betaStrides
+                                                        {0, 1},      // gammaStrides
+                                                        {0, 1},      // betaStrides
                                                        {Stride, 1}, // yStrides
+                                                        {1},         // save_mean Strides
+                                                        {1},         // save_inv_std Strides
                                                        {1},         // reduceDims
                                                        1e-4,
                                                        x_device_buf.GetDeviceBuffer(),
                                                        gamma_device_buf.GetDeviceBuffer(),
                                                        beta_device_buf.GetDeviceBuffer(),
                                                        y_device_buf.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+                                                        save_mean_device_buf.GetDeviceBuffer(),
+                                                        save_inv_std_device_buf.GetDeviceBuffer(),
+#else
                                                        nullptr,
                                                        nullptr,
+#endif
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }


--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <functional>
 #include <numeric>
@@ -53,12 +53,35 @@ int main(int argc, char* argv[])
    SimpleDeviceMem in(sizeof(InDataType) * num_elements);
    SimpleDeviceMem out(sizeof(OutDataType) * num_elements);

-    using DeviceOp = ck::tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
+                                                                 AccDataType,
+                                                                 OutDataType,
+                                                                 PassThrough,
+                                                                 PassThrough,
+                                                                 Rank,
+                                                                 NumReduceDim>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

+    auto& generic_op_ptr = op_ptrs[0];
+
+    auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths,
+                                                                    in_strides,
+                                                                    reduce_dims,
+                                                                    alpha,
+                                                                    beta,
+                                                                    in.GetDeviceBuffer(),
+                                                                    out.GetDeviceBuffer(),
+                                                                    PassThrough{},
+                                                                    PassThrough{});
+
+    if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "The generic kernel instance should be able to support any input shapes");
+    };
+
    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
@@ -74,11 +97,6 @@ int main(int argc, char* argv[])
    {
        auto& op_ptr = op_ptrs[i];

-        if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
-        {
-            continue;
-        }
-
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in_lengths,
                                                        in_strides,
                                                        reduce_dims,

--- a/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iomanip>

--- a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iomanip>
@@ -17,22 +17,22 @@ using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 using OutDataType = ck::half_t;

-using InLayout    = ck::tensor_layout::convolution::GNHWC;
+using InLayout    = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout   = ck::tensor_layout::convolution::GKYXC;
-using OutLayout   = ck::tensor_layout::convolution::GNHWK;
+using OutLayout   = ck::tensor_layout::convolution::NHWGK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr ck::index_t NumDimSpatial = 2;
 static constexpr ck::index_t G             = 32;
-static constexpr ck::index_t N             = 256;
-static constexpr ck::index_t K             = 192;
-static constexpr ck::index_t C             = 192;
-static constexpr ck::index_t Y             = 3;
-static constexpr ck::index_t X             = 3;
-static constexpr ck::index_t Hi            = 28;
-static constexpr ck::index_t Wi            = 28;
-static constexpr ck::index_t Ho            = 28;
-static constexpr ck::index_t Wo            = 28;
+static constexpr ck::index_t N             = 256; // batch size
+static constexpr ck::index_t K             = 64;  // output channel
+static constexpr ck::index_t C             = 32;  // input channel (per group)
+static constexpr ck::index_t Y             = 3;   // filter H
+static constexpr ck::index_t X             = 3;   // filter W
+static constexpr ck::index_t Hi            = 28;  // input H
+static constexpr ck::index_t Wi            = 28;  // input W
+static constexpr ck::index_t Ho            = 28;  // output H
+static constexpr ck::index_t Wo            = 28;  // output W

 struct SimpleDeviceMem
 {
@@ -52,50 +52,24 @@ struct SimpleDeviceMem

 int main()
 {
-    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
-    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
-
-    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
-    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
-
-    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
-    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
-
-    std::partial_sum(rbegin(in_lengths),
-                     std::prev(rend(in_lengths)),
-                     std::next(rbegin(in_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(wei_lengths),
-                     std::prev(rend(wei_lengths)),
-                     std::next(rbegin(wei_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(out_lengths),
-                     std::prev(rend(out_lengths)),
-                     std::next(rbegin(out_strides)),
-                     std::multiplies<>{});
-
-    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
-    std::rotate(
-        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
-    std::rotate(
-        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
-    std::rotate(
-        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
-    std::rotate(
-        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
-    std::rotate(
-        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
-    std::rotate(
-        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 5> wei_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> wei_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,
@@ -155,9 +129,9 @@ int main()
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
-            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
                                    sizeof(WeiDataType) * G * K * Y * X * C +
-                                    sizeof(OutDataType) * G * N * Ho * Wo * K;
+                                    sizeof(OutDataType) * N * Ho * Wo * G * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;

--- a/client_example/08_fused_attention/fused_attention.cpp
+++ b/client_example/08_fused_attention/fused_attention.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <vector>

--- a/client_example/08_fused_attention/fused_attention_bias.cpp
+++ b/client_example/08_fused_attention/fused_attention_bias.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <vector>

--- a/client_example/09_quantization/CMakeLists.txt
+++ b/client_example/09_quantization/CMakeLists.txt
+if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
 add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
 target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations)

@@ -18,3 +19,4 @@ target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable

 add_executable(client_gemm_quantization gemm_quantization.cpp)
 target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations)
+endif()
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>
@@ -17,20 +17,20 @@ using BiasDataType         = int32_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;

-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout         = ck::tensor_layout::convolution::G_K;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = ck::tensor_operation::element_wise::Relu;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;  // batch size
-static constexpr ck::index_t K             = 64;  // output channel
-static constexpr ck::index_t C             = 192; // input channel
+static constexpr ck::index_t K             = 32; // output channel
+static constexpr ck::index_t C             = 64; // input channel (per group)
 static constexpr ck::index_t Y             = 3;  // filter H
 static constexpr ck::index_t X             = 3;  // filter W
 static constexpr ck::index_t Hi            = 71; // input H
@@ -55,8 +55,11 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -64,17 +67,18 @@ int main(int argc, char* argv[])
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>
@@ -16,19 +16,19 @@ using WeiDataType  = int8_t;
 using BiasDataType = int32_t;
 using OutDataType  = int8_t;

-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout   = ck::tensor_layout::convolution::G_K;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = ck::tensor_operation::element_wise::Relu;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -55,23 +55,27 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>
@@ -17,21 +17,21 @@ using BiasDataType         = int32_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;

-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout         = ck::tensor_layout::convolution::G_K;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = ck::tensor_operation::element_wise::TanH;
 using OutElementOp =
    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -58,8 +58,11 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -67,17 +70,18 @@ int main(int argc, char* argv[])
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>
@@ -16,19 +16,19 @@ using WeiDataType  = int8_t;
 using BiasDataType = int32_t;
 using OutDataType  = int8_t;

-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
 using BiasLayout   = ck::tensor_layout::convolution::G_K;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = ck::tensor_operation::element_wise::TanH;
 using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -56,23 +56,27 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>
@@ -16,19 +16,19 @@ using WeiDataType          = int8_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;

-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = PassThrough;
 using OutElementOp       = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;  // batch size
-static constexpr ck::index_t K             = 64;  // output channel
-static constexpr ck::index_t C             = 192; // input channel
+static constexpr ck::index_t K             = 32; // output channel
+static constexpr ck::index_t C             = 64; // input channel (per group)
 static constexpr ck::index_t Y             = 3;  // filter H
 static constexpr ck::index_t X             = 3;  // filter W
 static constexpr ck::index_t Hi            = 71; // input H
@@ -54,23 +54,27 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,

--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>
@@ -15,18 +15,18 @@ using InDataType  = int8_t;
 using WeiDataType = int8_t;
 using OutDataType = int8_t;

-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t G             = 4;
 static constexpr ck::index_t N             = 4;    // batch size
-static constexpr ck::index_t K             = 64;   // output channel
-static constexpr ck::index_t C             = 192;  // input channel
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
 static constexpr ck::index_t Y             = 3;    // filter H
 static constexpr ck::index_t X             = 3;    // filter W
 static constexpr ck::index_t Hi            = 71;   // input H
@@ -53,20 +53,24 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,