sync from public repo

c997bbf6 · illsilin · 91c1d147 · ae5e5181 · c997bbf6 · c997bbf6
Commit c997bbf6 authored Nov 28, 2023 by illsilin
20 changed files
--- a/example/54_groupnorm_bwd/CMakeLists.txt
+++ b/example/54_groupnorm_bwd/CMakeLists.txt
+add_example_executable(example_groupnorm_bwd_fp16 groupnorm_bwd_fp16.cpp)
--- a/example/54_groupnorm_bwd/groupnorm_bwd_fp16.cpp
+++ b/example/54_groupnorm_bwd/groupnorm_bwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp"
+
+using DYDataType         = ck::half_t;
+using XDataType          = ck::half_t;
+using GammaDataType      = ck::half_t;
+using MeanInvStdDataType = float;
+using DGammaDataType     = ck::half_t;
+using DBetaDataType      = ck::half_t;
+using DXDataType         = ck::half_t;
+using ComputeDataType    = float;
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+// Grouprnorm
+// kernel:                  M    , K
+// dy:     N, H, W, G, C -> G * C, N * H * W
+// x:      N, H, W, G, C -> G * C, N * H * W
+// mean:   N, 1, 1, G, 1 -> G * 1, N * 1 * 1
+// rstd:   N, 1, 1, G, 1 -> G * 1, N * 1 * 1
+
+// dgamma: 1, 1, 1, G, C -> G * C
+// dbeta:  1, 1, 1, G, C -> G * C
+
+// reduced axis: 0, 1, 2
+
+using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizationBwdGammaBetaImpl<
+    DYDataType,
+    XDataType,
+    MeanInvStdDataType,
+    ComputeDataType,
+    DGammaDataType,
+    DBetaDataType,
+    Rank,
+    NumReduceDim,
+    256,   // BlockSize
+    8,     // ClusterInvarient
+    32,    // ClusterReduce
+    8,     // SliceInvarient
+    1,     // SliceReduce
+    false, // IsDYFastestDimReduced
+    8,     // DYSrcVectorSize
+    false, // IsXFastestDimReduced
+    8,     // XSrcVectorSize
+    false, // IsMeanInvStdFastestDimReduced
+    1,     // MeanInvStdSrcVectorSize
+    1,     // DGammaDstVectorSize
+    1>;    // DBetaDstVectorSize
+
+int main()
+{
+    bool time_kernel = false;
+
+    ck::index_t N = 16;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 32;
+    ck::index_t C = 64;
+
+    Tensor<DYDataType> dy({N, H, W, G, C});
+    Tensor<XDataType> x({N, H, W, G, C});
+    Tensor<GammaDataType> gamma({G, C});
+    Tensor<MeanInvStdDataType> mean({N, G});
+    Tensor<MeanInvStdDataType> inv_std({N, G});
+
+    Tensor<DGammaDataType> dgamma({G, C});
+    Tensor<DBetaDataType> dbeta({G, C});
+    Tensor<DXDataType> dx({N, H, W, G, C});
+
+    dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0.0, 1.0});
+    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
+    gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
+    mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});
+    inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});
+
+    DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
+    DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
+    DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
+    DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
+
+    dy_dev.ToDevice(dy.mData.data());
+    x_dev.ToDevice(x.mData.data());
+    mean_dev.ToDevice(mean.mData.data());
+    inv_std_dev.ToDevice(inv_std.mData.data());
+
+    std::vector<ck::index_t> dyStrides{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> xStrides{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> meanStrides   = {G, 0, 0, 1, 0};
+    std::vector<ck::index_t> invStdStrides = {G, 0, 0, 1, 0};
+
+    auto gamma_beta_device_instance = GammaBetaDeviceInstance{};
+    auto gamma_beta_argument_ptr =
+        gamma_beta_device_instance.MakeArgumentPointer({N, H, W, G, C}, // inLengths
+                                                       dyStrides,       // dyStrides
+                                                       xStrides,        // xStrides
+                                                       meanStrides,     // meanStrides
+                                                       invStdStrides,   // invStdStrides
+                                                       {G, C},          // outLengths
+                                                       {C, 1},          // dgammaStrides
+                                                       {C, 1},          // dbetaStrides
+                                                       {0, 1, 2},       // reduceDims
+                                                       dy_dev.GetDeviceBuffer(),
+                                                       x_dev.GetDeviceBuffer(),
+                                                       mean_dev.GetDeviceBuffer(),
+                                                       inv_std_dev.GetDeviceBuffer(),
+                                                       dgamma_dev.GetDeviceBuffer(),
+                                                       dbeta_dev.GetDeviceBuffer());
+
+    if(!gamma_beta_device_instance.IsSupportedArgument(gamma_beta_argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    };
+
+    auto gamma_beta_invoker_ptr = gamma_beta_device_instance.MakeInvokerPointer();
+    gamma_beta_invoker_ptr->Run(gamma_beta_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    bool pass = true;
+    {
+        Tensor<DGammaDataType> host_dgamma({G, C});
+        Tensor<DBetaDataType> host_dbeta({G, C});
+        Tensor<DXDataType> host_dx({N, H, W, G, C});
+        using ReferenceInstance =
+            ck::tensor_operation::host::ReferenceGroupnormBwd<DYDataType,
+                                                              XDataType,
+                                                              GammaDataType,
+                                                              MeanInvStdDataType,
+                                                              DGammaDataType,
+                                                              DBetaDataType,
+                                                              DXDataType,
+                                                              ComputeDataType>;
+
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(
+            dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, {N, H, W, G, C});
+        auto ref_invoker = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+
+        dgamma_dev.FromDevice(dgamma.mData.data());
+        dbeta_dev.FromDevice(dbeta.mData.data());
+
+        pass &= ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
+    }
+
+    return (pass ? 0 : 1);
+}
--- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
+++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
@@ -34,6 +34,7 @@ using AccDataType      = F32;
 using CShuffleDataType = F32;
 using DDataType        = F16;
 using EDataType        = F16;
+using ComputeDataType  = F16;

 static constexpr ck::index_t NumDimM = 2;
 static constexpr ck::index_t NumDimN = 2;
@@ -291,6 +292,7 @@ int main(int argc, char* argv[])
                                                                      BDataType,
                                                                      CShuffleDataType,
                                                                      AccDataType,
+                                                                      ComputeDataType,
                                                                      PassThrough,
                                                                      BElementOp>;


--- a/example/62_conv_fwd_activ/CMakeLists.txt
+++ b/example/62_conv_fwd_activ/CMakeLists.txt
@@ -30,6 +30,18 @@ foreach(gpu IN LISTS GPU_TARGETS)
      # Elu
      add_example_executable(example_convnd_fwd_xdl_elu_fp16 convnd_fwd_xdl_elu_fp16.cpp)
      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_elu_fp16)
+      # ScaleAdd on A and B
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp16 multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_fp16)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp32 multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_fp32)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_bf16 multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_bf16)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_int8 multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_int8)
+      # ScaleAdd ScaleAdd Relu
+      add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
   set(target 1)
 endif()
 endforeach()
--- a/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp
@@ -11,7 +11,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -47,7 +47,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio

 template <typename OutElementOp>
 using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
@@ -190,9 +190,8 @@ bool run_grouped_conv_fwd(bool do_verification,

    if(!conv.IsSupportedArgument(argument))
    {
-        throw std::runtime_error(
-            "wrong! device_conv with the specified compilation parameters does "
-            "not support this Conv problem");
+        throw std::runtime_error("The device op with the specified compilation parameters does "
+                                 "not support this convolution problem.");
    }

    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+constexpr ck::index_t NDimSpatial = 3;
+using InDataType                  = ck::half_t;
+using WeiDataType                 = ck::half_t;
+using AccDataType                 = float;
+using CShuffleDataType            = ck::half_t;
+using OutDataType                 = ck::half_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <typename OutElementOp>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<OutLayout, OutLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<OutDataType, OutDataType>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+
+namespace {
+// Use custom implementation to pass two more tensors for post op
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          int init_method,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    constexpr ck::index_t NumDs = 2;
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    std::array<Tensor<OutDataType>, NumDs> d_tensors = {Tensor<OutDataType>(out_g_n_k_wos_desc),
+                                                        Tensor<OutDataType>(out_g_n_k_wos_desc)};
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
+        d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
+        d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
+        d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem d0_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
+    DeviceMem d1_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    d0_buf.ToDevice(d_tensors[0].mData.data());
+    d1_buf.ToDevice(d_tensors[1].mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    const std::array<const void*, NumDs> ds = {d0_buf.GetDeviceBuffer(), d1_buf.GetDeviceBuffer()};
+
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      ds,
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
+                                          e_g_n_k_wos_lengths, e_g_n_k_wos_lengths},
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
+                                          e_g_n_k_wos_strides, e_g_n_k_wos_strides},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("The device op with the specified compilation parameters does "
+                                 "not support this convolution problem.");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop =
+        conv_param.GetFlops() + 2 * conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
+                            2 * conv_param.GetOutputByte<OutDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        auto ref_conv =
+            ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                         InDataType,
+                                                         WeiDataType,
+                                                         OutDataType,
+                                                         InElementOp,
+                                                         WeiElementOp,
+                                                         OutElementOp,
+                                                         0, /*Num A Elementwise Tensors*/
+                                                         0, /*Num B Elementwise Tensors*/
+                                                         NumDs>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op,
+                                                  {},
+                                                  {},
+                                                  d_tensors);
+
+        ref_invoker.Run(ref_argument);
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
+    }
+
+    return true;
+}
+
+} // namespace
+
+#include "run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+
+using DataType    = ck::bhalf_t;
+using AccDataType = float;
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>;
+using BDataTypes  = ck::Tuple<DataType, DataType>;
+
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+
+#include "../run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+
+using DataType    = ck::half_t;
+using AccDataType = float;
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>;
+using BDataTypes  = ck::Tuple<DataType, DataType>;
+
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+
+#include "../run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+
+using DataType    = float;
+using AccDataType = float;
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>;
+using BDataTypes  = ck::Tuple<DataType, DataType>;
+
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+
+#include "../run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+
+using DataType    = int8_t;
+using AccDataType = int32_t;
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>;
+using BDataTypes  = ck::Tuple<DataType, DataType>;
+
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+
+#include "../run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
+++ b/example/62_conv_fwd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+constexpr ck::index_t NDimSpatial = 3;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <typename DataType,
+          typename AccDataType,
+          typename InDataTypes,
+          typename WeiDataTypes,
+          typename InElementOp,
+          typename WeiElementOp>
+using DeviceGroupedConvNDMultiABFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataTypes,
+        WeiDataTypes,
+        AccDataType,
+        DataType,
+        ck::Tuple<>,
+        DataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+namespace {
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          int init_method,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    constexpr ck::index_t NumAs = 2;
+    constexpr ck::index_t NumBs = 2;
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<InDataType> in_bias(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<WeiDataType> wei_bias(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        in_bias.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        wei_bias.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        in_bias.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
+        wei_bias.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem in_bias_device_buf(sizeof(InDataType) * in_bias.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem wei_bias_device_buf(sizeof(WeiDataType) * wei_bias.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    in_bias_device_buf.ToDevice(in_bias.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    wei_bias_device_buf.ToDevice(wei_bias.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    std::array<const void*, NumAs> as{in_device_buf.GetDeviceBuffer(),
+                                      in_bias_device_buf.GetDeviceBuffer()};
+    std::array<const void*, NumBs> bs{wei_device_buf.GetDeviceBuffer(),
+                                      wei_bias_device_buf.GetDeviceBuffer()};
+    std::array<const void*, 0> ds{};
+
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(as,
+                                      bs,
+                                      ds,
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      {},
+                                      {},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = conv_param.GetFlops() +
+                       2 * conv_param.GetOutputByte<InDataType>() / sizeof(InDataType) +
+                       2 * conv_param.GetOutputByte<WeiDataType>() / sizeof(WeiDataType);
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
+                            conv_param.GetInputByte<InDataType>() +
+                            conv_param.GetWeightByte<WeiDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        const std::array<Tensor<InDataType>, NumAs - 1> elementwise_a_tensors  = {in_bias};
+        const std::array<Tensor<WeiDataType>, NumBs - 1> elementwise_b_tensors = {wei_bias};
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp,
+                                                                     NumAs - 1,
+                                                                     NumBs - 1>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op,
+                                                  elementwise_a_tensors,
+                                                  elementwise_b_tensors);
+
+        ref_invoker.Run(ref_argument);
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
+    }
+
+    return true;
+}
+
+} // namespace
--- a/example/63_layernorm4d_fwd/CMakeLists.txt
+++ b/example/63_layernorm4d_fwd/CMakeLists.txt
+add_example_executable(example_layernorm4d_fwd_fp16 layernorm4d_fwd_fp16.cpp)
+add_example_executable(example_layernorm4d_fwd_splitk_fp16 layernorm4d_fwd_splitk_fp16.cpp)
--- a/example/63_layernorm4d_fwd/common.hpp
+++ b/example/63_layernorm4d_fwd/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
--- a/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp
+++ b/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using ComputeDataType        = float;
+using PassThrough            = ck::tensor_operation::element_wise::PassThrough;
+
+#define SAVE_MEAN_INV_STD
+
+constexpr int Rank         = 4;
+constexpr int NumReduceDim = 3;
+
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationFwdImpl<XDataType,
+                                                             GammaDataType,
+                                                             BetaDataType,
+                                                             ComputeDataType,
+                                                             YDataType,
+                                                             SaveMeanInvStdDataType,
+                                                             PassThrough,
+                                                             Rank,
+                                                             NumReduceDim,
+                                                             256, // BlockSize
+                                                             8,   // ClusterM
+                                                             32,  // ClusterK
+                                                             1,   // SliceM
+                                                             8,   // SliceK
+                                                             1,   // XYVectorDim (0=M, 1=K)
+                                                             8,   // SrcScalarPerVector
+                                                             1,   // GammaVecDim (0=M, 1=K)
+                                                             8,   // GammaScalarPerVector
+                                                             1,   // BetaVecDim (0=M, 1=K)
+                                                             8,   // BetaScalarPerVector
+                                                             8,   // YScalarPerVector
+                                                             1>;  // SaveMeanInvStdScalarPerVector
+#include "run_layernorm4d_fwd_example.inc"
+
+int main() { return run_layernorm4d_fwd_example<DeviceInstance>(); }
--- a/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp
+++ b/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using ComputeDataType        = float;
+using PassThrough            = ck::tensor_operation::element_wise::PassThrough;
+
+#define SAVE_MEAN_INV_STD
+
+constexpr int Rank         = 4;
+constexpr int NumReduceDim = 3;
+
+using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl<
+    XDataType,
+    GammaDataType,
+    BetaDataType,
+    ComputeDataType,
+    YDataType,
+    SaveMeanInvStdDataType,
+    PassThrough,
+    Rank,
+    NumReduceDim,
+    256, // BlockSize
+    8,   // ClusterM
+    32,  // ClusterK
+    1,   // SliceM
+    8,   // SliceK
+    1,   // XYVectorDim (0=M, 1=K)
+    8,   // XScalarPerVector
+    1,   // GammaVecDim (0=M, 1=K)
+    8,   // GammaScalarPerVector
+    1,   // BetaVecDim (0=M, 1=K)
+    8,   // BetaScalarPerVector
+    8,   // YScalarPerVector
+    1>;  // SaveMeanInvStdScalarPerVector
+
+#include "run_layernorm4d_fwd_example.inc"
+
+int main() { return run_layernorm4d_fwd_example<DeviceInstance>(); }
--- a/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc
+++ b/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename DeviceInstance>
+int run_layernorm4d_fwd_example()
+{
+    bool time_kernel = false;
+
+    ck::index_t N = 256;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t C = 8;
+
+    Tensor<XDataType> x({N, H, W, C});
+    Tensor<GammaDataType> gamma({H, W, C});
+    Tensor<BetaDataType> beta({H, W, C});
+    Tensor<YDataType> y({N, H, W, C});
+    Tensor<SaveMeanInvStdDataType> save_mean({N});
+    Tensor<SaveMeanInvStdDataType> save_inv_std({N});
+
+    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
+    gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
+    beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
+
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+#ifdef SAVE_MEAN_INV_STD
+    DeviceMem save_mean_dev(sizeof(SaveMeanInvStdDataType) * save_mean.mDesc.GetElementSpaceSize());
+    DeviceMem save_inv_std_dev(sizeof(SaveMeanInvStdDataType) *
+                               save_inv_std.mDesc.GetElementSpaceSize());
+#endif
+
+    x_dev.ToDevice(x.mData.data());
+    gamma_dev.ToDevice(gamma.mData.data());
+    beta_dev.ToDevice(beta.mData.data());
+
+    auto device_instance = DeviceInstance{};
+    auto argument_ptr    = device_instance.MakeArgumentPointer(
+        {N, H, W, C},
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
+        {0, W * C, C, 1},
+        {0, W * C, C, 1},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
+                                 save_mean.mDesc.GetStrides().end()},
+        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
+                                 save_mean.mDesc.GetStrides().end()},
+        {1, 2, 3},
+        1e-4,
+        x_dev.GetDeviceBuffer(),
+        gamma_dev.GetDeviceBuffer(),
+        beta_dev.GetDeviceBuffer(),
+        y_dev.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+        save_mean_dev.GetDeviceBuffer(),
+        save_inv_std_dev.GetDeviceBuffer(),
+#else
+        nullptr,
+        nullptr,
+#endif
+        PassThrough{});
+
+    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    };
+
+    size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
+    DeviceMem workspace_dev(workspace_sz);
+    device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
+    auto invoker_ptr = device_instance.MakeInvokerPointer();
+    invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    bool pass = true;
+    {
+        Tensor<YDataType> host_y({N, H, W, C});
+        Tensor<SaveMeanInvStdDataType> host_save_mean({N});
+        Tensor<SaveMeanInvStdDataType> host_save_inv_std({N});
+
+        using ReferenceInstance =
+            ck::tensor_operation::host::ReferenceLayernorm<XDataType,
+                                                           GammaDataType,
+                                                           BetaDataType,
+                                                           YDataType,
+                                                           SaveMeanInvStdDataType,
+                                                           ComputeDataType,
+                                                           PassThrough,
+                                                           Rank,
+                                                           NumReduceDim>;
+
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(x,
+                                             gamma,
+                                             beta,
+                                             host_y,
+                                             host_save_mean,
+                                             host_save_inv_std,
+                                             PassThrough{},
+                                             {N, H, W, C},
+                                             {1, 2, 3},
+                                             1e-4);
+        auto ref_invoker  = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+
+        y_dev.FromDevice(y.mData.data());
+        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results (y)", 1e-3, 1e-3);
+#ifdef SAVE_MEAN_INV_STD
+        save_mean_dev.FromDevice(save_mean.mData.data());
+        save_inv_std_dev.FromDevice(save_inv_std.mData.data());
+        pass &= ck::utils::check_err(
+            save_mean, host_save_mean, "Error: Incorrect results (mean)", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(
+            save_inv_std, host_save_inv_std, "Error: Incorrect results (inv_std)", 1e-3, 1e-3);
+#endif
+    }
+
+    return (pass ? 0 : 1);
+}
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -134,6 +134,9 @@
 // inner product using V_DOT with DPP8 modifiers
 #define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1

+// set stochastic rounding as default for f8 conversions
+#define CK_USE_SR_F8_CONVERSION 1
+
 // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
 #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1


--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -58,4 +58,11 @@ inline bool is_xdl_supported()
           ck::get_device_name() == "gfx942";
 }

+inline bool is_lds_direct_load_supported()
+{
+    // Check if direct loads from global memory to LDS are supported.
+    return ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940" ||
+           ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942";
+}
+
 } // namespace ck
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -33,10 +33,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
        printf("Warm up 1 time\n");
 #endif
        // warm up
-        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
-        hip_check_error(hipGetLastError());
+        for(int i = 0; i < stream_config.cold_niters_; ++i)
+        {
+            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+            hip_check_error(hipGetLastError());
+        }

-        const int nrepeat = 10;
+        const int nrepeat = stream_config.nrepeat_;
 #if DEBUG_LOG
        printf("Start running %d times...\n", nrepeat);
 #endif

--- a/include/ck/stream_config.hpp
+++ b/include/ck/stream_config.hpp
@@ -11,4 +11,6 @@ struct StreamConfig
    hipStream_t stream_id_ = nullptr;
    bool time_kernel_      = false;
    int log_level_         = 0;
+    int cold_niters_       = 1;
+    int nrepeat_           = 10;
 };