Merge branch 'develop' into amd-develop

e9047ab9 · Jun Liu · bc641634 · a2969aa8 · e9047ab9 · e9047ab9
Commit e9047ab9 authored Nov 29, 2023 by Jun Liu
20 changed files
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
+#include <iostream>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using ADataType = F32;
+using BDataType = F32;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+using DeviceElementwisePermuteInstance =
+    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
+                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
+                                                        PassThrough,          // ElementwiseOp
+                                                        UnaryOp,              // UnaryOp
+                                                        Scale,                // Scalar
+                                                        4,                    // NumDim
+                                                        8,                    // MPerThread
+                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
+                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
+
+template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
+void host_elementwise4D(HostTensorB& B_nhwc,
+                        const HostTensorA& A_nchw,
+                        FunctorA functor_a,
+                        FunctorB functor_b,
+                        float scale)
+{
+    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
+        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
+            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
+                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
+                {
+                    ADataType tmp_val;
+                    auto a_val = A_nchw(n, c, h, w);
+                    functor_b(tmp_val, a_val);
+                    functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
+                }
+}
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    std::vector<std::size_t> nchw = {16, 128, 32, 64};
+    std::vector<std::size_t> nhwc = {16, 32, 64, 128};
+    Tensor<ADataType> a(nchw);
+    Tensor<BDataType> b(nhwc);
+    float scale = 2.f;
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a.mData.data());
+
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+
+    std::array<ck::index_t, 4> ab_lengths;
+    std::array<ck::index_t, 4> a_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
+                                            static_cast<int>(nchw[2] * nchw[3]),
+                                            static_cast<int>(nchw[3]),
+                                            1};
+    std::array<ck::index_t, 4> b_strides = {static_cast<int>(nhwc[1] * nhwc[2] * nhwc[3]),
+                                            1,
+                                            static_cast<int>(nhwc[2] * nhwc[3]),
+                                            static_cast<int>(nhwc[3])};
+
+    ck::ranges::copy(nchw, ab_lengths.begin());
+
+    auto broadcastPermute = DeviceElementwisePermuteInstance{};
+    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
+                                                         {a_strides},
+                                                         {b_strides},
+                                                         input,
+                                                         output,
+                                                         PassThrough{},
+                                                         UnaryOp{},
+                                                         Scale{scale});
+
+    if(!broadcastPermute.IsSupportedArgument(argument.get()))
+    {
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the device instance, exiting!");
+    };
+
+    std::cout << "A (nchw): " << a.mDesc << std::endl;
+    std::cout << "B (nhwc): " << b.mDesc << std::endl;
+
+    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
+    float ave_time =
+        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
+
+    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
+                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        b_device_buf.FromDevice(b.mData.data());
+        Tensor<BDataType> host_b(nhwc);
+        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+
+        pass &=
+            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
+    }
+
+    return pass ? 0 : 1;
+}
--- a/example/53_layernorm_bwd/CMakeLists.txt
+++ b/example/53_layernorm_bwd/CMakeLists.txt
+add_example_executable(example_layernorm2d_bwd_fp16 layernorm2d_bwd_fp16.cpp)
--- a/example/53_layernorm_bwd/layernorm2d_bwd_fp16.cpp
+++ b/example/53_layernorm_bwd/layernorm2d_bwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp"
+
+using DYDataType         = ck::half_t;
+using XDataType          = ck::half_t;
+using GammaDataType      = ck::half_t;
+using MeanInvStdDataType = float;
+using DGammaDataType     = ck::half_t;
+using DBetaDataType      = ck::half_t;
+using DXDataType         = ck::half_t;
+using ComputeDataType    = float;
+
+constexpr int Rank         = 2;
+constexpr int NumReduceDim = 1;
+
+// Layernorm:
+
+// Input shape
+// dy:        [M, N]
+// x:         [M, N]
+// mean:      [M, 1]
+// inv_std:   [M, 1]
+
+// Output shape
+// dgamma: [1, N]
+// dbeta:  [1, N]
+
+// dgamma = reduce_sum(dy * (x - mean) * inv_std, axis=0)
+// dbeta = reduce_sum(dy, axis=0)
+
+// [CAUSION]
+// In DeviceNormalizationBwdGammaBetaImpl, M is invarient dimension, K is reduced dimension
+// Hence, M in this example and DeviceNormalizationBwdGammaBetaImpl is different
+using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizationBwdGammaBetaImpl<
+    DYDataType,
+    XDataType,
+    MeanInvStdDataType,
+    ComputeDataType,
+    DGammaDataType,
+    DBetaDataType,
+    Rank,
+    NumReduceDim,
+    256,   // BlockSize
+    8,     // ClusterInvarient
+    32,    // ClusterReduce
+    8,     // SliceInvarient
+    1,     // SliceReduce
+    false, // IsDYFastestDimReduced
+    8,     // DYSrcVectorSize
+    false, // IsXFastestDimReduced
+    8,     // XSrcVectorSize
+    true,  // IsMeanInvStdFastestDimReduced
+    1,     // MeanInvStdSrcVectorSize
+    1,     // DGammaDstVectorSize
+    1>;    // DBetaDstVectorSize
+
+int main()
+{
+    bool time_kernel = false;
+
+    ck::index_t M = 1024;
+    ck::index_t N = 512;
+
+    Tensor<DYDataType> dy({M, N});
+    Tensor<XDataType> x({M, N});
+    Tensor<GammaDataType> gamma({N});
+    Tensor<MeanInvStdDataType> mean({M});
+    Tensor<MeanInvStdDataType> inv_std({M});
+
+    Tensor<DGammaDataType> dgamma({N});
+    Tensor<DBetaDataType> dbeta({N});
+    Tensor<DXDataType> dx({M, N});
+
+    dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0.0, 1.0});
+    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
+    gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
+    mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});
+    inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});
+
+    DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
+    DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
+    DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
+    DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
+
+    dy_dev.ToDevice(dy.mData.data());
+    x_dev.ToDevice(x.mData.data());
+    mean_dev.ToDevice(mean.mData.data());
+    inv_std_dev.ToDevice(inv_std.mData.data());
+
+    auto gamma_beta_device_instance = GammaBetaDeviceInstance{};
+    auto gamma_beta_argument_ptr =
+        gamma_beta_device_instance.MakeArgumentPointer({M, N}, // inLengths
+                                                       {N, 1}, // dyStrides
+                                                       {N, 1}, // xStrides
+                                                       {1, 0}, // meanStrides
+                                                       {1, 0}, // invStdStrides
+                                                       {N},    // outLengths
+                                                       {1},    // dgammaStrides
+                                                       {1},    // dbetaStrides
+                                                       {0},    // reduceDims
+                                                       dy_dev.GetDeviceBuffer(),
+                                                       x_dev.GetDeviceBuffer(),
+                                                       mean_dev.GetDeviceBuffer(),
+                                                       inv_std_dev.GetDeviceBuffer(),
+                                                       dgamma_dev.GetDeviceBuffer(),
+                                                       dbeta_dev.GetDeviceBuffer());
+
+    if(!gamma_beta_device_instance.IsSupportedArgument(gamma_beta_argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    };
+
+    auto gamma_beta_invoker_ptr = gamma_beta_device_instance.MakeInvokerPointer();
+    gamma_beta_invoker_ptr->Run(gamma_beta_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    bool pass = true;
+    {
+        Tensor<DGammaDataType> host_dgamma({N});
+        Tensor<DBetaDataType> host_dbeta({N});
+        Tensor<DXDataType> host_dx({M, N});
+        using ReferenceInstance =
+            ck::tensor_operation::host::ReferenceLayernormBwd<DYDataType,
+                                                              XDataType,
+                                                              GammaDataType,
+                                                              MeanInvStdDataType,
+                                                              DGammaDataType,
+                                                              DBetaDataType,
+                                                              DXDataType,
+                                                              ComputeDataType>;
+
+        ReferenceInstance ref;
+        auto ref_argument =
+            ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, {M, N});
+        auto ref_invoker = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+
+        dgamma_dev.FromDevice(dgamma.mData.data());
+        dbeta_dev.FromDevice(dbeta.mData.data());
+
+        pass &= ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
+    }
+
+    return (pass ? 0 : 1);
+}
--- a/example/54_groupnorm_bwd/CMakeLists.txt
+++ b/example/54_groupnorm_bwd/CMakeLists.txt
+add_example_executable(example_groupnorm_bwd_fp16 groupnorm_bwd_fp16.cpp)
--- a/example/54_groupnorm_bwd/groupnorm_bwd_fp16.cpp
+++ b/example/54_groupnorm_bwd/groupnorm_bwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp"
+
+using DYDataType         = ck::half_t;
+using XDataType          = ck::half_t;
+using GammaDataType      = ck::half_t;
+using MeanInvStdDataType = float;
+using DGammaDataType     = ck::half_t;
+using DBetaDataType      = ck::half_t;
+using DXDataType         = ck::half_t;
+using ComputeDataType    = float;
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+// Grouprnorm
+// kernel:                  M    , K
+// dy:     N, H, W, G, C -> G * C, N * H * W
+// x:      N, H, W, G, C -> G * C, N * H * W
+// mean:   N, 1, 1, G, 1 -> G * 1, N * 1 * 1
+// rstd:   N, 1, 1, G, 1 -> G * 1, N * 1 * 1
+
+// dgamma: 1, 1, 1, G, C -> G * C
+// dbeta:  1, 1, 1, G, C -> G * C
+
+// reduced axis: 0, 1, 2
+
+using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizationBwdGammaBetaImpl<
+    DYDataType,
+    XDataType,
+    MeanInvStdDataType,
+    ComputeDataType,
+    DGammaDataType,
+    DBetaDataType,
+    Rank,
+    NumReduceDim,
+    256,   // BlockSize
+    8,     // ClusterInvarient
+    32,    // ClusterReduce
+    8,     // SliceInvarient
+    1,     // SliceReduce
+    false, // IsDYFastestDimReduced
+    8,     // DYSrcVectorSize
+    false, // IsXFastestDimReduced
+    8,     // XSrcVectorSize
+    false, // IsMeanInvStdFastestDimReduced
+    1,     // MeanInvStdSrcVectorSize
+    1,     // DGammaDstVectorSize
+    1>;    // DBetaDstVectorSize
+
+int main()
+{
+    bool time_kernel = false;
+
+    ck::index_t N = 16;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 32;
+    ck::index_t C = 64;
+
+    Tensor<DYDataType> dy({N, H, W, G, C});
+    Tensor<XDataType> x({N, H, W, G, C});
+    Tensor<GammaDataType> gamma({G, C});
+    Tensor<MeanInvStdDataType> mean({N, G});
+    Tensor<MeanInvStdDataType> inv_std({N, G});
+
+    Tensor<DGammaDataType> dgamma({G, C});
+    Tensor<DBetaDataType> dbeta({G, C});
+    Tensor<DXDataType> dx({N, H, W, G, C});
+
+    dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0.0, 1.0});
+    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
+    gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
+    mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});
+    inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});
+
+    DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
+    DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
+    DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
+    DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
+
+    dy_dev.ToDevice(dy.mData.data());
+    x_dev.ToDevice(x.mData.data());
+    mean_dev.ToDevice(mean.mData.data());
+    inv_std_dev.ToDevice(inv_std.mData.data());
+
+    std::vector<ck::index_t> dyStrides{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> xStrides{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> meanStrides   = {G, 0, 0, 1, 0};
+    std::vector<ck::index_t> invStdStrides = {G, 0, 0, 1, 0};
+
+    auto gamma_beta_device_instance = GammaBetaDeviceInstance{};
+    auto gamma_beta_argument_ptr =
+        gamma_beta_device_instance.MakeArgumentPointer({N, H, W, G, C}, // inLengths
+                                                       dyStrides,       // dyStrides
+                                                       xStrides,        // xStrides
+                                                       meanStrides,     // meanStrides
+                                                       invStdStrides,   // invStdStrides
+                                                       {G, C},          // outLengths
+                                                       {C, 1},          // dgammaStrides
+                                                       {C, 1},          // dbetaStrides
+                                                       {0, 1, 2},       // reduceDims
+                                                       dy_dev.GetDeviceBuffer(),
+                                                       x_dev.GetDeviceBuffer(),
+                                                       mean_dev.GetDeviceBuffer(),
+                                                       inv_std_dev.GetDeviceBuffer(),
+                                                       dgamma_dev.GetDeviceBuffer(),
+                                                       dbeta_dev.GetDeviceBuffer());
+
+    if(!gamma_beta_device_instance.IsSupportedArgument(gamma_beta_argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    };
+
+    auto gamma_beta_invoker_ptr = gamma_beta_device_instance.MakeInvokerPointer();
+    gamma_beta_invoker_ptr->Run(gamma_beta_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    bool pass = true;
+    {
+        Tensor<DGammaDataType> host_dgamma({G, C});
+        Tensor<DBetaDataType> host_dbeta({G, C});
+        Tensor<DXDataType> host_dx({N, H, W, G, C});
+        using ReferenceInstance =
+            ck::tensor_operation::host::ReferenceGroupnormBwd<DYDataType,
+                                                              XDataType,
+                                                              GammaDataType,
+                                                              MeanInvStdDataType,
+                                                              DGammaDataType,
+                                                              DBetaDataType,
+                                                              DXDataType,
+                                                              ComputeDataType>;
+
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(
+            dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, {N, H, W, G, C});
+        auto ref_invoker = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+
+        dgamma_dev.FromDevice(dgamma.mData.data());
+        dbeta_dev.FromDevice(dbeta.mData.data());
+
+        pass &= ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
+    }
+
+    return (pass ? 0 : 1);
+}
--- a/example/62_conv_fwd_activ/CMakeLists.txt
+++ b/example/62_conv_fwd_activ/CMakeLists.txt
@@ -30,6 +30,15 @@ foreach(gpu IN LISTS GPU_TARGETS)
      # Elu
      add_example_executable(example_convnd_fwd_xdl_elu_fp16 convnd_fwd_xdl_elu_fp16.cpp)
      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_elu_fp16)
+      # ScaleAdd on A and B
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp16 multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_fp16)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp32 multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_fp32)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_bf16 multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_bf16)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_int8 multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_int8)
      # ScaleAdd ScaleAdd Relu
      add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)

--- a/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp
@@ -11,7 +11,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -47,7 +47,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio

 template <typename OutElementOp>
 using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,

--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -47,7 +47,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio

 template <typename OutElementOp>
 using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
@@ -226,13 +226,16 @@ bool run_grouped_conv_fwd(bool do_verification,

    if(do_verification)
    {
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+        auto ref_conv =
+            ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                         InDataType,
                                                         WeiDataType,
                                                         OutDataType,
                                                         InElementOp,
                                                         WeiElementOp,
                                                         OutElementOp,
+                                                         0, /*Num A Elementwise Tensors*/
+                                                         0, /*Num B Elementwise Tensors*/
                                                         NumDs>();

        auto ref_invoker  = ref_conv.MakeInvoker();
@@ -246,6 +249,8 @@ bool run_grouped_conv_fwd(bool do_verification,
                                                  in_element_op,
                                                  wei_element_op,
                                                  out_element_op,
+                                                  {},
+                                                  {},
                                                  d_tensors);

        ref_invoker.Run(ref_argument);

--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+
+using DataType    = ck::bhalf_t;
+using AccDataType = float;
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>;
+using BDataTypes  = ck::Tuple<DataType, DataType>;
+
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+
+#include "../run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+
+using DataType    = ck::half_t;
+using AccDataType = float;
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>;
+using BDataTypes  = ck::Tuple<DataType, DataType>;
+
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+
+#include "../run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+
+using DataType    = float;
+using AccDataType = float;
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>;
+using BDataTypes  = ck::Tuple<DataType, DataType>;
+
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+
+#include "../run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_multi_ab_common.hpp"
+
+using DataType    = int8_t;
+using AccDataType = int32_t;
+using InDataType  = DataType;
+using WeiDataType = DataType;
+using OutDataType = DataType;
+using ADataTypes  = ck::Tuple<DataType, DataType>;
+using BDataTypes  = ck::Tuple<DataType, DataType>;
+
+using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
+using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                                  AccDataType,
+                                                                                  ADataTypes,
+                                                                                  BDataTypes,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp>;
+
+#include "../run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
+++ b/example/62_conv_fwd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+constexpr ck::index_t NDimSpatial = 3;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <typename DataType,
+          typename AccDataType,
+          typename InDataTypes,
+          typename WeiDataTypes,
+          typename InElementOp,
+          typename WeiElementOp>
+using DeviceGroupedConvNDMultiABFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataTypes,
+        WeiDataTypes,
+        AccDataType,
+        DataType,
+        ck::Tuple<>,
+        DataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+namespace {
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          int init_method,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    constexpr ck::index_t NumAs = 2;
+    constexpr ck::index_t NumBs = 2;
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<InDataType> in_bias(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<WeiDataType> wei_bias(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        in_bias.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        wei_bias.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        in_bias.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
+        wei_bias.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem in_bias_device_buf(sizeof(InDataType) * in_bias.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem wei_bias_device_buf(sizeof(WeiDataType) * wei_bias.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    in_bias_device_buf.ToDevice(in_bias.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    wei_bias_device_buf.ToDevice(wei_bias.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    std::array<const void*, NumAs> as{in_device_buf.GetDeviceBuffer(),
+                                      in_bias_device_buf.GetDeviceBuffer()};
+    std::array<const void*, NumBs> bs{wei_device_buf.GetDeviceBuffer(),
+                                      wei_bias_device_buf.GetDeviceBuffer()};
+    std::array<const void*, 0> ds{};
+
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(as,
+                                      bs,
+                                      ds,
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      {},
+                                      {},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = conv_param.GetFlops() +
+                       2 * conv_param.GetOutputByte<InDataType>() / sizeof(InDataType) +
+                       2 * conv_param.GetOutputByte<WeiDataType>() / sizeof(WeiDataType);
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
+                            conv_param.GetInputByte<InDataType>() +
+                            conv_param.GetWeightByte<WeiDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        const std::array<Tensor<InDataType>, NumAs - 1> elementwise_a_tensors  = {in_bias};
+        const std::array<Tensor<WeiDataType>, NumBs - 1> elementwise_b_tensors = {wei_bias};
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp,
+                                                                     NumAs - 1,
+                                                                     NumBs - 1>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op,
+                                                  elementwise_a_tensors,
+                                                  elementwise_b_tensors);
+
+        ref_invoker.Run(ref_argument);
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
+    }
+
+    return true;
+}
+
+} // namespace
--- a/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc
+++ b/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -134,6 +134,9 @@
 // inner product using V_DOT with DPP8 modifiers
 #define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1

+// set stochastic rounding as default for f8 conversions
+#define CK_USE_SR_F8_CONVERSION 1
+
 // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
 #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1


--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -58,4 +58,11 @@ inline bool is_xdl_supported()
           ck::get_device_name() == "gfx942";
 }

+inline bool is_lds_direct_load_supported()
+{
+    // Check if direct loads from global memory to LDS are supported.
+    return ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940" ||
+           ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942";
+}
+
 } // namespace ck
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -33,10 +33,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
        printf("Warm up 1 time\n");
 #endif
        // warm up
+        for(int i = 0; i < stream_config.cold_niters_; ++i)
+        {
            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
            hip_check_error(hipGetLastError());
+        }

-        const int nrepeat = 10;
+        const int nrepeat = stream_config.nrepeat_;
 #if DEBUG_LOG
        printf("Start running %d times...\n", nrepeat);
 #endif

--- a/include/ck/stream_config.hpp
+++ b/include/ck/stream_config.hpp
@@ -11,4 +11,6 @@ struct StreamConfig
    hipStream_t stream_id_ = nullptr;
    bool time_kernel_      = false;
    int log_level_         = 0;
+    int cold_niters_       = 1;
+    int nrepeat_           = 10;
 };
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+
+/**
+ * Transfer that uses direct load instructions to copy data from global to LDS memory.
+ *
+ * Traditional loads first copy data from global to registers, and then from registers to LDS.
+ * Direct loads do not need an intermediate step, data is copied directly from global to LDS,
+ * without the use of additional registers.
+ *
+ * However, the instruction has limitations:
+ * - each thread must copy exactly a single DWORD - 4 bytes;
+ * - threads within a single wavefront must write consecutive DWORDS into LDS,
+ *   (data in global do not need to be contiguous, each thread might have its own offset).
+ *
+ * To make sure that all the transfers finished, the `waitcnt` instruction must be used with
+ * `vmcnt` instead of `lgkmcnt`.
+ *
+ * Limitations of the transfer class:
+ * - `SrcData` must be the same as `DstData` - no possibility to convert the data type in flight;
+ * - `DstVectorDim` must be the last dimension;
+ * - `SrcVectorDim` must be the last dimension if `ScalarPerVector` is greater than 1;
+ * - `ScalarPerVector` times the number of bytes of `DstData` must be equal to a single DWORD = 4B
+ *   (for examlpe if `DstData` is fp32, then `ScalarPerVector` must be 1; if `DstData` is fp16,
+ *   `ScalarPerVector` must be 2);
+ * - if `ScalarPerVector` is greater than 1, the contiguous dimension in src and dst must be
+ *   the same dimension;
+ * - threads in a wavefront must write contiguous data to LDS (when wavefront size is 64,
+ *   they must write 64 contiguous DWORDs) - `ThreadClusterLengths` must be prepared in such a way
+ *   to guarantee that.
+ */
+template <typename ThreadGroup,
+          typename BlockSliceLengths,
+          typename ThreadClusterLengths,
+          typename SrcData,
+          typename DstData,
+          typename SrcDesc,
+          typename DstDesc,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t ScalarPerVector>
+struct ThreadGroupTensorSliceTransfer_DirectLoad
+{
+    static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
+    using Index                   = MultiIndex<nDim>;
+
+    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
+    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
+
+    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
+    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
+
+    static constexpr auto I0 = Number<0>{};
+
+    static constexpr auto block_slice_lengths    = BlockSliceLengths{};
+    static constexpr auto thread_cluster_lengths = ThreadClusterLengths{};
+
+    static constexpr auto thread_single_load_size = generate_sequence(
+        detail::lambda_scalar_per_access<DstVectorDim, ScalarPerVector>{}, Number<nDim>{});
+    // After a load, each thread moves by `thread_steps` instead of loading the next elements.
+    // It makes the whole wavefront load contiguous memory, what is required for direct loads.
+    static constexpr auto thread_steps         = thread_cluster_lengths * thread_single_load_size;
+    static constexpr auto thread_slice_lengths = block_slice_lengths / thread_steps;
+
+    static __device__ constexpr bool AreThreadClusterLengthsValid()
+    {
+        // Make sure that ThreadClusterLengths are set in a way that allows for contiguous writes to
+        // LDS by the threads from a single wavefront.
+        // Examples (assuming 64 threads in a wavefront, 128 in a thread block):
+        // 1. BlockSliceLengths = [K0PerBlock, MPerBlock, K1PerBlock] = [4, 128, 8],
+        //    data type = fp32 -> ScalarPerVector = 1
+        //    INVALID: ThreadClusterLengths = [4, 4, 8] since in the first iteration, threads 0-31
+        //             write [0, 0, 0] - [0, 3, 7] and thread 32 writes [1, 0, 0] instead of
+        //             [0, 4, 0].
+        //    VALID: ThreadClusterLengths = [2, 8, 8] or [1, 16, 8] since in the first iteration,
+        //           threads 0-63 write [0, 0, 0] - [0, 7, 7] -> 64 consecutive elements (DWORDs).
+        // 2. BlockSliceLengths = [K0PerBlock, MPerBlock, K1PerBlock] = [4, 128, 8],
+        //    data type = fp16 -> ScalarPerVector = 2
+        //    NOTE: ThreadClusterLengths must take into account that each thread writes two
+        //          elements (single DWORD) along the contiguous dimension.
+        //    INVALID: ThreadClusterLengths = [4, 4, 8] since each 8 threads would try to write
+        //             8 * 2 elements of K1PerBlock and there are only 8;
+        //             ThreadClusterLengths = [4, 8, 4] since in the first iteration, threads 0-31
+        //             write [0, 0, 0] - [0, 7, 7] (7 since each writes 2 elements) and thread 32
+        //             writes [1, 0, 0] instead of [0, 8, 0].
+        //    VALID: ThreadClusterLengths = [4, 16, 4] or [2, 32, 4] or [1, 64, 4] since in the
+        //           first iteration, threads 0-63 write [0, 0, 0] -  [0, 15, 7] -> 128 consecutive
+        //           elements = 64 consecutive DWORDs.
+        int num_contiguous_dwords = 1;
+        bool is_contiguous        = true;
+        static_for<0, nDim, 1>{}([&](auto i) {
+            if(is_contiguous)
+            {
+                num_contiguous_dwords *= thread_cluster_lengths[nDim - i - 1];
+            }
+            if(thread_slice_lengths[nDim - i - 1] > 1)
+            {
+                is_contiguous = false;
+            }
+        });
+        constexpr index_t wavefront_size = get_warp_size();
+        const bool wave_contiguous       = num_contiguous_dwords % wavefront_size == 0;
+
+        bool thread_slice_lengths_correct = true;
+        static_for<0, nDim, 1>{}([&](auto i) {
+            if(thread_slice_lengths[i] <= 0)
+            {
+                thread_slice_lengths_correct = false;
+            }
+        });
+
+        return wave_contiguous && thread_slice_lengths_correct;
+    }
+
+    __device__ constexpr ThreadGroupTensorSliceTransfer_DirectLoad(
+        const SrcDesc& src_desc,
+        const Index& src_block_slice_origin,
+        const DstDesc& dst_desc,
+        const Index& dst_block_slice_origin)
+
+    {
+        static_assert(ck::is_same_v<SrcData, DstData>,
+                      "Direct load transfer does not support datatypes conversion. Source and "
+                      "destination data types must be the same.");
+
+        static_assert(
+            DstVectorDim == nDim - 1,
+            "Direct load transfer requires the destination vector dimension to be the last one.");
+
+        static_assert(ScalarPerVector == 1 || SrcVectorDim == DstVectorDim,
+                      "When loading more than one element per thread at once, the contiguous "
+                      "dimension must be the same between source and destination.");
+
+        constexpr auto dword_bytes           = 4;
+        constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
+        static_assert(bytes_per_thread_load == dword_bytes,
+                      "Direct load transfer requires each thread to load exactly a single "
+                      "DWORD of data.");
+
+        static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
+                          nDim == ThreadClusterLengths::Size(),
+                      "Inconsistent number of dimensions across lengths and descriptors.");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "The number of threads cannot be less than the number of elements in "
+                      "thread cluster lengths.");
+
+        static_assert(
+            AreThreadClusterLengthsValid(),
+            "Thread cluster lengths are incorrect. They must be set in a way that allows a single "
+            "wavefront to write contiguous DWORDs into LDS memory. ");
+
+        const auto thread_cluster_idx =
+            thread_cluster_desc_.CalculateBottomIndex(make_multi_index(ThreadGroup::GetThreadId()));
+
+        const auto thread_data_idx_begin = thread_cluster_idx * thread_single_load_size;
+
+        SetSrcSliceOrigin(src_desc, src_block_slice_origin + thread_data_idx_begin);
+        SetDstSliceOrigin(dst_desc, dst_block_slice_origin + thread_data_idx_begin);
+    }
+
+    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
+    {
+        src_coord_        = make_tensor_coordinate(src_desc, src_slice_origin_idx);
+        src_slice_origin_ = src_slice_origin_idx;
+    }
+
+    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
+    {
+        dst_coord_        = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
+        dst_slice_origin_ = dst_slice_origin_idx;
+    }
+
+    __device__ void ResetDstSliceWindow(const DstDesc& dst_desc)
+    {
+        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_);
+    }
+
+    template <typename SrcBuffer, typename DstBuffer>
+    __device__ void Run(const SrcDesc& src_desc,
+                        const SrcBuffer& src_buf,
+                        const DstDesc& dst_desc,
+                        DstBuffer& dst_buf)
+    {
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global,
+                      "Source data must come from a global memory buffer.");
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
+                      "Destination data must be stored in an LDS memory buffer.");
+
+        static_assert(
+            ck::is_same_v<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>,
+            "SrcBuffer and SrcData data types must be consistent.");
+        static_assert(
+            ck::is_same_v<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>,
+            "DstBuffer and DstData data types must be consistent.");
+
+        constexpr auto dst_access_lengths = thread_slice_lengths;
+
+        const auto dst_forward_steps  = generate_steps(dst_desc, 1);
+        const auto dst_backward_steps = generate_steps(dst_desc, -1);
+        const auto src_forward_steps  = generate_steps(src_desc, 1);
+        const auto src_backward_steps = generate_steps(src_desc, -1);
+
+        // Loop over the destination block and copy data.
+        static_ford<decltype(dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
+            const auto src_offset = src_coord_.GetOffset();
+            const auto dst_offset = dst_coord_.GetOffset();
+
+            // Check if src data is not in the logic padding area.
+            const bool is_src_valid =
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
+
+            src_buf.template DirectCopyToLds<remove_cvref_t<decltype(dst_buf)>, ScalarPerVector>(
+                dst_buf, src_offset, dst_offset, is_src_valid);
+
+            constexpr auto move_on_dim = [&]() constexpr
+            {
+                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    move_on_dim_(i) = ordered_dst_access_idx[i] < dst_access_lengths[i] - 1;
+
+                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                        move_on_dim_(i) &= ordered_dst_access_idx[j] == dst_access_lengths[j] - 1;
+                    });
+                });
+
+                return move_on_dim_;
+            }
+            ();
+
+            // Decide whether to move forward or backward.
+            constexpr auto forward_sweep = [&]() {
+                StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+                forward_sweep_(I0) = true;
+
+                static_for<1, nDim, 1>{}([&](auto i) {
+                    index_t tmp = ordered_dst_access_idx[I0];
+
+                    static_for<1, i, 1>{}([&](auto j) {
+                        tmp = tmp * dst_access_lengths[j] + ordered_dst_access_idx[j];
+                    });
+
+                    forward_sweep_(i) = tmp % 2 == 0;
+                });
+
+                return forward_sweep_;
+            }();
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(move_on_dim[i])
+                {
+                    if constexpr(forward_sweep[i])
+                    {
+                        move_tensor_coordinate(dst_desc, dst_coord_, dst_forward_steps[i]);
+                        move_tensor_coordinate(src_desc, src_coord_, src_forward_steps[i]);
+                    }
+                    else
+                    {
+                        move_tensor_coordinate(dst_desc, dst_coord_, dst_backward_steps[i]);
+                        move_tensor_coordinate(src_desc, src_coord_, src_backward_steps[i]);
+                    }
+                }
+            });
+        });
+
+        // Reset the destination slice since the entire buffer has been already filled.
+        ResetDstSliceWindow(dst_desc);
+    }
+
+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
+    {
+        src_slice_origin_ = src_slice_origin_ + step;
+        src_coord_        = make_tensor_coordinate(src_desc, src_slice_origin_);
+    }
+
+    template <typename DescType>
+    __device__ auto generate_steps(const DescType& desc, int sign)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                Index step_idx;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    step_idx(j) = (i.value == j.value) ? sign * thread_steps[i] : 0;
+                });
+
+                return make_tensor_coordinate_step(desc, step_idx);
+            },
+            Number<nDim>{});
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ = make_cluster_descriptor(ThreadClusterLengths{});
+
+    SrcCoord src_coord_;
+    DstCoord dst_coord_;
+    Index src_slice_origin_;
+    Index dst_slice_origin_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <memory>
+#include <array>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename InDataTypeTuple,
+          typename OutDataTypeTuple,
+          typename ElementwiseOperation,
+          typename UnaryOperation,
+          typename Scale,
+          index_t NumDim>
+struct DeviceElementwise : public BaseOperator
+{
+    static constexpr int NumInput  = InDataTypeTuple::Size();
+    static constexpr int NumOutput = OutDataTypeTuple::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
+                        const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
+                        const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
+                        const std::array<const void*, NumInput> in_dev_buffers,
+                        const std::array<void*, NumOutput> out_dev_buffers,
+                        ElementwiseOperation elementwise_op,
+                        UnaryOperation unary_op,
+                        Scale scale_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+}; // namespace device
+
+template <typename InDataTypeTuple,
+          typename OutDataTypeTuple,
+          typename ElementwiseOperation,
+          typename UnaryOperation,
+          typename Scale,
+          index_t NumDim>
+using DeviceElementwisePtr = std::unique_ptr<DeviceElementwise<InDataTypeTuple,
+                                                               OutDataTypeTuple,
+                                                               ElementwiseOperation,
+                                                               UnaryOperation,
+                                                               Scale,
+                                                               NumDim>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck