Merge branch 'develop' into lwpck-987

524143e4 · Rostyslav Geyyer · GitHub · d3cd6f41 · 68f2b5e7 · 524143e4
Unverified Commit 524143e4 authored Nov 09, 2023 by Rostyslav Geyyer Committed by GitHub Nov 09, 2023
20 changed files
--- a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
@@ -37,29 +37,29 @@ struct YElementOp
 };
 using DeviceInstance =
-    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+    ck::tensor_operation::device::DeviceNormalizationFwdImpl<XDataType,
-                                                          GammaDataType,
+                                                             GammaDataType,
-                                                          BetaDataType,
+                                                             BetaDataType,
-                                                          ComputeDataType,
+                                                             ComputeDataType,
-                                                          YDataType,
+                                                             YDataType,
-                                                          SaveMeanInvStdDataType,
+                                                             SaveMeanInvStdDataType,
-                                                          YElementOp,
+                                                             YElementOp,
-                                                          Rank,
+                                                             Rank,
-                                                          NumReduceDim,
+                                                             NumReduceDim,
-                                                          1024, // BlockSize
+                                                             1024, // BlockSize
-                                                          1,    // ClusterM
+                                                             1,    // ClusterM
-                                                          1024, // ClusterK
+                                                             1024, // ClusterK
-                                                          1,    // SliceM
+                                                             1,    // SliceM
-                                                          32,   // SliceK
+                                                             32,   // SliceK
-                                                          1,    // SrcVecDim (0=M, 1=K)
+                                                             1,    // SrcVecDim (0=M, 1=K)
-                                                          2,    // SrcScalarPerVector
+                                                             2,    // SrcScalarPerVector
-                                                          1,    // GammaVecDim (0=M, 1=K)
+                                                             1,    // GammaVecDim (0=M, 1=K)
-                                                          2,    // GammaScalarPerVector
+                                                             2,    // GammaScalarPerVector
-                                                          1,    // BetaVecDim (0=M, 1=K)
+                                                             1,    // BetaVecDim (0=M, 1=K)
-                                                          2,    // BetaScalarPerVector
+                                                             2,    // BetaScalarPerVector
-                                                          2,    // YScalarPerVector
+                                                             2,    // YScalarPerVector
-                                                          1>;   // SaveMeanInvStdScalarPerVector
+                                                             1>;   // SaveMeanInvStdScalarPerVector
-#include "run_groupnorm_example.inc"
+#include "run_groupnorm_fwd_example.inc"
-int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); }
+// Forward the example's status so the process exit code reflects what the run reported.
+int main(int argc, char* argv[]) { return run_groupnorm_fwd_example(argc, argv); }
--- a/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp
+++ b/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using ComputeDataType        = float;
+using YElementOp             = ck::tensor_operation::element_wise::Swish;
+#define SAVE_MEAN_INV_STD
+using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl<
+    XDataType,
+    GammaDataType,
+    BetaDataType,
+    ComputeDataType,
+    YDataType,
+    SaveMeanInvStdDataType,
+    YElementOp,
+    Rank,
+    NumReduceDim,
+    256, // BlockSize
+    1,   // ClusterM
+    256, // ClusterK
+    1,   // SliceM
+    16,  // SliceK
+    1,   // SrcVecDim (0=M, 1=K)
+    2,   // SrcScalarPerVector
+    1,   // GammaVecDim (0=M, 1=K)
+    2,   // GammaScalarPerVector
+    1,   // BetaVecDim (0=M, 1=K)
+    2,   // BetaScalarPerVector
+    2,   // YScalarPerVector
+    1>;  // SaveMeanInvStdScalarPerVector
+#include "run_groupnorm_fwd_example.inc"
+// Forward the example's status so the process exit code reflects what the run reported.
+int main(int argc, char* argv[]) { return run_groupnorm_fwd_example(argc, argv); }
--- a/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp
+++ b/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using ComputeDataType        = float;
+using YElementOp             = ck::tensor_operation::element_wise::Swish;
+#define SAVE_MEAN_INV_STD
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationFwdImpl<XDataType,
+                                                             GammaDataType,
+                                                             BetaDataType,
+                                                             ComputeDataType,
+                                                             YDataType,
+                                                             SaveMeanInvStdDataType,
+                                                             YElementOp,
+                                                             Rank,
+                                                             NumReduceDim,
+                                                             1024, // BlockSize
+                                                             1,    // ClusterM
+                                                             1024, // ClusterK
+                                                             1,    // SliceM
+                                                             32,   // SliceK
+                                                             1,    // SrcVecDim (0=M, 1=K)
+                                                             2,    // SrcScalarPerVector
+                                                             1,    // GammaVecDim (0=M, 1=K)
+                                                             2,    // GammaScalarPerVector
+                                                             1,    // BetaVecDim (0=M, 1=K)
+                                                             2,    // BetaScalarPerVector
+                                                             2,    // YScalarPerVector
+                                                             1>;   // SaveMeanInvStdScalarPerVector
+#include "run_groupnorm_fwd_example.inc"
+// Forward the example's status so the process exit code reflects what the run reported.
+int main(int argc, char* argv[]) { return run_groupnorm_fwd_example(argc, argv); }
--- a/example/42_groupnorm/run_groupnorm_example.inc
+++ b/example/42_groupnorm/run_groupnorm_example.inc
@@ -3,7 +3,7 @@
 #pragma once
-int run_groupnorm_example(int argc, char* argv[])
+int run_groupnorm_fwd_example(int argc, char* argv[])
 {
    ck::index_t N = 32;
    ck::index_t H = 16;
@@ -65,9 +65,9 @@ int run_groupnorm_example(int argc, char* argv[])
        {0, 0, 0, C, 1},
        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
-                                 save_mean.mDesc.GetStrides().end()},
+                                    save_mean.mDesc.GetStrides().end()},
        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
-                                 save_mean.mDesc.GetStrides().end()},
+                                    save_mean.mDesc.GetStrides().end()},
        {1, 2, 4}, // reduction dimension: [H, W, C]
        1e-6,
        x_dev.GetDeviceBuffer(),

--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
 add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
 add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
+add_example_executable(example_elementwise_permute elementwise_permute.cpp)
+add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
--- a/example/44_elementwise_permute/elementwise_permute.cpp
+++ b/example/44_elementwise_permute/elementwise_permute.cpp
+#include <iostream>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using ADataType = F16;
+using BDataType = F16;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceElementwisePermuteInstance =
+    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
+                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
+                                                        PassThrough,          // ElementwiseOp
+                                                        5,                    // NumDim
+                                                        8,                    // MPerThread
+                                                        ck::Sequence<1>,  // InScalarPerVectorSeq
+                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
+// Host-side reference for the permute: visits every (n, c, d, h, w) index of A_ncdhw and
+// calls functor(B_ndhwc(n, d, h, w, c), value) with a copy of the source element.
+// The loop bounds are taken from A_ncdhw's descriptor lengths [0..4].
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
+{
+    const auto& extents = A_ncdhw.mDesc.GetLengths();
+    const auto len_n    = extents[0];
+    const auto len_c    = extents[1];
+    const auto len_d    = extents[2];
+    const auto len_h    = extents[3];
+    const auto len_w    = extents[4];
+    for(std::size_t n = 0; n < len_n; ++n)
+    {
+        for(std::size_t c = 0; c < len_c; ++c)
+        {
+            for(std::size_t d = 0; d < len_d; ++d)
+            {
+                for(std::size_t h = 0; h < len_h; ++h)
+                {
+                    for(std::size_t w = 0; w < len_w; ++w)
+                    {
+                        auto src = A_ncdhw(n, c, d, h, w);
+                        functor(B_ndhwc(n, d, h, w, c), src);
+                    }
+                }
+            }
+        }
+    }
+}
+// Example driver: runs DeviceElementwisePermuteInstance on a 5-D fp16 tensor (extents given
+// as N, C, D, H, W) writing an N, D, H, W, C output, prints timing figures, and compares the
+// device output with host_elementwise4D. Returns 0 when verification passes, 1 otherwise.
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = true;
+    std::vector<std::size_t> ncdhw = {16, 8, 8, 8, 8};
+    // Output extents are the input extents reordered (N, C, D, H, W) -> (N, D, H, W, C);
+    // deriving them keeps both shapes in sync if the sizes above are edited.
+    std::vector<std::size_t> ndhwc = {ncdhw[0], ncdhw[2], ncdhw[3], ncdhw[4], ncdhw[1]};
+    Tensor<ADataType> a(ncdhw);
+    Tensor<BDataType> b(ndhwc);
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a.mData.data());
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+    std::array<ck::index_t, 5> ab_lengths;
+    // Both stride arrays are listed in the output's dimension order (N, D, H, W, C).
+    // For A the entries are: N stride, D stride (H*W), H stride (W), W stride (1), C stride.
+    std::array<ck::index_t, 5> a_strides = {
+        static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
+        static_cast<int>(ncdhw[3] * ncdhw[4]),
+        static_cast<int>(ncdhw[4]),
+        1,
+        static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4])};
+    // For B the entries are the row-major strides of an (N, D, H, W, C) tensor.
+    std::array<ck::index_t, 5> b_strides = {
+        static_cast<int>(ndhwc[1] * ndhwc[2] * ndhwc[3] * ndhwc[4]),
+        static_cast<int>(ndhwc[2] * ndhwc[3] * ndhwc[4]),
+        static_cast<int>(ndhwc[3] * ndhwc[4]),
+        static_cast<int>(ndhwc[4]),
+        1};
+    // The lengths must follow the same (N, D, H, W, C) order as the strides above; using
+    // `ncdhw` here would only be correct while C, D, H and W are all equal.
+    ck::ranges::copy(ndhwc, ab_lengths.begin());
+    auto broadcastPermute = DeviceElementwisePermuteInstance{};
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
+    if(!broadcastPermute.IsSupportedArgument(argument.get()))
+    {
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the device instance, exiting!");
+    }
+    std::cout << "A (ncdhw): " << a.mDesc << std::endl;
+    std::cout << "B (ndhwc): " << b.mDesc << std::endl;
+    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
+    float ave_time =
+        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+    // Element count is shared by the flop estimate and the bytes-moved estimate.
+    std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
+    std::size_t num_btype =
+        sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
+        sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
+    // ave_time is printed as milliseconds below, hence the 1e9 / 1e6 scale factors.
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        // Copy the device result back and compare it with the host reference permute.
+        b_device_buf.FromDevice(b.mData.data());
+        Tensor<BDataType> host_b(ndhwc);
+        host_elementwise4D(host_b, a, PassThrough{});
+        pass &=
+            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
+    }
+    return pass ? 0 : 1;
+}
--- a/example/44_elementwise_permute/elementwise_permute_3d.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_3d.cpp
+#include <iostream>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using ADataType = F16;
+using BDataType = F16;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceElementwisePermuteInstance =
+    ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
+                                                          ck::Tuple<BDataType>, // OutDataTypeTuple
+                                                          PassThrough,          // ElementwiseOp
+                                                          2,                    // NumDim_m, {N, C}
+                                                          2,                    // NumDim_n, {H, W}
+                                                          1,                    // NumDim_k, {D}
+                                                          8,                    // MPerThread
+                                                          8,                    // NPerThread
+                                                          8,                    // KPerThread
+                                                          ck::Sequence<8>,  // InScalarPerVectorSeq
+                                                          ck::Sequence<4>>; // OutScalarPerVectorSeq
+// Host-side reference for the permute: visits every (n, c, d, h, w) index of A_ncdhw and
+// calls functor(B_ndhwc(n, d, h, w, c), value) with a copy of the source element.
+// The loop bounds are taken from A_ncdhw's descriptor lengths [0..4].
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
+{
+    const auto& extents = A_ncdhw.mDesc.GetLengths();
+    const auto len_n    = extents[0];
+    const auto len_c    = extents[1];
+    const auto len_d    = extents[2];
+    const auto len_h    = extents[3];
+    const auto len_w    = extents[4];
+    for(std::size_t n = 0; n < len_n; ++n)
+    {
+        for(std::size_t c = 0; c < len_c; ++c)
+        {
+            for(std::size_t d = 0; d < len_d; ++d)
+            {
+                for(std::size_t h = 0; h < len_h; ++h)
+                {
+                    for(std::size_t w = 0; w < len_w; ++w)
+                    {
+                        auto src = A_ncdhw(n, c, d, h, w);
+                        functor(B_ndhwc(n, d, h, w, c), src);
+                    }
+                }
+            }
+        }
+    }
+}
+// Example driver: runs DeviceElementwisePermuteInstance (3d variant) on an fp16 tensor with
+// extents (N, C, D, H, W), writing an (N, D, H, W, C) output, prints timing figures, and
+// compares the device output with host_elementwise4D. Returns 0 on success, 1 on mismatch.
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = true;
+    const int N = 4;
+    const int C = 16;
+    const int H = 32;
+    const int W = 5;
+    const int D = 16;
+    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
+    std::vector<std::size_t> ndhwc = {N, D, H, W, C};
+    Tensor<ADataType> a(ncdhw);
+    Tensor<BDataType> b(ndhwc);
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a.mData.data());
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+    // Lengths and both stride arrays are listed in the same dimension order: (N, D, H, W, C).
+    // Labelling the lengths (N, C, H, W, D) and the B strides with C/D swapped is only
+    // equivalent while C == D, so the names below follow what the strides actually address.
+    std::array<ck::index_t, 5> ab_lengths{N, D, H, W, C};
+    // A entries: N stride, D stride (H*W), H stride (W), W stride (1), C stride (D*H*W).
+    std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W};
+    // B entries: row-major strides of an (N, D, H, W, C) tensor.
+    std::array<ck::index_t, 5> b_strides = {D * H * W * C, H * W * C, W * C, C, 1};
+    auto broadcastPermute = DeviceElementwisePermuteInstance{};
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
+    if(!broadcastPermute.IsSupportedArgument(argument.get()))
+    {
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the device instance, exiting!");
+    }
+    std::cout << "A (ncdhw): " << a.mDesc << std::endl;
+    std::cout << "B (ndhwc): " << b.mDesc << std::endl;
+    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
+    float ave_time =
+        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+    // Element count is shared by the flop estimate and the bytes-moved estimate.
+    std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
+    std::size_t num_btype =
+        sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
+        sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
+    // ave_time is printed as milliseconds below, hence the 1e9 / 1e6 scale factors.
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        // Copy the device result back and compare it with the host reference permute.
+        b_device_buf.FromDevice(b.mData.data());
+        Tensor<BDataType> host_b(ndhwc);
+        host_elementwise4D(host_b, a, PassThrough{});
+        pass &=
+            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
+    }
+    return pass ? 0 : 1;
+}
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
@@ -19,13 +19,13 @@ using BDataType = F16;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
+    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                        ck::Tuple<BDataType>,
+                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                        PassThrough,
+                                                        PassThrough,          // Elementwise op
-                                                        4,
+                                                        4,                    // NumDim
-                                                        8,
+                                                        8,                    // MPerThread
-                                                        ck::Sequence<8>,
+                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
-                                                        ck::Sequence<1>>;
+                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
@@ -99,7 +99,6 @@ int main()
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;
    bool pass = true;
    if(do_verification)

--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
@@ -17,15 +17,15 @@ using BDataType = F16;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
+    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                          ck::Tuple<BDataType>,
+                                                          ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                          PassThrough,
+                                                          PassThrough,          // Elementwise op
-                                                          3, // NumDim_M
+                                                          3,                    // NumDim_M
-                                                          1, // NumDim_N
+                                                          1,                    // NumDim_N
-                                                          8,
+                                                          1,                    // MPerThread
-                                                          8,
+                                                          1,                    // NPerThread
-                                                          ck::Sequence<8>,
+                                                          ck::Sequence<1>,  // InScalarPerVectorSeq
-                                                          ck::Sequence<8>>;
+                                                          ck::Sequence<1>>; // OutScalarPerVectorSeq
 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nhwc,
@@ -53,12 +53,6 @@ int main()
    const int H = 32;
    const int W = 1024;
-    /**const int N = 120;
-    const int H = 32;
-    const int W = 64;
-    const int C = 128;**/
    std::vector<std::size_t> nchw = {N, C, H, W};
    std::vector<std::size_t> nhwc = {N, H, W, C};
@@ -71,7 +65,6 @@ int main()
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
    a_device_buf.ToDevice(a.mData.data());
-    // LogRangeAsType<float>(std::cout << "Tensor a  : ", a.mData, ",") << std::endl;
    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
@@ -115,13 +108,10 @@ int main()
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
-        // LogRangeAsType<float>(std::cout << "Tensor b  : ", b.mData, ",") << std::endl;
        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
            host_b, a, nchw, PassThrough{});
-        // LogRangeAsType<float>(std::cout << "Host b  : ", host_b.mData, ",") << std::endl;
        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

--- a/example/63_layernorm4d_fwd/CMakeLists.txt
+++ b/example/63_layernorm4d_fwd/CMakeLists.txt
+add_example_executable(example_layernorm4d_fwd_fp16 layernorm4d_fwd_fp16.cpp)
+add_example_executable(example_layernorm4d_fwd_splitk_fp16 layernorm4d_fwd_splitk_fp16.cpp)
--- a/example/63_layernorm4d_fwd/common.hpp
+++ b/example/63_layernorm4d_fwd/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
--- a/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp
+++ b/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using ComputeDataType        = float;
+using PassThrough            = ck::tensor_operation::element_wise::PassThrough;
+#define SAVE_MEAN_INV_STD
+constexpr int Rank         = 4;
+constexpr int NumReduceDim = 3;
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationFwdImpl<XDataType,
+                                                             GammaDataType,
+                                                             BetaDataType,
+                                                             ComputeDataType,
+                                                             YDataType,
+                                                             SaveMeanInvStdDataType,
+                                                             PassThrough,
+                                                             Rank,
+                                                             NumReduceDim,
+                                                             256, // BlockSize
+                                                             8,   // ClusterM
+                                                             32,  // ClusterK
+                                                             1,   // SliceM
+                                                             8,   // SliceK
+                                                             1,   // XYVectorDim (0=M, 1=K)
+                                                             8,   // SrcScalarPerVector
+                                                             1,   // GammaVecDim (0=M, 1=K)
+                                                             8,   // GammaScalarPerVector
+                                                             1,   // BetaVecDim (0=M, 1=K)
+                                                             8,   // BetaScalarPerVector
+                                                             8,   // YScalarPerVector
+                                                             1>;  // SaveMeanInvStdScalarPerVector
+#include "run_layernorm4d_fwd_example.inc"
+int main() { return run_layernorm4d_fwd_example<DeviceInstance>(); }
--- a/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp
+++ b/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float;
+using ComputeDataType        = float;
+using PassThrough            = ck::tensor_operation::element_wise::PassThrough;
+#define SAVE_MEAN_INV_STD
+constexpr int Rank         = 4;
+constexpr int NumReduceDim = 3;
+using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl<
+    XDataType,
+    GammaDataType,
+    BetaDataType,
+    ComputeDataType,
+    YDataType,
+    SaveMeanInvStdDataType,
+    PassThrough,
+    Rank,
+    NumReduceDim,
+    256, // BlockSize
+    8,   // ClusterM
+    32,  // ClusterK
+    1,   // SliceM
+    8,   // SliceK
+    1,   // XYVectorDim (0=M, 1=K)
+    8,   // XScalarPerVector
+    1,   // GammaVecDim (0=M, 1=K)
+    8,   // GammaScalarPerVector
+    1,   // BetaVecDim (0=M, 1=K)
+    8,   // BetaScalarPerVector
+    8,   // YScalarPerVector
+    1>;  // SaveMeanInvStdScalarPerVector
+#include "run_layernorm4d_fwd_example.inc"
+int main() { return run_layernorm4d_fwd_example<DeviceInstance>(); }
--- a/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc
+++ b/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+/// Runs a 4-D layernorm forward pass on the device and verifies it against the host reference.
+///
+/// The input is an {N, H, W, C} tensor normalized over dimensions {1, 2, 3}; gamma and beta
+/// have shape {H, W, C}. When SAVE_MEAN_INV_STD is defined the per-N mean and inverse standard
+/// deviation are also written by the device op and compared with the reference.
+///
+/// @tparam DeviceInstance device normalization operation to exercise.
+/// @return 0 when every comparison passes, 1 when the arguments are unsupported or a
+///         comparison fails.
+template <typename DeviceInstance>
+int run_layernorm4d_fwd_example()
+{
+    bool time_kernel = false;
+    ck::index_t N = 256;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t C = 8;
+    Tensor<XDataType> x({N, H, W, C});
+    Tensor<GammaDataType> gamma({H, W, C});
+    Tensor<BetaDataType> beta({H, W, C});
+    Tensor<YDataType> y({N, H, W, C});
+    Tensor<SaveMeanInvStdDataType> save_mean({N});
+    Tensor<SaveMeanInvStdDataType> save_inv_std({N});
+    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
+    gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
+    beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+#ifdef SAVE_MEAN_INV_STD
+    DeviceMem save_mean_dev(sizeof(SaveMeanInvStdDataType) * save_mean.mDesc.GetElementSpaceSize());
+    DeviceMem save_inv_std_dev(sizeof(SaveMeanInvStdDataType) *
+                               save_inv_std.mDesc.GetElementSpaceSize());
+#endif
+    x_dev.ToDevice(x.mData.data());
+    gamma_dev.ToDevice(gamma.mData.data());
+    beta_dev.ToDevice(beta.mData.data());
+    auto device_instance = DeviceInstance{};
+    auto argument_ptr    = device_instance.MakeArgumentPointer(
+        {N, H, W, C},
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
+        // gamma and beta have shape {H, W, C}: stride 0 along N re-uses them for every sample.
+        {0, W * C, C, 1},
+        {0, W * C, C, 1},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
+                                    save_mean.mDesc.GetStrides().end()},
+        // Take the inverse-std strides from its own descriptor rather than from save_mean's.
+        std::vector<ck::index_t>{save_inv_std.mDesc.GetStrides().begin(),
+                                    save_inv_std.mDesc.GetStrides().end()},
+        {1, 2, 3},
+        1e-4,
+        x_dev.GetDeviceBuffer(),
+        gamma_dev.GetDeviceBuffer(),
+        beta_dev.GetDeviceBuffer(),
+        y_dev.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+        save_mean_dev.GetDeviceBuffer(),
+        save_inv_std_dev.GetDeviceBuffer(),
+#else
+        nullptr,
+        nullptr,
+#endif
+        PassThrough{});
+    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    }
+    // Query the workspace size the instance asks for and hand it a buffer of that size.
+    size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
+    DeviceMem workspace_dev(workspace_sz);
+    device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+    auto invoker_ptr = device_instance.MakeInvokerPointer();
+    invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    bool pass = true;
+    {
+        // Host reference using the same lengths, reduce dimensions and epsilon as the device run.
+        Tensor<YDataType> host_y({N, H, W, C});
+        Tensor<SaveMeanInvStdDataType> host_save_mean({N});
+        Tensor<SaveMeanInvStdDataType> host_save_inv_std({N});
+        using ReferenceInstance =
+            ck::tensor_operation::host::ReferenceLayernorm<XDataType,
+                                                           GammaDataType,
+                                                           BetaDataType,
+                                                           YDataType,
+                                                           SaveMeanInvStdDataType,
+                                                           ComputeDataType,
+                                                           PassThrough,
+                                                           Rank,
+                                                           NumReduceDim>;
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(x,
+                                             gamma,
+                                             beta,
+                                             host_y,
+                                             host_save_mean,
+                                             host_save_inv_std,
+                                             PassThrough{},
+                                             {N, H, W, C},
+                                             {1, 2, 3},
+                                             1e-4);
+        auto ref_invoker  = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+        y_dev.FromDevice(y.mData.data());
+        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results (y)", 1e-3, 1e-3);
+#ifdef SAVE_MEAN_INV_STD
+        save_mean_dev.FromDevice(save_mean.mData.data());
+        save_inv_std_dev.FromDevice(save_inv_std.mData.data());
+        pass &= ck::utils::check_err(
+            save_mean, host_save_mean, "Error: Incorrect results (mean)", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(
+            save_inv_std, host_save_inv_std, "Error: Incorrect results (inv_std)", 1e-3, 1e-3);
+#endif
+    }
+    return (pass ? 0 : 1);
+}
--- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
@@ -19,7 +19,7 @@ template <typename XDataType,
          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
-struct DeviceNormalization : public BaseOperator
+struct DeviceNormalizationFwd : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<index_t> lengths,
@@ -50,14 +50,14 @@ template <typename XDataType,
          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
-using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization<XDataType,
+using DeviceNormalizationFwdPtr = std::unique_ptr<DeviceNormalizationFwd<XDataType,
-                                                                   GammaDataType,
+                                                                         GammaDataType,
-                                                                   BetaDataType,
+                                                                         BetaDataType,
-                                                                   YDataType,
+                                                                         YDataType,
-                                                                   SaveMeanInvStdDataType,
+                                                                         SaveMeanInvStdDataType,
-                                                                   YElementwiseOperation,
+                                                                         YElementwiseOperation,
-                                                                   Rank,
+                                                                         Rank,
-                                                                   NumReduceDim>>;
+                                                                         NumReduceDim>>;
 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -595,7 +595,9 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
            return false;
        }
-        if(ck::get_device_name() != "gfx90a" && std::is_same<ADataType, double>::value)
+        if(ck::get_device_name() != "gfx90a" && ck::get_device_name() != "gfx940" &&
+           ck::get_device_name() != "gfx941" && ck::get_device_name() != "gfx942" &&
+           std::is_same<ADataType, double>::value)
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <sstream>
+#include "ck/utility/math.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/stream_utility.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename InDataTypeTuple,
+          typename OutDataTypeTuple,
+          typename ElementwiseOperation,
+          index_t NumDim_m, // choose how to set dims
+          index_t NumDim_n,
+          index_t NumDim_k,
+          index_t MPerThread,
+          index_t NPerThread,
+          index_t KPerThread,
+          typename InScalarPerVectorSeq,
+          typename OutScalarPerVectorSeq>
+struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
+                                                          OutDataTypeTuple,
+                                                          ElementwiseOperation,
+                                                          NumDim_m + NumDim_n + NumDim_k>
+{
+    static constexpr index_t NumDim = NumDim_m + NumDim_n + NumDim_k;
+    static constexpr int NumInput  = InDataTypeTuple::Size();
+    static constexpr int NumOutput = OutDataTypeTuple::Size();
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
+                      NumOutput == OutScalarPerVectorSeq::Size(),
+                  "Tuple size is inconsistent with the number of in/out!");
+    // Produces a tuple with one null `const T*` per entry of InDataTypeTuple.
+    // InDataTypePointerTuple is declared from this function's return type.
+    static auto GenerateInDataTypePointerTuple()
+    {
+        const auto make_null_ptr = [](auto idx) {
+            using ElemT = remove_cvref_t<decltype(InDataTypeTuple{}[idx])>;
+            return static_cast<const ElemT*>(nullptr);
+        };
+        return generate_tuple(make_null_ptr, Number<NumInput>{});
+    }
+    // Produces a tuple with one null (non-const) `T*` per entry of OutDataTypeTuple.
+    // OutDataTypePointerTuple is declared from this function's return type.
+    static auto GenerateOutDataTypePointerTuple()
+    {
+        const auto make_null_ptr = [](auto idx) {
+            using ElemT = remove_cvref_t<decltype(OutDataTypeTuple{}[idx])>;
+            return static_cast<ElemT*>(nullptr);
+        };
+        return generate_tuple(make_null_ptr, Number<NumOutput>{});
+    }
+    using InDataTypePointerTuple  = decltype(GenerateInDataTypePointerTuple());
+    using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());
+    // Right-pads each of the three dimensions of `desc_mnk` so that its length becomes a
+    // multiple of (threads along that dimension) * (elements per thread along that dimension).
+    // `gridSize` and `blockSize` are accepted but not used.
+    template <typename Desc_MNK>
+    static auto PadDescriptor_MNK(Desc_MNK desc_mnk,
+                                  index_t gridSize,
+                                  index_t blockSize,
+                                  index_t num_threads_m,
+                                  index_t num_threads_n,
+                                  index_t num_threads_k)
+    {
+        std::ignore = gridSize;
+        std::ignore = blockSize;
+        // Current extents of the three dimensions.
+        const auto len_m = desc_mnk.GetLength(I0);
+        const auto len_n = desc_mnk.GetLength(I1);
+        const auto len_k = desc_mnk.GetLength(I2);
+        // Elements covered along each dimension by one pass of all threads.
+        const index_t step_m = num_threads_m * MPerThread;
+        const index_t step_n = num_threads_n * NPerThread;
+        const index_t step_k = num_threads_k * KPerThread;
+        // Amount needed to reach the next multiple of the step (0 if already a multiple).
+        const auto extra_m = math::integer_least_multiple(len_m, step_m) - len_m;
+        const auto extra_n = math::integer_least_multiple(len_n, step_n) - len_n;
+        const auto extra_k = math::integer_least_multiple(len_k, step_k) - len_k;
+        return transform_tensor_descriptor(
+            desc_mnk,
+            make_tuple(make_right_pad_transform(len_m, extra_m),
+                       make_right_pad_transform(len_n, extra_n),
+                       make_right_pad_transform(len_k, extra_k)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+    }
+    // Builds the padded 3-D (M, N, K) descriptor for one tensor from its N-d lengths/strides.
+    // The first NumDim_m dimensions form M, the next NumDim_n form N and the remaining ones
+    // form K. When NumDim > 3 each group is merged into a single dimension first; otherwise
+    // the naive descriptor is used directly. The result is always passed through
+    // PadDescriptor_MNK with the given thread counts.
+    static auto MakeDescriptor_MNK(const std::array<index_t, NumDim>& lengths,
+                                   const std::array<index_t, NumDim>& stride,
+                                   index_t gridSize,
+                                   index_t blockSize,
+                                   index_t num_threads_m,
+                                   index_t num_threads_n,
+                                   index_t num_threads_k)
+    {
+        auto tupleOfShape  = generate_tuple([&](auto I) { return lengths[I]; }, Number<NumDim>{});
+        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<NumDim>{});
+        // nd desc - [s0, s1, s2, ...]
+        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
+        // Index ranges [0, NumDim_m), [NumDim_m, NumDim_m + NumDim_n), [NumDim_m + NumDim_n, NumDim).
+        constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDim_m, 1>::type();
+        constexpr auto nDimIds =
+            typename arithmetic_sequence_gen<NumDim_m, NumDim_m + NumDim_n, 1>::type();
+        constexpr auto kDimIds =
+            typename arithmetic_sequence_gen<NumDim_m + NumDim_n, NumDim, 1>::type();
+        const auto mLengths = get_container_subset(tupleOfShape, mDimIds);
+        const auto nLengths = get_container_subset(tupleOfShape, nDimIds);
+        const auto kLengths = get_container_subset(tupleOfShape, kDimIds);
+        // merge nd to 3d desc - [s0 * s1 * ...]
+        if constexpr(NumDim > 3)
+        {
+            const auto desc_mnk = transform_tensor_descriptor(
+                desc,
+                make_tuple(make_merge_transform(mLengths),
+                           make_merge_transform(nLengths),
+                           make_merge_transform(kLengths)),
+                make_tuple(mDimIds, nDimIds, kDimIds),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+            return PadDescriptor_MNK(
+                desc_mnk, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k);
+        }
+        else
+            // No merge needed: the naive descriptor is padded as-is.
+            return PadDescriptor_MNK(
+                desc, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k);
+    }
+    // Builds a tuple of TupleSize descriptors from placeholder lengths/strides and thread
+    // counts of 1. The aliases OutGrid3dDescTuple / InGrid3dDescTuple below take its
+    // decltype, so the values passed here appear to matter only for type deduction.
+    template <index_t TupleSize>
+    static auto GenerateInOutGrid3dDescTuple(Number<TupleSize>)
+    {
+        return generate_tuple(
+            [&](auto) {
+                if constexpr(NumDim > 3)
+                {
+                    return MakeDescriptor_MNK({1, 1, 1}, {1, 1, 1}, 1, 1, 1, 1, 1);
+                }
+                else
+                {
+                    return MakeDescriptor_MNK({1}, {1}, 1, 1, 1, 1, 1);
+                };
+            },
+            Number<TupleSize>{});
+    }
+    using OutGrid3dDescTuple = decltype(GenerateInOutGrid3dDescTuple(Number<NumOutput>{}));
+    using InGrid3dDescTuple  = decltype(GenerateInOutGrid3dDescTuple(Number<NumInput>{}));
+    using GridwiseElementwise = GridwiseElementwise_3D<InGrid3dDescTuple,
+                                                       OutGrid3dDescTuple,
+                                                       InDataTypePointerTuple,
+                                                       OutDataTypePointerTuple,
+                                                       ElementwiseOperation,
+                                                       MPerThread,
+                                                       NPerThread,
+                                                       KPerThread,
+                                                       InScalarPerVectorSeq,
+                                                       OutScalarPerVectorSeq>;
+    // Runtime arguments of the operation: tensor lengths, per-tensor strides, typed device
+    // buffer pointers, the element-wise functor and the block size (fixed to 256 here).
+    struct Argument : public BaseArgument
+    {
+        // Stores the lengths/strides/functor and converts the untyped buffer pointers into
+        // typed pointer tuples matching InDataTypeTuple / OutDataTypeTuple.
+        Argument(const std::array<index_t, NumDim> lengths,
+                 const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
+                 const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
+                 const std::array<const void*, NumInput> in_dev_buffers,
+                 const std::array<void*, NumOutput> out_dev_buffers,
+                 ElementwiseOperation elementwise_op)
+            : lengths_(lengths),
+              inStridesArray_(inStridesArray),
+              outStridesArray_(outStridesArray),
+              elementwise_op_(elementwise_op),
+              blockSize_(256)
+        {
+            // Each of the M, N and K groups must contain at least one dimension.
+            static_assert(NumDim_m > 0, "");
+            static_assert(NumDim_n > 0, "");
+            static_assert(NumDim_k > 0, "");
+            // Cast every input pointer to `const T*` of the corresponding input data type.
+            in_dev_buffers_ = generate_tuple(
+                [&](auto I) {
+                    using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
+                    return static_cast<const DataType*>(in_dev_buffers[I.value]);
+                },
+                Number<NumInput>{});
+            // Cast every output pointer to `T*` of the corresponding output data type.
+            out_dev_buffers_ = generate_tuple(
+                [&](auto I) {
+                    using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
+                    return static_cast<DataType*>(out_dev_buffers[I.value]);
+                },
+                Number<NumOutput>{});
+        }
+        InDataTypePointerTuple in_dev_buffers_;
+        OutDataTypePointerTuple out_dev_buffers_;
+        std::array<index_t, NumDim> lengths_;
+        std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
+        std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;
+        ElementwiseOperation elementwise_op_;
+        index_t blockSize_;
+    };
+    // Launches kernel_elementwise_3d for a given Argument.
+    struct Invoker : public BaseInvoker
+    {
+        // Builds the padded input/output descriptors, launches the kernel and returns the
+        // value produced by launch_and_time_kernel.
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            // gridSize is (available compute units) * blockSize_ and is passed as the grid
+            // dimension of the launch below.
+            index_t gridSize      = getAvailableComputeUnitCount(stream_config) * arg.blockSize_;
+            // Thread counts per dimension: N and K are fixed to 16 each, M takes
+            // gridSize / (16 * 16).
+            index_t num_threads_m = gridSize / (16 * 16);
+            index_t num_threads_n = 16;
+            index_t num_threads_k = 16;
+            // One padded (M, N, K) descriptor per input tensor, using that tensor's strides.
+            auto in_grid_3d_desc_tuple = generate_tuple(
+                [&](auto I) {
+                    return MakeDescriptor_MNK(arg.lengths_,
+                                              arg.inStridesArray_[I.value],
+                                              gridSize,
+                                              arg.blockSize_,
+                                              num_threads_m,
+                                              num_threads_n,
+                                              num_threads_k);
+                },
+                Number<NumInput>{});
+            // One padded (M, N, K) descriptor per output tensor, using that tensor's strides.
+            auto out_grid_3d_desc_tuple = generate_tuple(
+                [&](auto I) {
+                    return MakeDescriptor_MNK(arg.lengths_,
+                                              arg.outStridesArray_[I.value],
+                                              gridSize,
+                                              arg.blockSize_,
+                                              num_threads_m,
+                                              num_threads_n,
+                                              num_threads_k);
+                },
+                Number<NumOutput>{});
+            const auto kernel = kernel_elementwise_3d<GridwiseElementwise,
+                                                      InGrid3dDescTuple,
+                                                      OutGrid3dDescTuple,
+                                                      InDataTypePointerTuple,
+                                                      OutDataTypePointerTuple,
+                                                      ElementwiseOperation>;
+            float elapsed_time = launch_and_time_kernel(stream_config,
+                                                        kernel,
+                                                        dim3(gridSize),
+                                                        dim3(arg.blockSize_),
+                                                        0,
+                                                        in_grid_3d_desc_tuple,
+                                                        out_grid_3d_desc_tuple,
+                                                        arg.in_dev_buffers_,
+                                                        arg.out_dev_buffers_,
+                                                        arg.elementwise_op_,
+                                                        num_threads_m,
+                                                        num_threads_n,
+                                                        num_threads_k);
+            return elapsed_time;
+        }
+        // polymorphic
+        // NOTE(review): the dynamic_cast result is dereferenced without a null check, so
+        // p_arg must actually point to this operation's Argument type.
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+    // Returns true when `p_arg` is this operation's Argument, the last length is a multiple
+    // of MPerThread, and every input/output passes the vector-access check below.
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+        if(pArg == nullptr)
+            return false;
+        // NOTE(review): this tests the last dimension (which belongs to the K group) against
+        // MPerThread rather than KPerThread - confirm this is intended.
+        if(pArg->lengths_.back() % MPerThread != 0)
+            return false;
+        // Accepts a tensor when, along vectorDim, either
+        //  - the stride is 1 and the length is a multiple of scalarPerVector or (for
+        //    non-negative lengths) smaller than scalarPerVector, or
+        //  - the stride is at least scalarPerVector.
+        auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
+                                          const std::array<index_t, NumDim>& strides,
+                                          index_t scalarPerVector,
+                                          index_t vectorDim) {
+            if(strides[vectorDim] == 1 &&
+               (lengths[vectorDim] % scalarPerVector == 0 ||
+                lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
+            {
+                return true;
+            }
+            if(strides[vectorDim] >= scalarPerVector)
+            {
+                return true;
+            }
+            return false;
+        };
+        bool valid = true;
+        // Inputs are checked along dimension NumDim_m - 1 (last dimension of the M group).
+        static_for<0, NumInput, 1>{}([&](auto I) {
+            valid = valid && IsScalarPerVectorValid(pArg->lengths_,
+                                                    pArg->inStridesArray_[I.value],
+                                                    InScalarPerVectorSeq::At(I),
+                                                    NumDim_m - 1);
+        });
+        // Outputs are checked along dimension NumDim - 1 (the last dimension overall).
+        static_for<0, NumOutput, 1>{}([&](auto I) {
+            valid = valid && IsScalarPerVectorValid(pArg->lengths_,
+                                                    pArg->outStridesArray_[I.value],
+                                                    OutScalarPerVectorSeq::At(I),
+                                                    NumDim - 1);
+        });
+        return valid;
+    }
+    // Creates a heap-allocated Argument from the given values and returns it through the
+    // BaseArgument interface. All parameters are forwarded unchanged to Argument's constructor.
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
+                        const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
+                        const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
+                        const std::array<const void*, NumInput> in_dev_buffers,
+                        const std::array<void*, NumOutput> out_dev_buffers,
+                        ElementwiseOperation elementwise_op) override
+    {
+        return std::make_unique<Argument>(lengths,
+                                          inStridesArray,
+                                          outStridesArray,
+                                          in_dev_buffers,
+                                          out_dev_buffers,
+                                          elementwise_op);
+    }
+    // Returns a default-constructed Invoker by value.
+    static auto MakeInvoker() { return Invoker{}; }
+    // Returns a heap-allocated Invoker through the BaseInvoker interface.
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    }
+}; // struct DeviceElementwise3dImpl
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
@@ -7,7 +7,7 @@
 #include <sstream>
 #include "ck/utility/reduction_operator.hpp"
-#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
 #include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp"
@@ -46,14 +46,14 @@ template <typename XDataType,
          index_t YDstVectorSize,
          index_t SaveMeanInvStdDstVectorSize,
          bool UseWelford = true>
-struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
+struct DeviceNormalizationFwdImpl : public DeviceNormalizationFwd<XDataType,
-                                                            GammaDataType,
+                                                                  GammaDataType,
-                                                            BetaDataType,
+                                                                  BetaDataType,
-                                                            YDataType,
+                                                                  YDataType,
-                                                            SaveMeanInvStdDataType,
+                                                                  SaveMeanInvStdDataType,
-                                                            YElementwiseOperation,
+                                                                  YElementwiseOperation,
-                                                            Rank,
+                                                                  Rank,
-                                                            NumReduceDim>
+                                                                  NumReduceDim>
 {
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize);
    static_assert(
@@ -461,7 +461,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
        auto str = std::stringstream();
        // clang-format off
-        str << "DeviceNormalizationImpl<" << BlockSize << ",";
+        str << "DeviceNormalizationFwdImpl<" << BlockSize << ",";
        str << "Cluster_MK_" << MThreadClusterSize << "_" << KThreadClusterSize << ",";
        str << "Slice_MK_" << MThreadSliceSize << "_" << KThreadSliceSize << ",";
        str << "XYSrcVectorDim_" << XYSrcVectorDim  << ",";

--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
@@ -8,7 +8,7 @@
 #include "ck/utility/reduction_operator.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
-#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
 #include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp"
@@ -134,14 +134,14 @@ template <typename XDataType,
          index_t BetaSrcVectorSize,
          index_t YDstVectorSize,
          index_t SaveMeanInvStdDstVectorSize>
-struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
+struct DeviceNormalizationFwdSplitKImpl : public DeviceNormalizationFwd<XDataType,
-                                                                  GammaDataType,
+                                                                        GammaDataType,
-                                                                  BetaDataType,
+                                                                        BetaDataType,
-                                                                  YDataType,
+                                                                        YDataType,
-                                                                  SaveMeanInvStdDataType,
+                                                                        SaveMeanInvStdDataType,
-                                                                  YElementwiseOperation,
+                                                                        YElementwiseOperation,
-                                                                  Rank,
+                                                                        Rank,
-                                                                  NumReduceDim>
+                                                                        NumReduceDim>
 {
    using WorkspaceMeanVarDataType = SaveMeanInvStdDataType;
@@ -732,7 +732,7 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
        auto str = std::stringstream();
        // clang-format off
-        str << "DeviceNormalizationSplitKImpl<" << BlockSize << ",";
+        str << "DeviceNormalizationFwdSplitKImpl<" << BlockSize << ",";
        str << "Cluster_MK_" << MThreadClusterSize << "_" << KThreadClusterSize << ",";
        str << "Slice_MK_" << MThreadSliceSize << "_" << KThreadSliceSize << ",";
        str << "XYSrcVectorDim_" << XYVectorDim  << ",";

--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+//
+#pragma once
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+namespace ck {
+// Kernel entry point (__global__) for the 3D element-wise operation.
+//
+// The kernel does no work of its own: it forwards every argument, unchanged and in the same
+// order, to the static Run() of GridwiseElementwise3dFunctor.
+//
+// Parameters:
+//   in_grid_3d_desc_tuple / out_grid_3d_desc_tuple - descriptor tuples for inputs / outputs.
+//   p_in_global_tuple / p_out_global_tuple         - pointer tuples for inputs / outputs.
+//   elementwise_op                                 - operation object handed to the functor.
+//   num_threads_m / num_threads_n / num_threads_k  - thread counts per dimension; how they are
+//                                                    interpreted is up to the functor.
+template <typename GridwiseElementwise3dFunctor,
+          typename InGrid3dDescTuple,
+          typename OutGrid3dDescTuple,
+          typename InDataTypePointerTuple,
+          typename OutDataTypePointerTuple,
+          typename ElementwiseOperation>
+__global__ void kernel_elementwise_3d(const InGrid3dDescTuple in_grid_3d_desc_tuple,
+                                      const OutGrid3dDescTuple out_grid_3d_desc_tuple,
+                                      const InDataTypePointerTuple p_in_global_tuple,
+                                      const OutDataTypePointerTuple p_out_global_tuple,
+                                      const ElementwiseOperation elementwise_op,
+                                      const index_t num_threads_m,
+                                      const index_t num_threads_n,
+                                      const index_t num_threads_k)
+{
+    GridwiseElementwise3dFunctor::Run(in_grid_3d_desc_tuple,
+                                      out_grid_3d_desc_tuple,
+                                      p_in_global_tuple,
+                                      p_out_global_tuple,
+                                      elementwise_op,
+                                      num_threads_m,
+                                      num_threads_n,
+                                      num_threads_k);
+}
+// Grid-level functor that applies an element-wise operation over 3D (M, N, K) tensors.
+//
+// Each thread owns an MPerThread x NPerThread x KPerThread tile. Per iteration it loads the
+// tile from every input, invokes ElementwiseOperation once per tile element, stores the tile to
+// every output, and then steps its windows by the loop step (num_threads_* x *PerThread),
+// sweeping K innermost, then N, then M.
+//
+// Template parameters:
+//   InGrid3dDescTuple, OutGrid3dDescTuple           - one 3D descriptor per input / output.
+//   InDataTypePointerTuple, OutDataTypePointerTuple - one data pointer type per input / output.
+//   ElementwiseOperation   - functor called through unpack2 with the output references
+//                            followed by the input references.
+//   MPerThread, NPerThread, KPerThread              - per-thread tile extents.
+//   InScalarPerVectorSeq, OutScalarPerVectorSeq     - per-tensor vector width handed to the
+//                                                     load / store transfers.
+template <typename InGrid3dDescTuple,
+          typename OutGrid3dDescTuple,
+          typename InDataTypePointerTuple,
+          typename OutDataTypePointerTuple,
+          typename ElementwiseOperation,
+          index_t MPerThread,
+          index_t NPerThread,
+          index_t KPerThread,
+          typename InScalarPerVectorSeq,
+          typename OutScalarPerVectorSeq>
+struct GridwiseElementwise_3D
+{
+    static constexpr index_t NumInput  = InDataTypePointerTuple::Size();
+    static constexpr index_t NumOutput = OutDataTypePointerTuple::Size();
+    // Every per-tensor tuple / sequence must have exactly one entry per input or output.
+    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
+                      NumOutput == OutScalarPerVectorSeq::Size() &&
+                      NumInput == InGrid3dDescTuple::Size() &&
+                      NumOutput == OutGrid3dDescTuple::Size(),
+                  "Tuple size is inconsistent with the number of in/out!");
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    // Packed descriptor of the per-thread tile; shared by all input and output thread buffers.
+    static constexpr auto thread_buffer_desc_mnk = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MPerThread>{}, Number<NPerThread>{}, Number<KPerThread>{}));
+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
+    // Runs the element-wise operation for the calling thread.
+    //
+    // Parameters:
+    //   in_grid_3d_desc_tuple / out_grid_3d_desc_tuple - descriptors of the global tensors.
+    //   p_in_global_tuple / p_out_global_tuple         - pointers to the global tensors.
+    //   elementwise_op                                 - operation applied per element.
+    //   num_threads_m / num_threads_n / num_threads_k  - thread counts used to split the global
+    //                                                    1D thread id into (m, n, k) and to
+    //                                                    derive the per-iteration loop step.
+    //
+    // The problem size (M, N, K) is read from the FIRST input descriptor only.
+    //
+    // NOTE(review): the do/while loops below execute at least once and count down to zero, so
+    // M, N and K are assumed to be non-zero multiples of their loop step -- presumably
+    // guaranteed by the host side (e.g. descriptor padding); confirm against the caller.
+    __device__ static void Run(const InGrid3dDescTuple in_grid_3d_desc_tuple,
+                               const OutGrid3dDescTuple out_grid_3d_desc_tuple,
+                               const InDataTypePointerTuple p_in_global_tuple,
+                               const OutDataTypePointerTuple p_out_global_tuple,
+                               const ElementwiseOperation elementwise_op,
+                               const index_t num_threads_m,
+                               const index_t num_threads_n,
+                               const index_t num_threads_k)
+    {
+        // Per-thread tile buffers (AddressSpaceEnum::Vgpr), one per input ...
+        auto in_thread_buf_tuple = generate_tuple(
+            [&](auto I) {
+                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
+                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    DataType,
+                                    MPerThread * NPerThread * KPerThread,
+                                    true>{};
+            },
+            Number<NumInput>{});
+        // ... and one per output.
+        auto out_thread_buf_tuple = generate_tuple(
+            [&](auto I) {
+                using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
+                using DataType        = remove_pointer_t<DataTypePointer>;
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    DataType,
+                                    MPerThread * NPerThread * KPerThread,
+                                    true>{};
+            },
+            Number<NumOutput>{});
+        // Wrap the raw global pointers, sized by each descriptor's element space.
+        auto in_global_buf_tuple = generate_tuple(
+            [&](auto I) {
+                return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_in_global_tuple[I], in_grid_3d_desc_tuple[I].GetElementSpaceSize());
+            },
+            Number<NumInput>{});
+        auto out_global_buf_tuple = generate_tuple(
+            [&](auto I) {
+                return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_out_global_tuple[I], out_grid_3d_desc_tuple[I].GetElementSpaceSize());
+            },
+            Number<NumOutput>{});
+        const auto M = in_grid_3d_desc_tuple[I0].GetLength(I0);
+        const auto N = in_grid_3d_desc_tuple[I0].GetLength(I1);
+        const auto K = in_grid_3d_desc_tuple[I0].GetLength(I2);
+        // Distance covered along each dimension by all threads in one iteration.
+        const index_t loop_step_m = num_threads_m * MPerThread;
+        const index_t loop_step_n = num_threads_n * NPerThread;
+        const index_t loop_step_k = num_threads_k * KPerThread;
+        // Offsets that undo a complete sweep along N / K (iteration count x loop step), used to
+        // bring the windows back to their starting position before stepping an outer dimension.
+        const auto rewind_n = -(N / loop_step_n) * loop_step_n;
+        const auto rewind_k = -(K / loop_step_k) * loop_step_k;
+        // Split the global 1D thread id into (m, n, k) with k varying fastest.
+        const index_t thread_1d_id = get_thread_global_1d_id();
+        const index_t tid_m  = thread_1d_id / (num_threads_n * num_threads_k);
+        const index_t tid_nk = thread_1d_id % (num_threads_n * num_threads_k);
+        const index_t tid_n  = tid_nk / num_threads_k;
+        const index_t tid_k  = tid_nk % num_threads_k;
+        // Origin of this thread's first tile.
+        const auto thread_global_offset =
+            make_multi_index(tid_m * MPerThread, tid_n * NPerThread, tid_k * KPerThread);
+        // Global -> thread-buffer loaders, one per input.
+        // NOTE(review): loads vectorise along dim 1 (N) while stores vectorise along dim 2 (K);
+        // this looks intentional for layout-changing copies -- confirm.
+        auto in_global_load_tuple = generate_tuple(
+            [&](auto I) {
+                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
+                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;
+                return ThreadwiseTensorSliceTransfer_v2<
+                    DataType,
+                    DataType,
+                    decltype(in_grid_3d_desc_tuple[I]),
+                    decltype(thread_buffer_desc_mnk),
+                    Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
+                    Sequence<0, 1, 2>,                            // DimAccessOrder
+                    1,                                            // SrcVectorDim
+                    InScalarPerVectorSeq::At(I),                  // SrcScalarPerVector
+                    1,                                            // SrcScalarStrideInVector
+                    true>{in_grid_3d_desc_tuple[I], thread_global_offset};
+            },
+            Number<NumInput>{});
+        // Thread-buffer -> global writers, one per output.
+        auto out_global_store_tuple = generate_tuple(
+            [&](auto I) {
+                using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
+                using DataType        = remove_pointer_t<DataTypePointer>;
+                return ThreadwiseTensorSliceTransfer_v1r3<
+                    DataType,
+                    DataType,
+                    decltype(thread_buffer_desc_mnk),
+                    decltype(out_grid_3d_desc_tuple[I]),
+                    PassThroughOp,
+                    Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
+                    Sequence<0, 1, 2>,                            // DimAccessOrder
+                    2,                                            // DstVectorDim
+                    OutScalarPerVectorSeq::At(I),                 // DstScalarPerVector
+                    InMemoryDataOperationEnum::Set,
+                    1,
+                    true>(out_grid_3d_desc_tuple[I], thread_global_offset, PassThroughOp{});
+            },
+            Number<NumOutput>{});
+        index_t num_iter_m = M / (loop_step_m);
+        do
+        {
+            index_t num_iter_n = N / (loop_step_n);
+            do
+            {
+                index_t num_iter_k = K / (loop_step_k);
+                do
+                {
+                    // Load the current tile of every input, then advance its window along K.
+                    static_for<0, NumInput, 1>{}([&](auto I) {
+                        in_global_load_tuple(I).Run(in_grid_3d_desc_tuple[I],
+                                                    in_global_buf_tuple[I],
+                                                    thread_buffer_desc_mnk,
+                                                    make_tuple(I0, I0, I0),
+                                                    in_thread_buf_tuple(I));
+                        in_global_load_tuple(I).MoveSrcSliceWindow(
+                            in_grid_3d_desc_tuple[I], make_multi_index(0, 0, loop_step_k));
+                    });
+                    // Apply the operation to every element of the tile.
+                    static_for<0, MPerThread, 1>{}([&](auto iM) {
+                        static_for<0, NPerThread, 1>{}([&](auto iN) {
+                            static_for<0, KPerThread, 1>{}([&](auto iK) {
+                                constexpr auto offset =
+                                    thread_buffer_desc_mnk.CalculateOffset(make_tuple(iM, iN, iK));
+                                // get reference to in data
+                                const auto in_data_refs = generate_tie(
+                                    // return type should be lvalue
+                                    [&](auto I) -> const auto& {
+                                        return in_thread_buf_tuple(I)(Number<offset>{});
+                                    },
+                                    Number<NumInput>{});
+                                // get reference to dst data
+                                auto out_data_refs = generate_tie(
+                                    // return type should be lvalue
+                                    [&](auto I) -> auto& {
+                                        return out_thread_buf_tuple(I)(Number<offset>{});
+                                    },
+                                    Number<NumOutput>{});
+                                unpack2(elementwise_op, out_data_refs, in_data_refs);
+                            });
+                        });
+                    });
+                    // Store the tile of every output, then advance its window along K.
+                    static_for<0, NumOutput, 1>{}([&](auto I) {
+                        out_global_store_tuple(I).Run(thread_buffer_desc_mnk,
+                                                      make_tuple(I0, I0, I0),
+                                                      out_thread_buf_tuple[I],
+                                                      out_grid_3d_desc_tuple[I],
+                                                      out_global_buf_tuple(I));
+                        out_global_store_tuple(I).MoveDstSliceWindow(
+                            out_grid_3d_desc_tuple[I], make_multi_index(0, 0, loop_step_k));
+                    });
+                } while(--num_iter_k);
+                // K sweep finished: step along N and rewind K.
+                static_for<0, NumInput, 1>{}([&](auto I) {
+                    in_global_load_tuple(I).MoveSrcSliceWindow(
+                        in_grid_3d_desc_tuple[I], make_multi_index(0, loop_step_n, rewind_k));
+                });
+                static_for<0, NumOutput, 1>{}([&](auto I) {
+                    out_global_store_tuple(I).MoveDstSliceWindow(
+                        out_grid_3d_desc_tuple[I], make_multi_index(0, loop_step_n, rewind_k));
+                });
+            } while(--num_iter_n);
+            // N sweep finished: step along M and rewind N. The K offset is applied here as well,
+            // exactly as in the original code.
+            static_for<0, NumInput, 1>{}([&](auto I) {
+                in_global_load_tuple(I).MoveSrcSliceWindow(
+                    in_grid_3d_desc_tuple[I], make_multi_index(loop_step_m, rewind_n, rewind_k));
+            });
+            static_for<0, NumOutput, 1>{}([&](auto I) {
+                out_global_store_tuple(I).MoveDstSliceWindow(
+                    out_grid_3d_desc_tuple[I], make_multi_index(loop_step_m, rewind_n, rewind_k));
+            });
+        } while(--num_iter_m);
+    }
+};
+} // namespace ck