gaoqiong / composable_kernel

Commit 4498e2a1, authored Oct 21, 2023 by Astha Rai (parent: f1fc5db3)

Commit message: adding functioning implementation with scalar multiplication and unary operator support

Showing 8 changed files with 622 additions and 268 deletions:
example/65_hip_tensor_permute/CMakeLists.txt (+2, -2)
example/65_hip_tensor_permute/elementwise_permute_4D_fp16_2d.cpp (+0, -130)
example/65_hip_tensor_permute/elementwise_permute_4D_fp16_ht.cpp (+0, -0)
example/65_hip_tensor_permute/elementwise_permute_4D_fp32_2d.cpp (+0, -130)
example/65_hip_tensor_permute/elementwise_permute_4D_fp32_ht.cpp (+16, -6)
include/ck/tensor_operation/gpu/device/device_elementwise_ht.hpp (+51, -0)
include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl_ht.hpp (+319, -0)
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_ht.hpp (+234, -0)
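The gist of the change: the new "_ht" variant of DeviceElementwiseImpl takes a UnaryOperation type and a compile-time ScalarMult factor in addition to the usual ElementwiseOperation. For orientation, a minimal instantiation sketch, with names and values taken from the elementwise_permute_4D_fp32_ht.cpp diff further down (not a new API beyond what this commit shows):

    // Sketch: instantiate the new device op with an elementwise op, a unary op, and a scalar factor.
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using Square      = ck::tensor_operation::element_wise::UnarySquare;

    using DeviceElementwisePermuteInstance =
        ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // input data types
                                                            ck::Tuple<BDataType>, // output data types
                                                            PassThrough,          // elementwise op
                                                            Square,               // unary op (new)
                                                            4,                    // NumDim
                                                            8,                    // MPerThread
                                                            2,                    // ScalarMult (new)
                                                            ck::Sequence<8>,      // in scalar-per-vector
                                                            ck::Sequence<1>>;     // out scalar-per-vector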
example/65_hip_tensor_permute/CMakeLists.txt (view file @ 4498e2a1)

add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
add_example_executable(example_elementwise_permute_4D_fp32_ht elementwise_permute_4D_fp32_ht.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_ht elementwise_permute_4D_fp16_ht.cpp)
example/65_hip_tensor_permute/elementwise_permute_4D_fp16_2d.cpp (deleted, 100644 → 0; view file @ f1fc5db3)
#include <iostream>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using F16 = ck::half_t;

using ADataType = F16;
using BDataType = F16;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using DeviceElementwisePermuteInstance =
    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
                                                          ck::Tuple<BDataType>,
                                                          PassThrough,
                                                          3, // NumDim_M
                                                          1, // NumDim_N
                                                          8,
                                                          8,
                                                          ck::Sequence<8>,
                                                          ck::Sequence<8>>;

template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc,
                        const HostTensorA& A_nchw,
                        const std::vector<std::size_t>& shape_nchw,
                        Functor functor)
{
    for(std::size_t n = 0; n < shape_nchw[0]; ++n)
        for(std::size_t c = 0; c < shape_nchw[1]; ++c)
            for(std::size_t h = 0; h < shape_nchw[2]; ++h)
                for(std::size_t w = 0; w < shape_nchw[3]; ++w)
                {
                    auto a_val = A_nchw(n, c, h, w);
                    functor(B_nhwc(n, h, w, c), a_val);
                }
}

int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    const int N = 120;
    const int C = 128;
    const int H = 32;
    const int W = 1024;

    /**const int N = 120;
    const int H = 32;
    const int W = 64;
    const int C = 128;**/

    std::vector<std::size_t> nchw = {N, C, H, W};
    std::vector<std::size_t> nhwc = {N, H, W, C};
    Tensor<ADataType> a(nchw);
    Tensor<BDataType> b(nhwc);

    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});

    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());

    // LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

    std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
    std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
    std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
    auto argument         = broadcastPermute.MakeArgumentPointer(
        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    };

    std::cout << "A (nchw): " << a.mDesc << std::endl;
    std::cout << "B (nhwc): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();

    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});

    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];

    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;

    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
        // LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
            host_b, a, nchw, PassThrough{});

        // LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

    return pass ? 0 : 1;
}
example/65_hip_tensor_permute/elementwise_permute_4D_fp16.cpp → example/65_hip_tensor_permute/elementwise_permute_4D_fp16_ht.cpp (file moved; view file @ 4498e2a1)
example/65_hip_tensor_permute/elementwise_permute_4D_fp32_2d.cpp (deleted, 100644 → 0; view file @ f1fc5db3)
(Deleted file; its content as displayed is identical to the deleted elementwise_permute_4D_fp16_2d.cpp above, including the F16 data-type aliases.)

#include <iostream>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using F16 = ck::half_t;

using ADataType = F16;
using BDataType = F16;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using DeviceElementwisePermuteInstance =
    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
                                                          ck::Tuple<BDataType>,
                                                          PassThrough,
                                                          3, // NumDim_M
                                                          1, // NumDim_N
                                                          8,
                                                          8,
                                                          ck::Sequence<8>,
                                                          ck::Sequence<8>>;

template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc,
                        const HostTensorA& A_nchw,
                        const std::vector<std::size_t>& shape_nchw,
                        Functor functor)
{
    for(std::size_t n = 0; n < shape_nchw[0]; ++n)
        for(std::size_t c = 0; c < shape_nchw[1]; ++c)
            for(std::size_t h = 0; h < shape_nchw[2]; ++h)
                for(std::size_t w = 0; w < shape_nchw[3]; ++w)
                {
                    auto a_val = A_nchw(n, c, h, w);
                    functor(B_nhwc(n, h, w, c), a_val);
                }
}

int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    const int N = 120;
    const int C = 128;
    const int H = 32;
    const int W = 1024;

    /**const int N = 120;
    const int H = 32;
    const int W = 64;
    const int C = 128;**/

    std::vector<std::size_t> nchw = {N, C, H, W};
    std::vector<std::size_t> nhwc = {N, H, W, C};
    Tensor<ADataType> a(nchw);
    Tensor<BDataType> b(nhwc);

    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});

    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());

    // LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

    std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
    std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
    std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
    auto argument         = broadcastPermute.MakeArgumentPointer(
        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    };

    std::cout << "A (nchw): " << a.mDesc << std::endl;
    std::cout << "B (nhwc): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();

    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});

    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];

    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;

    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
        // LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
            host_b, a, nchw, PassThrough{});

        // LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

    return pass ? 0 : 1;
}
example/65_hip_tensor_permute/elementwise_permute_4D_fp32.cpp → example/65_hip_tensor_permute/elementwise_permute_4D_fp32_ht.cpp (view file @ 4498e2a1)
@@ -3,7 +3,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl_ht.hpp"
 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -18,25 +18,35 @@ using ADataType = F32;
 using BDataType = F32;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Square      = ck::tensor_operation::element_wise::UnarySquare;
+// ck::index_t scalar_mult = 2;

 using DeviceElementwisePermuteInstance =
     ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
                                                         ck::Tuple<BDataType>,
                                                         PassThrough,
+                                                        Square,
                                                         4,
                                                         8,
+                                                        2,
                                                         ck::Sequence<8>,
                                                         ck::Sequence<1>>;

-template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
+template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
+void host_elementwise4D(HostTensorB& B_nhwc,
+                        const HostTensorA& A_nchw,
+                        FunctorA functor_a,
+                        FunctorB functor_b)
 {
     for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
         for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
             for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
                 for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
                 {
+                    ADataType tmp_val;
                     auto a_val = A_nchw(n, c, h, w);
-                    functor(B_nhwc(n, h, w, c), a_val);
+                    functor_b(tmp_val, a_val);
+                    functor_a(B_nhwc(n, h, w, c), 2 * tmp_val);
                 }
 }
@@ -74,7 +84,7 @@ int main()
     auto broadcastPermute = DeviceElementwisePermuteInstance{};
     auto argument         = broadcastPermute.MakeArgumentPointer(
-        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
+        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}, Square{});

     if(!broadcastPermute.IsSupportedArgument(argument.get()))
     {
@@ -106,7 +116,7 @@ int main()
     {
         b_device_buf.FromDevice(b.mData.data());
         Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{});
+        host_elementwise4D(host_b, a, PassThrough{}, Square{});

         pass &=
             ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
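Reading the updated host reference loop: the unary operator is applied first, and the scalar factor (hard-coded as 2 in the host code) is applied before the elementwise op. Per element, the expected result for this example is therefore:

    // Host-side semantics of the new reference loop, per element (as read from the diff above):
    // tmp = Square(a)            -> tmp = a * a
    // b   = PassThrough(2 * tmp) -> b   = 2 * a * a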
include/ck/tensor_operation/gpu/device/device_elementwise_ht.hpp (new file, 0 → 100644; view file @ 4498e2a1)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <memory>
#include <array>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          index_t NumDim>
struct DeviceElementwise : public BaseOperator
{
    static constexpr int NumInput  = InDataTypeTuple::Size();
    static constexpr int NumOutput = OutDataTypeTuple::Size();

    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
                        const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
                        const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
                        const std::array<const void*, NumInput> in_dev_buffers,
                        const std::array<void*, NumOutput> out_dev_buffers,
                        ElementwiseOperation elementwise_op,
                        UnaryOperation unary_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
// namespace device

template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          index_t NumDim>
using DeviceElementwisePtr = std::unique_ptr<DeviceElementwise<InDataTypeTuple,
                                                               OutDataTypeTuple,
                                                               ElementwiseOperation,
                                                               UnaryOperation,
                                                               NumDim>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
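Compared with the existing DeviceElementwise base class, the interface difference is the extra UnaryOperation parameter on MakeArgumentPointer. A hedged usage sketch, with argument names taken from the fp32_ht example above (the trailing Square{} is the new unary-op argument):

    // Sketch: build the argument with both functors and check device support before running.
    auto op       = DeviceElementwisePermuteInstance{};
    auto argument = op.MakeArgumentPointer(ab_lengths,
                                           {a_strides},   // one stride array per input
                                           {b_strides},   // one stride array per output
                                           input,         // std::array<const void*, NumInput>
                                           output,        // std::array<void*, NumOutput>
                                           PassThrough{}, // ElementwiseOperation
                                           Square{});     // UnaryOperation
    if(!op.IsSupportedArgument(argument.get())) { /* fall back or abort */ }
    auto invoker = op.MakeInvokerPointer();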
include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl_ht.hpp (new file, 0 → 100644; view file @ 4498e2a1)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_ht.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_ht.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          index_t NumDim,
          index_t MPerThread,
          index_t ScalarMult,
          typename InScalarPerVectorSeq,
          typename OutScalarPerVectorSeq>
struct DeviceElementwiseImpl : public DeviceElementwise<InDataTypeTuple,
                                                        OutDataTypeTuple,
                                                        ElementwiseOperation,
                                                        UnaryOperation,
                                                        NumDim>
{
    static constexpr int NumInput  = InDataTypeTuple::Size();
    static constexpr int NumOutput = OutDataTypeTuple::Size();

    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
                      NumOutput == OutScalarPerVectorSeq::Size(),
                  "Tuple size is inconsistent with the number of in/out!");

    static auto GenerateInDataTypePointerTuple()
    {
        return generate_tuple(
            [&](auto I) {
                using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;

                return static_cast<const DataType*>(nullptr);
            },
            Number<NumInput>{});
    };

    static auto GenerateOutDataTypePointerTuple()
    {
        return generate_tuple(
            [&](auto I) {
                using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;

                return static_cast<DataType*>(nullptr);
            },
            Number<NumOutput>{});
    };

    using InDataTypePointerTuple  = decltype(GenerateInDataTypePointerTuple());
    using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());

    template <typename Desc_M>
    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
    {
        constexpr auto I0 = Number<0>{};

        const auto m            = desc_m.GetLength(I0);
        const index_t loop_step = gridSize * blockSize * MPerThread;
        const auto pad          = math::integer_least_multiple(m, loop_step) - m;
        const auto desc_m_pad =
            transform_tensor_descriptor(desc_m,
                                        make_tuple(make_right_pad_transform(m, pad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        return desc_m_pad;
    }

    static auto MakeDescriptor_M(const std::array<index_t, NumDim>& lengths,
                                 const std::array<index_t, NumDim>& stride,
                                 index_t gridSize,
                                 index_t blockSize)
    {
        auto tupleOfShape  = generate_tuple([&](auto I) { return lengths[I]; }, Number<NumDim>{});
        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<NumDim>{});

        // nd desc - [s0, s1, s2, ...]
        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);

        // merge nd to 1d desc - [s0 * s1 * ...]
        if constexpr(NumDim > 1)
        {
            const auto desc_m = transform_tensor_descriptor(
                desc,
                make_tuple(make_merge_transform(tupleOfShape)),
                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NumDim>{})),
                make_tuple(Sequence<0>{}));

            return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
        }
        else
            return PadDescriptor_M_1d(desc, gridSize, blockSize);
    }

    template <index_t TupleSize>
    static auto GenerateInOutGrid1dDescTuple(Number<TupleSize>)
    {
        return generate_tuple(
            [&](auto) {
                if constexpr(NumDim > 1)
                {
                    return MakeDescriptor_M({1, 1}, {1, 1}, 1, 1);
                }
                else
                {
                    return MakeDescriptor_M({1}, {1}, 1, 1);
                };
            },
            Number<TupleSize>{});
    };

    using InGrid1dDescTuple  = decltype(GenerateInOutGrid1dDescTuple(Number<NumInput>{}));
    using OutGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number<NumOutput>{}));

    using GridwiseElementwise = GridwiseElementwise_1D<InGrid1dDescTuple,
                                                       OutGrid1dDescTuple,
                                                       InDataTypePointerTuple,
                                                       OutDataTypePointerTuple,
                                                       ElementwiseOperation,
                                                       UnaryOperation,
                                                       MPerThread,
                                                       ScalarMult,
                                                       InScalarPerVectorSeq,
                                                       OutScalarPerVectorSeq>;

    struct Argument : public BaseArgument
    {
        Argument(const std::array<index_t, NumDim> lengths,
                 const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
                 const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
                 const std::array<const void*, NumInput> in_dev_buffers,
                 const std::array<void*, NumOutput> out_dev_buffers,
                 ElementwiseOperation elementwise_op,
                 UnaryOperation unary_op)
            : lengths_(lengths),
              inStridesArray_(inStridesArray),
              outStridesArray_(outStridesArray),
              elementwise_op_(elementwise_op),
              unary_op_(unary_op),
              blockSize_(256)
        {
            in_dev_buffers_ = generate_tuple(
                [&](auto I) {
                    using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
                    return static_cast<const DataType*>(in_dev_buffers[I.value]);
                },
                Number<NumInput>{});

            out_dev_buffers_ = generate_tuple(
                [&](auto I) {
                    using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
                    return static_cast<DataType*>(out_dev_buffers[I.value]);
                },
                Number<NumOutput>{});
        }

        InDataTypePointerTuple in_dev_buffers_;
        OutDataTypePointerTuple out_dev_buffers_;

        std::array<index_t, NumDim> lengths_;
        std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
        std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;

        ElementwiseOperation elementwise_op_;
        UnaryOperation unary_op_;
        index_t blockSize_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            index_t gridSize = getAvailableComputeUnitCount(stream_config);

            auto in_grid_1d_desc_tuple = generate_tuple(
                [&](auto I) {
                    return MakeDescriptor_M(
                        arg.lengths_, arg.inStridesArray_[I.value], gridSize, arg.blockSize_);
                },
                Number<NumInput>{});

            auto out_grid_1d_desc_tuple = generate_tuple(
                [&](auto I) {
                    return MakeDescriptor_M(
                        arg.lengths_, arg.outStridesArray_[I.value], gridSize, arg.blockSize_);
                },
                Number<NumOutput>{});

            const auto kernel = kernel_elementwise_1d<GridwiseElementwise,
                                                      InGrid1dDescTuple,
                                                      OutGrid1dDescTuple,
                                                      InDataTypePointerTuple,
                                                      OutDataTypePointerTuple,
                                                      ElementwiseOperation,
                                                      UnaryOperation>;

            float elapsed_time = launch_and_time_kernel(stream_config,
                                                        kernel,
                                                        dim3(gridSize),
                                                        dim3(arg.blockSize_),
                                                        0,
                                                        in_grid_1d_desc_tuple,
                                                        out_grid_1d_desc_tuple,
                                                        arg.in_dev_buffers_,
                                                        arg.out_dev_buffers_,
                                                        arg.elementwise_op_,
                                                        arg.unary_op_);
            return elapsed_time;
        }

        // polymorphic
        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    static bool IsSupportedArgument(const Argument& arg)
    {
        if(arg.lengths_.back() % MPerThread != 0)
            return false;

        auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
                                          const std::array<index_t, NumDim>& strides,
                                          index_t scalarPerVector) {
            if(strides.back() == 1 && lengths.back() % scalarPerVector == 0)
                return true;

            if(strides.back() != 1 && scalarPerVector == 1)
                return true;

            return false;
        };

        bool valid = true;
        static_for<0, NumInput, 1>{}([&](auto I) {
            if(!IsScalarPerVectorValid(
                   arg.lengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
                valid = false;
        });

        static_for<0, NumOutput, 1>{}([&](auto I) {
            if(!IsScalarPerVectorValid(
                   arg.lengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
                valid = false;
        });

        return valid;
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }

    static auto
    MakeArgument(const std::array<index_t, NumDim> lengths,
                 const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
                 const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
                 const std::array<const void*, NumInput> in_dev_buffers,
                 const std::array<void*, NumOutput> out_dev_buffers,
                 ElementwiseOperation elementwise_op,
                 UnaryOperation unary_op)
    {
        return Argument{lengths,
                        inStridesArray,
                        outStridesArray,
                        in_dev_buffers,
                        out_dev_buffers,
                        elementwise_op,
                        unary_op};
    }

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
                        const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
                        const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
                        const std::array<const void*, NumInput> in_dev_buffers,
                        const std::array<void*, NumOutput> out_dev_buffers,
                        ElementwiseOperation elementwise_op,
                        UnaryOperation unary_op) override
    {
        return std::make_unique<Argument>(lengths,
                                          inStridesArray,
                                          outStridesArray,
                                          in_dev_buffers,
                                          out_dev_buffers,
                                          elementwise_op,
                                          unary_op);
    }

    static auto MakeInvoker() { return Invoker{}; }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };
};
// namespace device
} // namespace device
} // namespace tensor_operation
} // namespace ck
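A quick worked example of the launch and padding arithmetic in this file, assuming for illustration a GPU that reports 120 compute units (so gridSize = 120): with blockSize_ = 256 and MPerThread = 8, loop_step = 120 * 256 * 8 = 245760 elements per grid-wide iteration. For the N = 120, C = 128, H = 32, W = 1024 tensors used in the examples, the merged 1D length is M = 120 * 128 * 32 * 1024 = 503316480 = 2048 * 245760, so PadDescriptor_M_1d adds no padding and the gridwise kernel below runs num_iter = 2048 loop iterations per thread.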
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_ht.hpp (new file, 0 → 100644; view file @ 4498e2a1)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#define UNUSED(expr)  \
    do                \
    {                 \
        (void)(expr); \
    } while(0)

namespace ck {

template <typename GridwiseElementwise1dFunctor,
          typename InGrid1dDescTuple,
          typename OutGrid1dDescTuple,
          typename InDataTypePointerTuple,
          typename OutDataTypePointerTuple,
          typename ElementwiseOperation,
          typename UnaryOperation>
__global__ void kernel_elementwise_1d(const InGrid1dDescTuple in_grid_1d_desc_tuple,
                                      const OutGrid1dDescTuple out_grid_1d_desc_tuple,
                                      const InDataTypePointerTuple p_in_global_tuple,
                                      const OutDataTypePointerTuple p_out_global_tuple,
                                      const ElementwiseOperation elementwise_op,
                                      const UnaryOperation unary_op)
{
    GridwiseElementwise1dFunctor::Run(in_grid_1d_desc_tuple,
                                      out_grid_1d_desc_tuple,
                                      p_in_global_tuple,
                                      p_out_global_tuple,
                                      elementwise_op,
                                      unary_op);
}

template <typename InGrid1dDescTuple,
          typename OutGrid1dDescTuple,
          typename InDataTypePointerTuple,
          typename OutDataTypePointerTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          index_t MPerThread,
          index_t ScalarMult,
          typename InScalarPerVectorSeq,
          typename OutScalarPerVectorSeq>
struct GridwiseElementwise_1D
{
    static constexpr index_t NumInput  = InDataTypePointerTuple::Size();
    static constexpr index_t NumOutput = OutDataTypePointerTuple::Size();

    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
                      NumOutput == OutScalarPerVectorSeq::Size() &&
                      NumInput == InGrid1dDescTuple::Size() &&
                      NumOutput == OutGrid1dDescTuple::Size(),
                  "Tuple size is inconsistent with the number of in/out!");

    static constexpr auto I0 = Number<0>{};

    static constexpr auto thread_buffer_desc_m =
        make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    __device__ static void Run(const InGrid1dDescTuple in_grid_1d_desc_tuple,
                               const OutGrid1dDescTuple out_grid_1d_desc_tuple,
                               const InDataTypePointerTuple p_in_global_tuple,
                               const OutDataTypePointerTuple p_out_global_tuple,
                               const ElementwiseOperation elementwise_op,
                               const UnaryOperation unary_op)
    {
        const index_t thread_global_id = get_thread_global_1d_id();

        auto in_thread_buf_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;

                return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, MPerThread, true>{};
            },
            Number<NumInput>{});

        auto tmp_thread_buf_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
                using DataType        = remove_pointer_t<DataTypePointer>;

                return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, MPerThread, true>{};
            },
            Number<NumInput>{});

        auto out_thread_buf_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
                using DataType        = remove_pointer_t<DataTypePointer>;

                return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, MPerThread, true>{};
            },
            Number<NumOutput>{});

        auto in_global_buf_tuple = generate_tuple(
            [&](auto I) {
                static_assert(in_grid_1d_desc_tuple[I].GetNumOfDimension() == 1);

                return make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_in_global_tuple[I], in_grid_1d_desc_tuple[I].GetElementSpaceSize());
            },
            Number<NumInput>{});

        auto out_global_buf_tuple = generate_tuple(
            [&](auto I) {
                static_assert(out_grid_1d_desc_tuple[I].GetNumOfDimension() == 1);

                return make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_out_global_tuple[I], out_grid_1d_desc_tuple[I].GetElementSpaceSize());
            },
            Number<NumOutput>{});

        const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread);

        const index_t blockSize    = get_block_size();
        const index_t blockPerGrid = get_grid_size();
        const auto M               = in_grid_1d_desc_tuple[I0].GetLength(I0);
        const index_t loop_step    = blockPerGrid * blockSize * MPerThread;
        const auto loop_step_index = make_multi_index(loop_step);

        auto in_global_load_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;

                return ThreadwiseTensorSliceTransfer_v2<DataType,
                                                        DataType,
                                                        decltype(in_grid_1d_desc_tuple[I]),
                                                        decltype(thread_buffer_desc_m),
                                                        Sequence<MPerThread>,        // SliceLengths
                                                        Sequence<0>,                 // DimAccessOrder
                                                        0,                           // SrcVectorDim
                                                        InScalarPerVectorSeq::At(I), // ScalarPerVector
                                                        1, // SrcScalarStrideInVector
                                                        false>{in_grid_1d_desc_tuple[I],
                                                               thread_global_offset};
            },
            Number<NumInput>{});

        auto out_global_store_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
                using DataType        = remove_pointer_t<DataTypePointer>;

                return ThreadwiseTensorSliceTransfer_v1r3<DataType,
                                                          DataType,
                                                          decltype(thread_buffer_desc_m),
                                                          decltype(out_grid_1d_desc_tuple[I]),
                                                          PassThroughOp,
                                                          Sequence<MPerThread>, // SliceLengths
                                                          Sequence<0>,          // DimAccessOrder
                                                          0,                    // SrcVectorDim
                                                          OutScalarPerVectorSeq::At(I),
                                                          InMemoryDataOperationEnum::Set,
                                                          1,
                                                          false>(out_grid_1d_desc_tuple[I],
                                                                 thread_global_offset,
                                                                 PassThroughOp{});
            },
            Number<NumOutput>{});

        const auto& scalar = ScalarMult;
        index_t num_iter   = M / (loop_step);
        do
        {
            static_for<0, NumInput, 1>{}([&](auto I) {
                in_global_load_tuple(I).Run(in_grid_1d_desc_tuple[I],
                                            in_global_buf_tuple[I],
                                            thread_buffer_desc_m,
                                            make_tuple(I0),
                                            in_thread_buf_tuple(I));

                in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_1d_desc_tuple[I],
                                                           loop_step_index);
            });

            // static_for<0, MPerThread, 1>{}(
            //     [&](auto I){
            //         InDataTypePointerTuple tmp;
            //         unary_op(in_thread_buf_tuple(I), in_thread_buf_tuple(I));
            //         in_thread_buf_tuple(I) = tmp;
            // });

            static_for<0, MPerThread, 1>{}([&](auto iM) {
                // tmp_thread_buf_tuple = [&](auto I){ unary_op(in_thread_buf_tuple(I)(iM),
                // in_thread_buf_tuple(I)(iM)); }; unary_op(in_thread_buf_tuple(iM),
                // in_thread_buf_tuple(iM));

                // get reference to in data
                auto uop_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto I) -> auto& { return in_thread_buf_tuple(I)(iM); },
                    Number<NumInput>{});

                // get reference to dst data
                auto out_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto I) -> auto& { return out_thread_buf_tuple(I)(iM); },
                    Number<NumOutput>{});

                unpack2(unary_op, uop_data_refs, uop_data_refs);

                const auto in_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto I) -> const auto& { return in_thread_buf_tuple(I)(iM) *= scalar; },
                    Number<NumInput>{});

                unpack2(elementwise_op, out_data_refs, in_data_refs);

                UNUSED(tmp_thread_buf_tuple);
            });

            static_for<0, NumOutput, 1>{}([&](auto I) {
                out_global_store_tuple(I).Run(thread_buffer_desc_m,
                                              make_tuple(I0),
                                              out_thread_buf_tuple[I],
                                              out_grid_1d_desc_tuple[I],
                                              out_global_buf_tuple(I));

                out_global_store_tuple(I).MoveDstSliceWindow(out_grid_1d_desc_tuple[I],
                                                             loop_step_index);
            });
        } while(--num_iter);
    }
};
} // namespace ck
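To summarize the inner loop of GridwiseElementwise_1D::Run: each thread handles MPerThread contiguous elements per grid iteration, applies the unary op in place, then the compile-time ScalarMult factor, then the elementwise op, and finally advances its slice window by loop_step. A simplified per-element sketch, ignoring the tuple machinery for multiple inputs and outputs (the in/out register buffers here are illustrative names, not the actual types):

    // Simplified view of one thread's work per loop iteration (single input/output case):
    // 1. load MPerThread elements from global memory into registers (in[])
    for(index_t iM = 0; iM < MPerThread; ++iM)
    {
        unary_op(in[iM], in[iM]);        // e.g. Square: in[iM] = in[iM] * in[iM]
        in[iM] *= ScalarMult;            // compile-time scalar multiplication
        elementwise_op(out[iM], in[iM]); // e.g. PassThrough: out[iM] = in[iM]
    }
    // 2. store MPerThread elements back to global memory, then advance by loop_step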