Revert "Transpose 3d (#984)"

This reverts commit 3af8c81a.

Revert "Transpose 3d (#984)"
This reverts commit 3af8c81a.
f6b8f18f · Jun Liu · e1fa0091 · e1fa0091 · e1fa0091 · f6b8f18f
Commit f6b8f18f authored Nov 16, 2023 by Jun Liu
19 changed files
--- a/client_example/23_elementwise_transpose/CMakeLists.txt
+++ b/client_example/23_elementwise_transpose/CMakeLists.txt
-add_executable(client_elementwise_transpose3d elementwise_transpose_3d.cpp)
-target_link_libraries(client_elementwise_transpose3d PRIVATE composable_kernel::device_operations)
--- a/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
+++ b/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iomanip>
-#include <vector>
-#include <iostream>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using ADataType = F16;
-using BDataType = F16;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-struct SimpleDeviceMem
-{
-    SimpleDeviceMem() = delete;
-
-    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
-    {
-        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
-    }
-
-    void* GetDeviceBuffer() { return p_mem_; }
-
-    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
-
-    void* p_mem_;
-};
-
-int main()
-{
-    const int N = 16;
-    const int C = 8;
-    const int D = 8;
-    const int H = 8;
-    const int W = 8;
-
-    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
-    std::vector<std::size_t> nchwd = {N, C, H, W, D};
-    auto size                      = N * C * D * H * W;
-
-    std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
-    std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
-    std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, C, H, W, D
-
-    SimpleDeviceMem a_dev_buf(sizeof(ADataType) * size);
-    SimpleDeviceMem b_dev_buf(sizeof(BDataType) * size);
-
-    std::array<const void*, 1> input = {a_dev_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output      = {b_dev_buf.GetDeviceBuffer()};
-
-    using DeviceElementwisePermuteInstance = ck::tensor_operation::device::
-        DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, PassThrough, 5>;
-
-    // get device op instances
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceElementwisePermuteInstance>::GetInstances();
-
-    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
-    std::string best_op_name;
-    bool found            = false;
-    int best_op_id        = -1;
-    float best_ave_time   = std::numeric_limits<float>::max();
-    float best_gb_per_sec = 0;
-
-    // profile device operation instances
-    std::cout << "Run all instances and do timing" << std::endl;
-
-    for(int i = 0; i < op_ptrs.size(); ++i)
-    {
-        auto& op_ptr = op_ptrs[i];
-
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
-
-        auto invoker_ptr = op_ptr->MakeInvokerPointer();
-
-        std::string op_name = op_ptr->GetTypeString();
-
-        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
-
-            std::size_t num_byte =
-                sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
-                sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
-
-            float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
-                      << op_name << std::endl;
-
-            if(ave_time < best_ave_time)
-            {
-                found           = true;
-                best_op_id      = i;
-                best_op_name    = op_name;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
-            }
-        }
-        else
-        {
-            std::cout << op_name << " does not support this problem" << std::endl;
-        }
-    }
-
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
-
-    // run the best intance
-    {
-        auto& op_ptr = op_ptrs[best_op_id];
-        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
-                  << std::endl;
-
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
-
-        auto invoker_ptr = op_ptr->MakeInvokerPointer();
-
-        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
-        }
-
-        std::cout << "Done" << std::endl;
-    }
-
-    return 0;
-}
--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
@@ -4,5 +4,3 @@ add_example_executable(example_elementwise_permute_4D_fp32_row elementwise_permu
 add_example_executable(example_elementwise_permute_4D_fp16_row elementwise_permute_4D_fp16_row.cpp)
 add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permute_4D_fp32_col.cpp)
 add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
-add_example_executable(example_elementwise_permute elementwise_permute.cpp)
-add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
--- a/example/44_elementwise_permute/elementwise_permute.cpp
+++ b/example/44_elementwise_permute/elementwise_permute.cpp
-#include <iostream>
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
-
-#include "ck/library/utility/algorithm.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using ADataType = F16;
-using BDataType = F16;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                        PassThrough,          // ElementwiseOp
-                                                        5,                    // NumDim
-                                                        8,                    // MPerThread
-                                                        ck::Sequence<1>,  // InScalarPerVectorSeq
-                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
-
-template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
-{
-    for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
-        for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
-            for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
-                for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
-                    for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
-                    {
-                        auto a_val = A_ncdhw(n, c, d, h, w);
-                        functor(B_ndhwc(n, d, h, w, c), a_val);
-                    }
-}
-
-int main()
-{
-    bool do_verification = true;
-    bool time_kernel     = true;
-
-    std::vector<std::size_t> ncdhw = {16, 8, 8, 8, 8};
-    std::vector<std::size_t> ndhwc = {16, 8, 8, 8, 8};
-    Tensor<ADataType> a(ncdhw);
-    Tensor<BDataType> b(ndhwc);
-
-    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-
-    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-
-    a_device_buf.ToDevice(a.mData.data());
-
-    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
-
-    std::array<ck::index_t, 5> ab_lengths;
-    /**std::array<ck::index_t, 5> a_strides = {
-        static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
-        static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4]),
-        static_cast<int>(ncdhw[3] * ncdhw[4]),
-        static_cast<int>(ncdhw[4]),
-        1};
-    std::array<ck::index_t, 5> b_strides = {
-        static_cast<int>(ndhwc[1] * ndhwc[2] * ndhwc[3] * ndhwc[4]),
-        static_cast<int>(ndhwc[2] * ndhwc[3] * ndhwc[4]),
-        1,
-        static_cast<int>(ndhwc[3] * ndhwc[4]),
-        static_cast<int>(ndhwc[4])};**/
-
-    std::array<ck::index_t, 5> a_strides = {
-        static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
-        static_cast<int>(ncdhw[3] * ncdhw[4]),
-        static_cast<int>(ncdhw[4]),
-        1,
-        static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4])};
-
-    std::array<ck::index_t, 5> b_strides = {
-        static_cast<int>(ndhwc[1] * ndhwc[2] * ndhwc[3] * ndhwc[4]),
-        static_cast<int>(ndhwc[2] * ndhwc[3] * ndhwc[4]),
-        static_cast<int>(ndhwc[3] * ndhwc[4]),
-        static_cast<int>(ndhwc[4]),
-        1};
-    ck::ranges::copy(ncdhw, ab_lengths.begin());
-
-    auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument         = broadcastPermute.MakeArgumentPointer(
-        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
-
-    if(!broadcastPermute.IsSupportedArgument(argument.get()))
-    {
-        throw std::runtime_error(
-            "The runtime parameters seems not supported by the device instance, exiting!");
-    };
-
-    std::cout << "A (ncdhw): " << a.mDesc << std::endl;
-    std::cout << "B (ndhwc): " << b.mDesc << std::endl;
-
-    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
-    float ave_time =
-        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
-    std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
-
-    std::size_t num_btype =
-        sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
-        sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-
-    bool pass = true;
-
-    if(do_verification)
-    {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(ndhwc);
-        host_elementwise4D(host_b, a, PassThrough{});
-
-        pass &=
-            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
-    }
-
-    return pass ? 0 : 1;
-}
--- a/example/44_elementwise_permute/elementwise_permute_3d.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_3d.cpp
-#include <iostream>
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
-
-#include "ck/library/utility/algorithm.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using ADataType = F16;
-using BDataType = F16;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                          ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                          PassThrough,          // ElementwiseOp
-                                                          2,                    // NumDim_m, {N, C}
-                                                          2,                    // NumDim_n, {H, W}
-                                                          1,                    // NumDim_k, {D}
-                                                          8,                    // MPerThread
-                                                          8,                    // NPerThread
-                                                          8,                    // KPerThread
-                                                          ck::Sequence<8>,  // InScalarPerVectorSeq
-                                                          ck::Sequence<4>>; // OutScalarPerVectorSeq
-
-template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
-{
-    for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
-        for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
-            for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
-                for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
-                    for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
-                    {
-                        auto a_val = A_ncdhw(n, c, d, h, w);
-                        functor(B_ndhwc(n, d, h, w, c), a_val);
-                    }
-}
-
-int main()
-{
-    bool do_verification = true;
-    bool time_kernel     = true;
-
-    const int N = 4;
-    const int C = 16;
-    const int H = 32;
-    const int W = 5;
-    const int D = 16;
-
-    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
-    std::vector<std::size_t> ndhwc = {N, D, H, W, C};
-    Tensor<ADataType> a(ncdhw);
-    Tensor<BDataType> b(ndhwc);
-
-    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-
-    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-
-    a_device_buf.ToDevice(a.mData.data());
-
-    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
-
-    std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
-    std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
-    std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
-
-    auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument         = broadcastPermute.MakeArgumentPointer(
-        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
-
-    if(!broadcastPermute.IsSupportedArgument(argument.get()))
-    {
-        throw std::runtime_error(
-            "The runtime parameters seems not supported by the device instance, exiting!");
-    };
-
-    std::cout << "A (ncdhw): " << a.mDesc << std::endl;
-    std::cout << "B (ndhwc): " << b.mDesc << std::endl;
-
-    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
-    float ave_time =
-        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
-    std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
-
-    std::size_t num_btype =
-        sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
-        sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-
-    bool pass = true;
-
-    if(do_verification)
-    {
-        b_device_buf.FromDevice(b.mData.data());
-        Tensor<BDataType> host_b(ndhwc);
-        host_elementwise4D(host_b, a, PassThrough{});
-
-        pass &=
-            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
-    }
-
-    return pass ? 0 : 1;
-}
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
@@ -19,13 +19,13 @@ using BDataType = F16;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                        PassThrough,          // Elementwise op
-                                                        4,                    // NumDim
-                                                        8,                    // MPerThread
-                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
-                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
+    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
+                                                        ck::Tuple<BDataType>,
+                                                        PassThrough,
+                                                        4,
+                                                        8,
+                                                        ck::Sequence<8>,
+                                                        ck::Sequence<1>>;

 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
@@ -99,6 +99,7 @@ int main()

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;
+
    bool pass = true;

    if(do_verification)

--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
@@ -17,15 +17,15 @@ using BDataType = F16;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                          ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                          PassThrough,          // Elementwise op
+    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
+                                                          ck::Tuple<BDataType>,
+                                                          PassThrough,
                                                          3, // NumDim_M
                                                          1, // NumDim_N
-                                                          1,                    // MPerThread
-                                                          1,                    // NPerThread
-                                                          ck::Sequence<1>,  // InScalarPerVectorSeq
-                                                          ck::Sequence<1>>; // OutScalarPerVectorSeq
+                                                          8,
+                                                          8,
+                                                          ck::Sequence<8>,
+                                                          ck::Sequence<8>>;

 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nhwc,
@@ -53,6 +53,12 @@ int main()
    const int H = 32;
    const int W = 1024;

+    /**const int N = 120;
+    const int H = 32;
+    const int W = 64;
+
+    const int C = 128;**/
+
    std::vector<std::size_t> nchw = {N, C, H, W};
    std::vector<std::size_t> nhwc = {N, H, W, C};

@@ -65,6 +71,7 @@ int main()
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());
+    // LogRangeAsType<float>(std::cout << "Tensor a  : ", a.mData, ",") << std::endl;

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
@@ -108,10 +115,13 @@ int main()
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
+        // LogRangeAsType<float>(std::cout << "Tensor b  : ", b.mData, ",") << std::endl;

        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
            host_b, a, nchw, PassThrough{});
+
+        // LogRangeAsType<float>(std::cout << "Host b  : ", host_b.mData, ",") << std::endl;
        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-
-#include "ck/utility/math.hpp"
-#include "ck/utility/sequence.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp"
-#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-
-#include "ck/host_utility/kernel_launch.hpp"
-#include "ck/host_utility/stream_utility.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-template <typename InDataTypeTuple,
-          typename OutDataTypeTuple,
-          typename ElementwiseOperation,
-          index_t NumDim_m, // choose how to set dims
-          index_t NumDim_n,
-          index_t NumDim_k,
-          index_t MPerThread,
-          index_t NPerThread,
-          index_t KPerThread,
-          typename InScalarPerVectorSeq,
-          typename OutScalarPerVectorSeq>
-struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
-                                                          OutDataTypeTuple,
-                                                          ElementwiseOperation,
-                                                          NumDim_m + NumDim_n + NumDim_k>
-{
-    static constexpr index_t NumDim = NumDim_m + NumDim_n + NumDim_k;
-
-    static constexpr int NumInput  = InDataTypeTuple::Size();
-    static constexpr int NumOutput = OutDataTypeTuple::Size();
-
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-    static constexpr auto I2 = Number<2>{};
-    static constexpr auto I3 = Number<3>{};
-    static constexpr auto I4 = Number<4>{};
-
-    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
-                      NumOutput == OutScalarPerVectorSeq::Size(),
-                  "Tuple size is inconsistent with the number of in/out!");
-
-    static auto GenerateInDataTypePointerTuple()
-    {
-        return generate_tuple(
-            [&](auto I) {
-                using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
-
-                return static_cast<const DataType*>(nullptr);
-            },
-            Number<NumInput>{});
-    }
-
-    static auto GenerateOutDataTypePointerTuple()
-    {
-        return generate_tuple(
-            [&](auto I) {
-                using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
-
-                return static_cast<DataType*>(nullptr);
-            },
-            Number<NumOutput>{});
-    }
-
-    using InDataTypePointerTuple  = decltype(GenerateInDataTypePointerTuple());
-    using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());
-
-    template <typename Desc_MNK>
-    static auto PadDescriptor_MNK(Desc_MNK desc_mnk,
-                                  index_t gridSize,
-                                  index_t blockSize,
-                                  index_t num_threads_m,
-                                  index_t num_threads_n,
-                                  index_t num_threads_k)
-    {
-        std::ignore = blockSize;
-        std::ignore = gridSize;
-
-        const auto m = desc_mnk.GetLength(I0);
-        const auto n = desc_mnk.GetLength(I1);
-        const auto k = desc_mnk.GetLength(I2);
-
-        const index_t loop_step_m = num_threads_m * MPerThread;
-        const index_t loop_step_n = num_threads_n * NPerThread;
-        const index_t loop_step_k = num_threads_k * KPerThread;
-
-        const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m;
-        const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n;
-        const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k;
-
-        const auto desc_mnk_pad =
-            transform_tensor_descriptor(desc_mnk,
-                                        make_tuple(make_right_pad_transform(m, pad_m),
-                                                   make_right_pad_transform(n, pad_n),
-                                                   make_right_pad_transform(k, pad_k)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-        return desc_mnk_pad;
-    }
-
-    static auto MakeDescriptor_MNK(const std::array<index_t, NumDim>& lengths,
-                                   const std::array<index_t, NumDim>& stride,
-                                   index_t gridSize,
-                                   index_t blockSize,
-                                   index_t num_threads_m,
-                                   index_t num_threads_n,
-                                   index_t num_threads_k)
-    {
-        auto tupleOfShape  = generate_tuple([&](auto I) { return lengths[I]; }, Number<NumDim>{});
-        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<NumDim>{});
-
-        // nd desc - [s0, s1, s2, ...]
-        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
-
-        constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDim_m, 1>::type();
-        constexpr auto nDimIds =
-            typename arithmetic_sequence_gen<NumDim_m, NumDim_m + NumDim_n, 1>::type();
-        constexpr auto kDimIds =
-            typename arithmetic_sequence_gen<NumDim_m + NumDim_n, NumDim, 1>::type();
-
-        const auto mLengths = get_container_subset(tupleOfShape, mDimIds);
-        const auto nLengths = get_container_subset(tupleOfShape, nDimIds);
-        const auto kLengths = get_container_subset(tupleOfShape, kDimIds);
-
-        // merge nd to 3d desc - [s0 * s1 * ...]
-        if constexpr(NumDim > 3)
-        {
-            const auto desc_mnk = transform_tensor_descriptor(
-                desc,
-                make_tuple(make_merge_transform(mLengths),
-                           make_merge_transform(nLengths),
-                           make_merge_transform(kLengths)),
-                make_tuple(mDimIds, nDimIds, kDimIds),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
-            return PadDescriptor_MNK(
-                desc_mnk, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k);
-        }
-        else
-            return PadDescriptor_MNK(
-                desc, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k);
-    }
-
-    template <index_t TupleSize>
-    static auto GenerateInOutGrid3dDescTuple(Number<TupleSize>)
-    {
-        return generate_tuple(
-            [&](auto) {
-                if constexpr(NumDim > 3)
-                {
-                    return MakeDescriptor_MNK({1, 1, 1}, {1, 1, 1}, 1, 1, 1, 1, 1);
-                }
-                else
-                {
-                    return MakeDescriptor_MNK({1}, {1}, 1, 1, 1, 1, 1);
-                };
-            },
-            Number<TupleSize>{});
-    }
-
-    using OutGrid3dDescTuple = decltype(GenerateInOutGrid3dDescTuple(Number<NumOutput>{}));
-    using InGrid3dDescTuple  = decltype(GenerateInOutGrid3dDescTuple(Number<NumInput>{}));
-
-    using GridwiseElementwise = GridwiseElementwise_3D<InGrid3dDescTuple,
-                                                       OutGrid3dDescTuple,
-                                                       InDataTypePointerTuple,
-                                                       OutDataTypePointerTuple,
-                                                       ElementwiseOperation,
-                                                       MPerThread,
-                                                       NPerThread,
-                                                       KPerThread,
-                                                       InScalarPerVectorSeq,
-                                                       OutScalarPerVectorSeq>;
-
-    struct Argument : public BaseArgument
-    {
-        Argument(const std::array<index_t, NumDim> lengths,
-                 const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
-                 const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
-                 const std::array<const void*, NumInput> in_dev_buffers,
-                 const std::array<void*, NumOutput> out_dev_buffers,
-                 ElementwiseOperation elementwise_op)
-
-            : lengths_(lengths),
-              inStridesArray_(inStridesArray),
-              outStridesArray_(outStridesArray),
-              elementwise_op_(elementwise_op),
-              blockSize_(256)
-        {
-            static_assert(NumDim_m > 0, "");
-            static_assert(NumDim_n > 0, "");
-            static_assert(NumDim_k > 0, "");
-
-            in_dev_buffers_ = generate_tuple(
-                [&](auto I) {
-                    using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
-                    return static_cast<const DataType*>(in_dev_buffers[I.value]);
-                },
-                Number<NumInput>{});
-
-            out_dev_buffers_ = generate_tuple(
-                [&](auto I) {
-                    using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
-                    return static_cast<DataType*>(out_dev_buffers[I.value]);
-                },
-                Number<NumOutput>{});
-        }
-
-        InDataTypePointerTuple in_dev_buffers_;
-        OutDataTypePointerTuple out_dev_buffers_;
-
-        std::array<index_t, NumDim> lengths_;
-        std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
-        std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;
-
-        ElementwiseOperation elementwise_op_;
-        index_t blockSize_;
-    };
-
-    struct Invoker : public BaseInvoker
-    {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-        {
-            index_t gridSize      = getAvailableComputeUnitCount(stream_config) * arg.blockSize_;
-            index_t num_threads_m = gridSize / (16 * 16);
-            index_t num_threads_n = 16;
-            index_t num_threads_k = 16;
-
-            auto in_grid_3d_desc_tuple = generate_tuple(
-                [&](auto I) {
-                    return MakeDescriptor_MNK(arg.lengths_,
-                                              arg.inStridesArray_[I.value],
-                                              gridSize,
-                                              arg.blockSize_,
-                                              num_threads_m,
-                                              num_threads_n,
-                                              num_threads_k);
-                },
-                Number<NumInput>{});
-
-            auto out_grid_3d_desc_tuple = generate_tuple(
-                [&](auto I) {
-                    return MakeDescriptor_MNK(arg.lengths_,
-                                              arg.outStridesArray_[I.value],
-                                              gridSize,
-                                              arg.blockSize_,
-                                              num_threads_m,
-                                              num_threads_n,
-                                              num_threads_k);
-                },
-                Number<NumOutput>{});
-
-            const auto kernel = kernel_elementwise_3d<GridwiseElementwise,
-                                                      InGrid3dDescTuple,
-                                                      OutGrid3dDescTuple,
-                                                      InDataTypePointerTuple,
-                                                      OutDataTypePointerTuple,
-                                                      ElementwiseOperation>;
-
-            float elapsed_time = launch_and_time_kernel(stream_config,
-                                                        kernel,
-                                                        dim3(gridSize),
-                                                        dim3(arg.blockSize_),
-                                                        0,
-                                                        in_grid_3d_desc_tuple,
-                                                        out_grid_3d_desc_tuple,
-                                                        arg.in_dev_buffers_,
-                                                        arg.out_dev_buffers_,
-                                                        arg.elementwise_op_,
-                                                        num_threads_m,
-                                                        num_threads_n,
-                                                        num_threads_k);
-            return elapsed_time;
-        }
-
-        // polymorphic
-        float Run(const BaseArgument* p_arg,
-                  const StreamConfig& stream_config = StreamConfig{}) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
-        }
-    };
-
-    bool IsSupportedArgument(const BaseArgument* p_arg) override
-    {
-        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
-
-        if(pArg == nullptr)
-            return false;
-
-        if(pArg->lengths_.back() % MPerThread != 0)
-            return false;
-
-        auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
-                                          const std::array<index_t, NumDim>& strides,
-                                          index_t scalarPerVector,
-                                          index_t vectorDim) {
-            if(strides[vectorDim] == 1 &&
-               (lengths[vectorDim] % scalarPerVector == 0 ||
-                lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
-            {
-                return true;
-            }
-
-            if(strides[vectorDim] >= scalarPerVector)
-            {
-                return true;
-            }
-            return false;
-        };
-
-        bool valid = true;
-        static_for<0, NumInput, 1>{}([&](auto I) {
-            valid = valid && IsScalarPerVectorValid(pArg->lengths_,
-                                                    pArg->inStridesArray_[I.value],
-                                                    InScalarPerVectorSeq::At(I),
-                                                    NumDim_m - 1);
-        });
-
-        static_for<0, NumOutput, 1>{}([&](auto I) {
-            valid = valid && IsScalarPerVectorValid(pArg->lengths_,
-                                                    pArg->outStridesArray_[I.value],
-                                                    OutScalarPerVectorSeq::At(I),
-                                                    NumDim - 1);
-        });
-
-        return valid;
-    }
-
-    std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
-                        const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
-                        const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
-                        const std::array<const void*, NumInput> in_dev_buffers,
-                        const std::array<void*, NumOutput> out_dev_buffers,
-                        ElementwiseOperation elementwise_op) override
-    {
-        return std::make_unique<Argument>(lengths,
-                                          inStridesArray,
-                                          outStridesArray,
-                                          in_dev_buffers,
-                                          out_dev_buffers,
-                                          elementwise_op);
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
-    {
-        return std::make_unique<Invoker>();
-    }
-}; // namespace device
-
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp
-// SPDX-License-Identifier: MIT
-// // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#include "ck/tensor_description/cluster_descriptor.hpp"
-#include "ck/utility/data_type.hpp"
-#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-namespace ck {
-
-template <typename GridwiseElementwise3dFunctor,
-          typename InGrid3dDescTuple,
-          typename OutGrid3dDescTuple,
-          typename InDataTypePointerTuple,
-          typename OutDataTypePointerTuple,
-          typename ElementwiseOperation>
-__global__ void kernel_elementwise_3d(const InGrid3dDescTuple in_grid_3d_desc_tuple,
-                                      const OutGrid3dDescTuple out_grid_3d_desc_tuple,
-                                      const InDataTypePointerTuple p_in_global_tuple,
-                                      const OutDataTypePointerTuple p_out_global_tuple,
-                                      const ElementwiseOperation elementwise_op,
-                                      const index_t num_threads_m,
-                                      const index_t num_threads_n,
-                                      const index_t num_threads_k)
-{
-    GridwiseElementwise3dFunctor::Run(in_grid_3d_desc_tuple,
-                                      out_grid_3d_desc_tuple,
-                                      p_in_global_tuple,
-                                      p_out_global_tuple,
-                                      elementwise_op,
-                                      num_threads_m,
-                                      num_threads_n,
-                                      num_threads_k);
-}
-
-template <typename InGrid3dDescTuple,
-          typename OutGrid3dDescTuple,
-          typename InDataTypePointerTuple,
-          typename OutDataTypePointerTuple,
-          typename ElementwiseOperation,
-          index_t MPerThread,
-          index_t NPerThread,
-          index_t KPerThread,
-          typename InScalarPerVectorSeq,
-          typename OutScalarPerVectorSeq>
-struct GridwiseElementwise_3D
-{
-    static constexpr index_t NumInput  = InDataTypePointerTuple::Size();
-    static constexpr index_t NumOutput = OutDataTypePointerTuple::Size();
-
-    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
-                      NumOutput == OutScalarPerVectorSeq::Size() &&
-                      NumInput == InGrid3dDescTuple::Size() &&
-                      NumOutput == OutGrid3dDescTuple::Size(),
-                  "Tuple size is inconsistent with the number of in/out!");
-
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-    static constexpr auto I2 = Number<2>{};
-
-    static constexpr auto thread_buffer_desc_mnk = make_naive_tensor_descriptor_packed(
-        make_tuple(Number<MPerThread>{}, Number<NPerThread>{}, Number<KPerThread>{}));
-
-    using PassThroughOp = tensor_operation::element_wise::PassThrough;
-
-    __device__ static void Run(const InGrid3dDescTuple in_grid_3d_desc_tuple,
-                               const OutGrid3dDescTuple out_grid_3d_desc_tuple,
-                               const InDataTypePointerTuple p_in_global_tuple,
-                               const OutDataTypePointerTuple p_out_global_tuple,
-                               const ElementwiseOperation elementwise_op,
-                               const index_t num_threads_m,
-                               const index_t num_threads_n,
-                               const index_t num_threads_k)
-    {
-        auto in_thread_buf_tuple = generate_tuple(
-            [&](auto I) {
-                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
-                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;
-
-                return StaticBuffer<AddressSpaceEnum::Vgpr,
-                                    DataType,
-                                    MPerThread * NPerThread * KPerThread,
-                                    true>{};
-            },
-            Number<NumInput>{});
-
-        auto out_thread_buf_tuple = generate_tuple(
-            [&](auto I) {
-                using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
-                using DataType        = remove_pointer_t<DataTypePointer>;
-
-                return StaticBuffer<AddressSpaceEnum::Vgpr,
-                                    DataType,
-                                    MPerThread * NPerThread * KPerThread,
-                                    true>{};
-            },
-            Number<NumOutput>{});
-
-        auto in_global_buf_tuple = generate_tuple(
-            [&](auto I) {
-                return make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_in_global_tuple[I], in_grid_3d_desc_tuple[I].GetElementSpaceSize());
-            },
-            Number<NumInput>{});
-
-        auto out_global_buf_tuple = generate_tuple(
-            [&](auto I) {
-                return make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_out_global_tuple[I], out_grid_3d_desc_tuple[I].GetElementSpaceSize());
-            },
-            Number<NumOutput>{});
-
-        const auto M = in_grid_3d_desc_tuple[I0].GetLength(I0);
-        const auto N = in_grid_3d_desc_tuple[I0].GetLength(I1);
-        const auto K = in_grid_3d_desc_tuple[I0].GetLength(I2);
-
-        const index_t loop_step_m = num_threads_m * MPerThread;
-        const index_t loop_step_n = num_threads_n * NPerThread;
-        const index_t loop_step_k = num_threads_k * KPerThread;
-
-        const index_t thread_1d_id = get_thread_global_1d_id();
-
-        const index_t tid_m  = thread_1d_id / (num_threads_n * num_threads_k);
-        const index_t tid_nk = thread_1d_id % (num_threads_n * num_threads_k);
-        const index_t tid_n  = tid_nk / num_threads_k;
-        const index_t tid_k  = tid_nk % num_threads_k;
-
-        const auto thread_global_offset =
-            make_multi_index(tid_m * MPerThread, tid_n * NPerThread, tid_k * KPerThread);
-
-        auto in_global_load_tuple = generate_tuple(
-            [&](auto I) {
-                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
-                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;
-
-                return ThreadwiseTensorSliceTransfer_v2<
-                    DataType,
-                    DataType,
-                    decltype(in_grid_3d_desc_tuple[I]),
-                    decltype(thread_buffer_desc_mnk),
-                    Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
-                    Sequence<0, 1, 2>,                            // DimAccessOrder
-                    01,                                           // SrcVectorDim
-                    InScalarPerVectorSeq::At(I), // InScalarPerVectorSeq::At(I),                  //
-                                                 // ScalarPerVector
-                    1,                           // SrcScalarStrideInVector
-                    true>{in_grid_3d_desc_tuple[I], thread_global_offset};
-            },
-            Number<NumInput>{});
-
-        auto out_global_store_tuple = generate_tuple(
-            [&](auto I) {
-                using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
-                using DataType        = remove_pointer_t<DataTypePointer>;
-
-                return ThreadwiseTensorSliceTransfer_v1r3<
-                    DataType,
-                    DataType,
-                    decltype(thread_buffer_desc_mnk),
-                    decltype(out_grid_3d_desc_tuple[I]),
-                    PassThroughOp,
-                    Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
-                    Sequence<0, 1, 2>,                            // DimAccessOrder
-                    2,                                            // SrcVectorDim
-                    OutScalarPerVectorSeq::At(I),                 // OutScalarPerVectorSeq::At(I),
-                    InMemoryDataOperationEnum::Set,
-                    1,
-                    true>(out_grid_3d_desc_tuple[I], thread_global_offset, PassThroughOp{});
-            },
-            Number<NumOutput>{});
-
-        index_t num_iter_m = M / (loop_step_m);
-        do
-        {
-            index_t num_iter_n = N / (loop_step_n);
-            do
-            {
-                index_t num_iter_k = K / (loop_step_k);
-                do
-                {
-                    static_for<0, NumInput, 1>{}([&](auto I) {
-                        in_global_load_tuple(I).Run(in_grid_3d_desc_tuple[I],
-                                                    in_global_buf_tuple[I],
-                                                    thread_buffer_desc_mnk,
-                                                    make_tuple(I0, I0, I0),
-                                                    in_thread_buf_tuple(I));
-
-                        in_global_load_tuple(I).MoveSrcSliceWindow(
-                            in_grid_3d_desc_tuple[I], make_multi_index(0, 0, loop_step_k));
-                    });
-
-                    static_for<0, MPerThread, 1>{}([&](auto iM) {
-                        static_for<0, NPerThread, 1>{}([&](auto iN) {
-                            static_for<0, KPerThread, 1>{}([&](auto iK) {
-                                constexpr auto offset =
-                                    thread_buffer_desc_mnk.CalculateOffset(make_tuple(iM, iN, iK));
-                                // get reference to in data
-                                const auto in_data_refs = generate_tie(
-                                    // return type should be lvalue
-                                    [&](auto I) -> const auto& {
-                                        return in_thread_buf_tuple(I)(Number<offset>{});
-                                    },
-                                    Number<NumInput>{});
-
-                                // get referenec to dst data
-                                auto out_data_refs = generate_tie(
-                                    // return type should be lvalue
-                                    [&](auto I) -> auto& {
-                                        return out_thread_buf_tuple(I)(Number<offset>{});
-                                    },
-                                    Number<NumOutput>{});
-                                unpack2(elementwise_op, out_data_refs, in_data_refs);
-                            });
-                        });
-                    });
-
-                    static_for<0, NumOutput, 1>{}([&](auto I) {
-                        out_global_store_tuple(I).Run(thread_buffer_desc_mnk,
-                                                      make_tuple(I0, I0, I0),
-                                                      out_thread_buf_tuple[I],
-                                                      out_grid_3d_desc_tuple[I],
-                                                      out_global_buf_tuple(I));
-
-                        out_global_store_tuple(I).MoveDstSliceWindow(
-                            out_grid_3d_desc_tuple[I], make_multi_index(0, 0, loop_step_k));
-                    });
-                } while(--num_iter_k);
-
-                static_for<0, NumInput, 1>{}([&](auto I) {
-                    in_global_load_tuple(I).MoveSrcSliceWindow(
-                        in_grid_3d_desc_tuple[I],
-                        make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k));
-                });
-
-                static_for<0, NumOutput, 1>{}([&](auto I) {
-                    out_global_store_tuple(I).MoveDstSliceWindow(
-                        out_grid_3d_desc_tuple[I],
-                        make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k));
-                });
-
-            } while(--num_iter_n);
-
-            static_for<0, NumInput, 1>{}([&](auto I) {
-                in_global_load_tuple(I).MoveSrcSliceWindow(
-                    in_grid_3d_desc_tuple[I],
-                    make_multi_index(loop_step_m,
-                                     -(N / loop_step_n) * loop_step_n,
-                                     -(K / loop_step_k) * loop_step_k));
-            });
-
-            static_for<0, NumOutput, 1>{}([&](auto I) {
-                out_global_store_tuple(I).MoveDstSliceWindow(
-                    out_grid_3d_desc_tuple[I],
-                    make_multi_index(loop_step_m,
-                                     -(N / loop_step_n) * loop_step_n,
-                                     -(K / loop_step_k) * loop_step_k));
-            });
-        } while(--num_iter_m);
-    }
-};
-
-} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using device_transpose_f16_instances = std::tuple<
-    // FOR 16, 32, 16, 32, 16
-    // clang-format off
-    DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<1>, ck::Sequence<1>>,
-    DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 1, 1, ck::Sequence<1>, ck::Sequence<1>>,
-    DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 4, 4, ck::Sequence<1>, ck::Sequence<1>>
-    // clang-format on
-    >;
-
-using device_transpose_f32_instances = std::tuple<
-    // for 16, 8, 16, 32, 8 -> test with instances for fp16
-    // clang-format off
-    DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
-    DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 4, ck::Sequence<1>, ck::Sequence<1>>,
-    DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 8, ck::Sequence<1>, ck::Sequence<1>>
-    // clang-format on
-    >;
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-#include <memory>
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-void add_device_transpose_f16_instances(
-    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 5>>>&
-        instances);
-
-void add_device_transpose_f32_instances(
-    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 5>>>&
-        instances);
-
-template <typename InDataTypeTuple,
-          typename OutDataTypeTuple,
-          typename ElementwiseOperation,
-          index_t NumDim>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::
-        DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>
-{
-    using DeviceOp =
-        DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>;
-
-    static auto GetInstances()
-    {
-        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-        if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
-                     is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
-        {
-            add_device_transpose_f32_instances(op_ptrs);
-        }
-        else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
-                          is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
-        {
-            add_device_transpose_f16_instances(op_ptrs);
-        }
-        return op_ptrs;
-    }
-};
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/transpose/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/transpose/CMakeLists.txt
-add_instance_library(device_transpose_instance 
-	device_transpose_instances_3d.cpp
-)
--- a/library/src/tensor_operation_instance/gpu/transpose/device_transpose_instances_3d.cpp
+++ b/library/src/tensor_operation_instance/gpu/transpose/device_transpose_instances_3d.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16         = ck::half_t;
-using F32         = float;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-void add_device_transpose_f16_instances(
-    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 5>>>&
-        instances)
-{
-#ifdef CK_ENABLE_FP16
-    add_device_operation_instances(instances, device_transpose_f16_instances{});
-#else
-    ignore = instances;
-#endif
-}
-
-void add_device_transpose_f32_instances(
-    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 5>>>&
-        instances)
-{
-#ifdef CK_ENABLE_FP32
-    add_device_operation_instances(instances, device_transpose_f32_instances{});
-#else
-    ignore = instances;
-#endif
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/profiler/include/profiler/profile_transpose_impl.hpp
+++ b/profiler/include/profiler/profile_transpose_impl.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iomanip>
-#include <iostream>
-#include <typeinfo>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-
-namespace ck {
-namespace profiler {
-
-template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
-{
-    for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
-        for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
-            for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
-                for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
-                    for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
-                    {
-                        auto a_val = A_ncdhw(n, c, d, h, w);
-                        functor(B_nchwd(n, c, h, w, d), a_val);
-                    }
-}
-
-template <typename ADataType, typename BDataType, index_t NumDim>
-bool profile_transpose_impl(int do_verification,
-                            int init_method,
-                            bool do_log,
-                            bool time_kernel,
-                            std::vector<index_t> lengths)
-{
-    bool pass = true;
-
-    index_t N = lengths[0];
-    index_t C = lengths[1];
-    index_t D = lengths[2];
-    index_t H = lengths[3];
-    index_t W = lengths[4];
-
-    std::vector<ck::index_t> ncdhw = {N, C, D, H, W};
-    std::vector<ck::index_t> ndhwc = {N, D, H, W, C};
-    Tensor<ADataType> a(ncdhw);
-    Tensor<BDataType> b(ndhwc);
-    Tensor<BDataType> host_b(ndhwc);
-
-    // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-
-    std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
-    std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
-    std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
-
-    std::cout << "A: " << a.mDesc << std::endl;
-    std::cout << "B: " << b.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
-    default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-    }
-
-    using ElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-    // const auto element_op = ElementOp{};
-
-    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-
-    a_device_buf.ToDevice(a.mData.data());
-
-    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
-    using DeviceOp                   = ck::tensor_operation::device::
-        DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, NumDim>;
-
-    // get device op instances
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetInstances();
-
-    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
-
-    if(do_verification)
-    {
-        host_elementwise4D(host_b, a, ElementOp{});
-    }
-
-    std::string best_op_name;
-    float best_ave_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
-
-    for(auto& op_ptr : op_ptrs)
-    {
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{});
-
-        auto invoker_ptr = op_ptr->MakeInvokerPointer();
-
-        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-
-            // re-init C to zero before profiling next kernel
-            b_device_buf.SetZero();
-
-            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
-
-            if(do_verification)
-            {
-                b_device_buf.FromDevice(b.mData.data());
-
-                pass &= ck::utils::check_err(
-                    b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
-
-                if(do_log)
-                {
-                    LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
-                }
-            }
-
-            std::string op_name = op_ptr->GetTypeString();
-
-            float ave_time =
-                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-
-            std::size_t flop =
-                std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
-
-            std::size_t num_btype =
-                sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
-                sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
-
-            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
-                      << gb_per_sec << " GB/s, " << op_name << std::endl;
-
-            // pass = pass & ck::utils::check_err(b_device_result, b_host_result);
-            pass &= ck::utils::check_err(
-                b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
-
-            if(tflops > best_tflops)
-            {
-                best_op_name    = op_name;
-                best_tflops     = tflops;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
-            }
-        }
-        else
-        {
-            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
-        }
-    }
-
-    std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W
-              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
-              << " GB/s, " << best_op_name << std::endl;
-
-    return pass;
-}
-
-} // namespace profiler
-} // namespace ck
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -148,7 +148,6 @@ add_subdirectory(pool)
 add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
 add_subdirectory(conv_tensor_rearrange)
-add_subdirectory(transpose)
 if(GPU_TARGETS MATCHES "gfx11")
    add_subdirectory(wmma_op)
 endif()
--- a/test/transpose/CMakeLists.txt
+++ b/test/transpose/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_gtest_executable(test_transpose test_transpose.cpp)
-   target_link_libraries(test_transpose PRIVATE utility device_transpose_instance)
-   set(target 1)
- endif()
-endforeach()
--- a/test/transpose/test_transpose.cpp
+++ b/test/transpose/test_transpose.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <tuple>
-
-#include "gtest/gtest.h"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "test_transpose_util.hpp"
-
-using F16 = ck::half_t;
-using F32 = float;
-
-template <typename Tuple>
-class TestTranspose : public ::testing::Test
-{
-};
-
-// clang-format off
-using KernelTypes = ::testing::Types<
-    std::tuple<      F16,       F16>,
-    std::tuple<      F32,       F32>
-    >;
-// clang-format on
-
-TYPED_TEST_SUITE(TestTranspose, KernelTypes);
-
-//#include "test_transpose_ut_cases.inc"
--- a/test/transpose/test_transpose_ut_cases.inc
+++ b/test/transpose/test_transpose_ut_cases.inc
-#pragma once
-
-TYPED_TEST(TestTranspose, Test1)
-{
-    // for 16, 8, 16, 32, 8
-    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
-    std::vector<index_t> lengths{16, 8, 16, 32, 8};
-    /**constexpr int N = 16;
-    constexpr int C = 8;
-    constexpr int D = 16;
-    constexpr int H = 32;
-    constexpr int W = 8;**/
-
-    this->Run();
-}
-
-
-TYPED_TEST(TestTranpose, Test2)
-{
-    std::vector<int> Ms{127, 255, 312, 799, 1573};
-    std::vector<index_t> lengths{16, 8, 16, 32, 16};
-    /**constexpr int N = 16;
-    constexpr int C = 8;
-    constexpr int D = 16;
-    constexpr int H = 32;
-    constexpr int W = 8;**/
-
-    this->Run();
-}
-
--- a/test/transpose/test_transpose_util.hpp
+++ b/test/transpose/test_transpose_util.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <string>
-#include <sstream>
-#include <tuple>
-#include <vector>
-#include <gtest/gtest.h>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "include/ck/utility/data_type.hpp"
-#include "profiler/profile_transpose_impl.hpp"
-
-namespace ck {
-namespace test {
-
-template <typename Tuple>
-class TestTranspose : public testing::Test
-{
-    using F32 = float;
-
-    protected:
-    using ADataType = std::tuple_element_t<0, Tuple>;
-    using BDataType = std::tuple_element_t<1, Tuple>;
-
-    public:
-    static constexpr bool verify_              = true;
-    static constexpr int init_method_          = 1; // decimal value initialization
-    static constexpr bool log_                 = false;
-    static constexpr bool bench_               = false; // measure kernel performance
-    std::vector<std::vector<index_t>> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}};
-
-    void Run()
-    {
-        for(auto length : this->lengths_)
-        {
-            this->RunSingle(length);
-        }
-    }
-
-    void RunSingle()
-    {
-        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
-            verify_, init_method_, log_, bench_, lengths_);
-        EXPECT_TRUE(pass);
-    }
-};
-
-} // namespace test
-} // namespace ck