Commit 574fd35a authored by Astha Rai

adding client example

parent f157518d
add_executable(client_elementwise_transpose3d elementwise_transpose_3d.cpp)
target_link_libraries(client_elementwise_transpose3d PRIVATE composable_kernel::device_operations)
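# Note: a minimal sketch of how a standalone client project might locate the library before the
# target above can link against it; the package/component names below are assumptions based on the
# composable_kernel::device_operations target used here:
#   find_package(composable_kernel COMPONENTS device_operations)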
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <array>
#include <iomanip>
#include <iostream>
#include <limits>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using F16 = ck::half_t;
using F32 = float;
using ADataType = F16;
using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
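// Minimal RAII holder for a device allocation: hipMalloc on construction, hipFree on destruction.
// Error codes are ignored to keep the example short.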
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
bool time_kernel = true;
const int N = 16;
const int C = 8;
const int D = 8;
const int H = 8;
const int W = 8;
std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D};
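// The example permutes an NCDHW input into NCHWD order, i.e. the D dimension is moved to the
// innermost position.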
Tensor<ADataType> a(ncdhw);
Tensor<BDataType> b(nchwd);
auto size = N * C * D * H * W;
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, 1, D * H, D};
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1};
SimpleDeviceMem a_dev_buf(sizeof(ADataType) * size);
SimpleDeviceMem b_dev_buf(sizeof(BDataType) * size);
// copy the generated input to the device; the output buffer is left uninitialized
(void)hipMemcpy(a_dev_buf.GetDeviceBuffer(), a.mData.data(), sizeof(ADataType) * size, hipMemcpyHostToDevice);
std::array<const void*, 1> input = {a_dev_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_dev_buf.GetDeviceBuffer()};
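// One concrete tiling of the elementwise 3D kernel. Going by the parameter names in the instance
// factory header (NumDim_m/NumDim_n/NumDim_k, M/N/KPerThread, In/OutScalarPerVectorSeq), the
// arguments below group the five dimensions as 2+2+1, use an 8x8x8 per-thread tile, and set the
// vector widths for input reads and output writes.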
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<8>,
ck::Sequence<1>>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceElementwisePermuteInstance>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
// bytes moved per run: one read of A plus one write of B; ave_time is in ms, so GB/s = bytes / 1e6 / ms
std::size_t num_bytes =
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
float gb_per_sec = num_bytes / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
// run the best instance
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::tensor_layout::convolution;
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
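// Each instance below fixes one combination of per-thread tile sizes and scalar-per-vector
// widths; the factory in transpose_3d.hpp returns all of them so a client can time each one and
// pick the fastest for its problem size.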
using device_transpose_f16_instances = std::tuple<
// clang-format off
// instances for a 16, 32, 16, 32, 16 problem
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<8>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<8>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<1>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<1>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
1,
1,
ck::Sequence<1>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
1,
1,
ck::Sequence<8>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
4,
4,
ck::Sequence<1>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
4,
4,
ck::Sequence<8>,
ck::Sequence<8>>
// clang-format on
>;
using device_transpose_f32_instances = std::tuple<
// clang-format off
// instances for a 16, 8, 16, 32, 8 problem -> also test with the fp16 instance configurations
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
4,
4,
ck::Sequence<1>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
4,
4,
ck::Sequence<8>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
4,
4,
ck::Sequence<8>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
4,
ck::Sequence<8>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
8,
ck::Sequence<8>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
8,
ck::Sequence<4>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
8,
ck::Sequence<4>,
ck::Sequence<4>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
8,
ck::Sequence<8>,
ck::Sequence<4>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
4,
8,
ck::Sequence<8>,
ck::Sequence<8>>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_3d_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::tensor_layout::convolution;
using F16 = ck::half_t;
using F32 = float;
void add_device_transpose_f16_instances(
std::vector<std::unique_ptr<DeviceElementwise3dImpl<F16, F16, NCDHW, 3>>>& instances);
void add_device_transpose_f32_instances(
std::vector<std::unique_ptr<DeviceElementwise3dImpl<F32, F32, NCDHW, 3>>>& instances);
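// These functions are expected to be defined in the instance translation unit that declares the
// tuples above, typically by forwarding each tuple to add_device_operation_instances.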
template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
index_t NumDim>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::
DeviceElementwise3dImpl<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>
{
using DeviceOp = DeviceElementwise3dImpl<InDataTypeTuple,
OutDataTypeTuple,
ElementwiseOperation,
NumDim_m, // choose how to set dims
NumDim_n,
NumDim_k,
MPerThread,
NPerThread,
KPerThread,
InScalarPerVectorSeq,
OutScalarPerVectorSeq>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<float>> && is_same_v<OutDataTypeTuple, ck::Tuple<float>>)
{
add_device_transpose_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<half_t>> && is_same_v<OutDataTypeTuple, ck::Tuple<half_t>>)
{
add_device_transpose_f16_instances(op_ptrs);
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck