adding test/profiler/instance files for hipTensor permute unit test

6be4ff70 · Astha Rai · 9d171dc5 · 6be4ff70 · 6be4ff70 · 6be4ff70
Commit 6be4ff70 authored Nov 29, 2023 by Astha Rai
7 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Appends the FP16 permute-scale device operation instances to the given vector.
+// Defined in device_permute_scale_instances.cpp.
+void add_device_permute_scale_f16_instances(
+    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, element_wise::UnarySquare, Scale, 4>>>&);
+// Appends the FP32 permute-scale device operation instances to the given vector.
+// Defined in device_permute_scale_instances.cpp.
+void add_device_permute_scale_f32_instances(
+    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, element_wise::UnarySquare, Scale, 4>>>&);
+// Factory specialization that gathers the registered permute-scale instances for a
+// DeviceElementwise operation type.
+//
+// Instances are registered only for the combination
+//   ElementwiseOperation = PassThrough, UnaryOperation = UnarySquare,
+//   Scale = element_wise::Scale, NumDim = 4,
+// with matching F32 or F16 input/output tuples. Any other combination yields an empty
+// vector rather than a compile error from passing a mismatched vector type to the
+// add_device_permute_scale_*_instances functions.
+template <typename InDataTypeTuple,
+          typename OutDataTypeTuple,
+          typename ElementwiseOperation,
+          typename UnaryOperation,
+          typename Scale,
+          index_t NumDim>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::
+        DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, UnaryOperation, Scale, NumDim>>
+{
+    using DeviceOp =
+        DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, UnaryOperation, Scale, NumDim>;
+    // Returns the list of device operation instances matching DeviceOp (possibly empty).
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        // `Scale` names the template parameter inside this struct, so the library functor
+        // types are spelled with their full namespace here.
+        if constexpr(is_same_v<ElementwiseOperation,
+                               ck::tensor_operation::element_wise::PassThrough> &&
+                     is_same_v<UnaryOperation,
+                               ck::tensor_operation::element_wise::UnarySquare> &&
+                     is_same_v<Scale, ck::tensor_operation::element_wise::Scale> &&
+                     NumDim == 4)
+        {
+            if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
+                         is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
+            {
+                add_device_permute_scale_f32_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
+                              is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
+            {
+                add_device_permute_scale_f16_instances(op_ptrs);
+            }
+        }
+        return op_ptrs;
+    }
+};
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/permute_scale/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/permute_scale/CMakeLists.txt
+# Registers the permute-scale instance sources as the device_permute_scale_instance library
+# through the project's add_instance_library helper.
+add_instance_library(device_permute_scale_instance
+    device_permute_scale_instances.cpp
+)
--- a/library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.cpp
+++ b/library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Element types covered by the permute-scale instances.
+using F16 = ck::half_t;
+using F32 = float;
+// Element-wise functors shared by every instance declared below.
+using Pass    = ck::tensor_operation::element_wise::PassThrough;
+using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
+using Scale   = ck::tensor_operation::element_wise::Scale;
+// clang-format off
+// Instance list for one element type: a single DeviceElementwiseImpl configuration.
+// The numeric and Sequence template arguments are interpreted by DeviceElementwiseImpl.
+template <typename DataType>
+using device_permute_scale_instances_for = std::tuple<
+        DeviceElementwiseImpl<ck::Tuple<DataType>, ck::Tuple<DataType>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<1>, ck::Sequence<1>>
+    >;
+using device_permute_scale_f16_instances = device_permute_scale_instances_for<F16>;
+using device_permute_scale_f32_instances = device_permute_scale_instances_for<F32>;
+// clang-format on
+// Appends every entry of device_permute_scale_f16_instances to `instances`.
+void add_device_permute_scale_f16_instances(
+    std::vector<std::unique_ptr<
+        DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4>>>& instances)
+{
+    using InstanceList = device_permute_scale_f16_instances;
+    add_device_operation_instances(instances, InstanceList{});
+}
+// Appends every entry of device_permute_scale_f32_instances to `instances`.
+void add_device_permute_scale_f32_instances(
+    std::vector<std::unique_ptr<
+        DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4>>>& instances)
+{
+    using InstanceList = device_permute_scale_f32_instances;
+    add_device_operation_instances(instances, InstanceList{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/profiler/include/profiler/profile_permute_scale_impl.hpp
+++ b/profiler/include/profiler/profile_permute_scale_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iomanip>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
+namespace ck {
+namespace profiler {
+// Host-side reference for the permute-scale operation on a 4-D tensor.
+//
+// For every index (n, c, h, w) taken from the four lengths of A_nchw:
+//   1. reads A at linear offset  n + c*N + h*C*N + w*H*C*N,
+//   2. applies functor_b(tmp, a) to produce an intermediate value,
+//   3. calls functor_a(B[offset], scale * tmp) with B's linear offset
+//      n + c*W*H*N + h*N + w*H*N.
+//
+// B_nhwc    : output tensor; written through its mData storage.
+// A_nchw    : input tensor; supplies the lengths and the source data.
+// functor_a : binary functor called as functor_a(out, in) to store the result.
+// functor_b : binary functor called as functor_b(out, in) on each input element.
+// scale     : multiplier applied to the intermediate value before functor_a.
+template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
+void host_elementwise4D(HostTensorB& B_nhwc,
+                        const HostTensorA& A_nchw,
+                        FunctorA functor_a,
+                        FunctorB functor_b,
+                        float scale)
+{
+    const auto& a_dims  = A_nchw.mDesc.GetLengths();
+    const std::size_t N = a_dims[0];
+    const std::size_t C = a_dims[1];
+    const std::size_t H = a_dims[2];
+    const std::size_t W = a_dims[3];
+    // Intermediate values use the element type of the output tensor.
+    using OutElement = ck::remove_reference_t<decltype(B_nhwc(0, 0))>;
+    for(std::size_t iw = 0; iw < W; ++iw)
+    {
+        for(std::size_t ih = 0; ih < H; ++ih)
+        {
+            for(std::size_t ic = 0; ic < C; ++ic)
+            {
+                for(std::size_t in = 0; in < N; ++in)
+                {
+                    const std::size_t src_offset =
+                        (in) + (ic * N) + (ih * C * N) + (iw * H * C * N);
+                    const std::size_t dst_offset =
+                        (in) + (ic * W * H * N) + (ih * N) + (iw * H * N);
+                    OutElement unary_result = 0;
+                    auto src_value          = A_nchw.mData[src_offset];
+                    functor_b(unary_result, src_value);
+                    functor_a(B_nhwc.mData[dst_offset], scale * unary_result);
+                }
+            }
+        }
+    }
+}
+// Runs every registered permute-scale device instance on one 4-D problem, optionally
+// verifies the device output against the host reference, and reports timing figures.
+//
+// do_verification : non-zero to compare each instance's output with host_elementwise4D.
+// init_method     : 0 leaves the input untouched, 1 fills it with GeneratorTensor_2{-1, 2},
+//                   any other value fills it with GeneratorTensor_3{0.0, 1.0}.
+// do_log          : when verifying, also print the input and output tensors.
+// time_kernel     : forwarded to StreamConfig for the measured run; also enables the
+//                   final "best perf" summary line.
+// lengths         : problem lengths; the first four entries are used as N, C, H, W.
+//
+// Returns false when `lengths` has fewer than four entries, when no instance supports
+// the problem, or when any verified instance produced a mismatching result; true
+// otherwise.
+template <typename ADataType, typename BDataType, index_t NumDim>
+bool profile_permute_scale_impl(int do_verification,
+                            int init_method,
+                            bool do_log,
+                            bool time_kernel,
+                            std::vector<index_t> lengths)
+{
+    // The length/stride arrays below are fixed to four dimensions.
+    static_assert(NumDim == 4, "profile_permute_scale_impl only supports 4-D problems");
+    if(lengths.size() < 4)
+    {
+        std::cout << "Error: expected 4 lengths (N, C, H, W), got " << lengths.size()
+                  << std::endl;
+        return false;
+    }
+    bool pass = true;
+    using ElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
+    using Scale       = ck::tensor_operation::element_wise::Scale;
+    float scale = 2.f;
+    index_t N = lengths[0];
+    index_t C = lengths[1];
+    index_t H = lengths[2];
+    index_t W = lengths[3];
+    std::vector<ck::index_t> nchw = {N, C, H, W};
+    std::vector<ck::index_t> nhwc = {N, H, W, C};
+    Tensor<ADataType> a(nchw);
+    Tensor<BDataType> b(nhwc);
+    Tensor<BDataType> host_b(nhwc);
+    std::array<ck::index_t, 4> ab_lengths;
+    // Strides are expressed per (n, c, h, w) dimension and match the linear offsets
+    // used by host_elementwise4D.
+    std::array<ck::index_t, 4> a_strides = {1,
+                                            static_cast<int>(nchw[0]),
+                                            static_cast<int>(nchw[0] * nchw[1]),
+                                            static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
+    std::array<ck::index_t, 4> b_strides = {1,
+                                            static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
+                                            static_cast<int>(nhwc[0]),
+                                            static_cast<int>(nhwc[0] * nhwc[1])};
+    ck::ranges::copy(nchw, ab_lengths.begin());
+    std::cout << "A: " << a.mDesc << std::endl;
+    std::cout << "B: " << b.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
+    default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+    }
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a.mData.data());
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+    using DeviceOp                   = ck::tensor_operation::device::
+        DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, UnaryOp, Scale, NumDim>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_instance_name;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+    if(do_verification)
+    {
+        // The host reference is computed once and reused for every instance.
+        host_elementwise4D(host_b, a, ElementOp{}, UnaryOp{}, scale);
+    }
+    int num_kernel = 0;
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{}, UnaryOp{}, Scale{scale});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // Count the instance so the "no kernel tested" check below is meaningful.
+            ++num_kernel;
+            b_device_buf.SetZero();
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+            if(do_verification)
+            {
+                b_device_buf.FromDevice(b.mData.data());
+                pass &= ck::utils::check_err(
+                    b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
+                }
+            }
+            std::string op_name = op_ptr->GetTypeString();
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            std::size_t flop =
+                std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
+            std::size_t num_btype =
+                sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
+                sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_instance_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    }
+    if(time_kernel)
+    {
+        LogRange(std::cout << "length = ", lengths, ",") << ", ";
+        std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_ave_time << " ms, "
+                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
+    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is tested" << std::endl;
+        return false;
+    }
+    // Report the accumulated verification outcome instead of unconditional success.
+    return pass;
+}
+} // namespace profiler
+} // namespace ck
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -149,6 +149,7 @@ add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
 add_subdirectory(conv_tensor_rearrange)
 add_subdirectory(transpose)
+add_subdirectory(permute_scale)
 if(GPU_TARGETS MATCHES "gfx11")
    add_subdirectory(wmma_op)
 endif()
--- a/test/permute_scale/CMakeLists.txt
+++ b/test/permute_scale/CMakeLists.txt
+# Umbrella target; the permute-scale test executable is attached to it below.
+add_custom_target(test_permute)
+add_gtest_executable(test_permute_scale test_permute_scale.cpp)
+# NOTE(review): `result` is not set in this file; presumably add_gtest_executable sets it
+# to 0 when the test target was actually created — confirm against the helper's definition.
+if(result EQUAL 0)
+  target_link_libraries(test_permute_scale PRIVATE utility device_permute_scale_instance)
+  add_dependencies(test_permute test_permute_scale)
+endif()
--- a/test/permute_scale/test_permute_scale.cpp
+++ b/test/permute_scale/test_permute_scale.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "gtest/gtest.h"
+#include "profiler/profile_permute_scale_impl.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using ck::index_t;
+// Typed test fixture for the permute-scale profiler.
+// `Tuple` is a std::tuple whose element 0 is the input data type and element 1 the
+// output data type.
+template <typename Tuple>
+class TestPermute : public ::testing::Test
+{
+    protected:
+    using ADataType = std::tuple_element_t<0, Tuple>;
+    using BDataType = std::tuple_element_t<1, Tuple>;
+    // Profiles each problem size with verification enabled (init_method 2, no logging,
+    // no kernel timing) and expects every run to report success.
+    void Run()
+    {
+        const std::vector<std::vector<ck::index_t>> problem_sizes = {{4, 2, 1, 8},
+                                                                     {4, 2, 8, 8}};
+        for(const auto& problem : problem_sizes)
+        {
+            bool success = ck::profiler::profile_permute_scale_impl<ADataType, BDataType, 4>(
+                true, 2, false, false, problem);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+// Type combinations the typed suite is instantiated with: currently only F16 -> F16.
+using KernelTypes = ::testing::Types<std::tuple<F16, F16>>;
+TYPED_TEST_SUITE(TestPermute, KernelTypes);
+// Invokes the fixture's Run() for each entry of KernelTypes.
+TYPED_TEST(TestPermute, Test_FP16) { this->Run(); }