adding test files and profiler

244681cf · Astha Rai · 991ce41a · 244681cf · 244681cf · 244681cf
Commit 244681cf authored Oct 18, 2023 by Astha Rai
7 changed files
--- a/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
+++ b/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
@@ -59,21 +59,21 @@ int main()
    SimpleDeviceMem a_dev_buf(sizeof(ADataType) * size);
    SimpleDeviceMem b_dev_buf(sizeof(BDataType) * size);
-    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<const void*, 1> input = {a_dev_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_dev_buf.GetDeviceBuffer()};
    using DeviceElementwisePermuteInstance =
-        ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>,
+        ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
-                                                              ck::Tuple<BDataType>,
+                                                        ck::Tuple<BDataType>,
-                                                              PassThrough,
+                                                        PassThrough,
-                                                              2,
+                                                        2,
-                                                              2,
+                                                        2,
-                                                              1,
+                                                        1,
-                                                              8,
+                                                        8,
-                                                              8,
+                                                        8,
-                                                              8,
+                                                        8,
-                                                              ck::Sequence<8>,
+                                                        ck::Sequence<8>,
-                                                              ck::Sequence<1>>;
+                                                        ck::Sequence<1>>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -104,7 +104,7 @@ int main()
        {
            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
-            std::size_t num_btype =
+            std::size_t num_byte =
                sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
                sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
@@ -151,4 +151,4 @@ int main()
    }
    return 0;
 }
\ No newline at end of file
--- a/profiler/include/profiler/profile_transpose.cpp
+++ b/profiler/include/profiler/profile_transpose.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "profiler/profile_transpose_impl.hpp"
+#include "profiler_operation_registry.hpp"
+// Tensor layouts known to the transpose profiler. The enumerator values are the integers
+// accepted on the command line for the layout argument.
+enum class MatrixLayout
+{
+    NCDHW = 0,
+    NCHWD = 1,
+};
+// Element data types selectable on the command line (0 selects fp32, 1 selects fp16).
+enum class DataType
+{
+    F32_F32_F32_F32_F32 = 0,
+    F16_F16_F16_F16_F16 = 1,
+};
+#define OP_NAME "transpose"
+#define OP_DESC "Transpose"
+// Command-line entry point of the transpose profiler.
+//
+// Expects exactly 13 entries in argv: argv[0], the operation name (arg1), six option values
+// (arg2..arg7) and the five tensor lengths N, C, D, H, W (arg8..arg12). With any other
+// argument count the usage text is printed and the process exits with status 1.
+//
+// Returns 0 when profile_transpose_impl reports success, and 1 when it reports a failure or
+// when the requested data type is not handled.
+int profile_transpose(int argc, char* argv[])
+{
+    // Only argv[1]..argv[12] are consumed below, so argc must be 13.
+    if(argc != 13)
+    {
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: matrix layout (NCDHW -> NCHWD);\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 12: N, C, D, H, W\n");
+        exit(1);
+    }
+    const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
+    // The layout argument is still parsed (so a malformed value is rejected by std::stoi), but
+    // the implementation currently supports a single NCDHW -> NCHWD transpose and ignores it.
+    [[maybe_unused]] const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification         = std::stoi(argv[4]);
+    const int init_method              = std::stoi(argv[5]);
+    const bool do_log                  = std::stoi(argv[6]);
+    const bool time_kernel             = std::stoi(argv[7]);
+    const int N = std::stoi(argv[8]);
+    const int C = std::stoi(argv[9]);
+    const int D = std::stoi(argv[10]);
+    const int H = std::stoi(argv[11]);
+    const int W = std::stoi(argv[12]);
+    using F32 = float;
+    using F16 = ck::half_t;
+    // The tag arguments only carry the element types; their values are never read.
+    auto profile = [&](auto a_type, auto b_type) {
+        using ADataType = decltype(a_type);
+        using BDataType = decltype(b_type);
+        const bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
+            do_verification, init_method, do_log, time_kernel, N, C, D, H, W);
+        return pass ? 0 : 1;
+    };
+    if(data_type == DataType::F32_F32_F32_F32_F32)
+    {
+        return profile(F32{}, F32{});
+    }
+    else if(data_type == DataType::F16_F16_F16_F16_F16)
+    {
+        return profile(F16{}, F16{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+        return 1;
+    }
+}
+// Register the entry point defined in this file under the "transpose" operation name.
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose);
\ No newline at end of file
--- a/profiler/include/profiler/profile_transpose_impl.hpp
+++ b/profiler/include/profiler/profile_transpose_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <array>
+#include <cstddef>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <typeinfo>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
+#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+namespace ck {
+namespace profiler {
+// Host reference for the profiled transpose: applies `op` element by element so that
+// dst(n, c, h, w, d) receives src(n, c, d, h, w). `lengths` holds {N, C, D, H, W}.
+template <typename SrcTensor, typename DstTensor, typename ElementOp>
+void reference_transpose_ncdhw_to_nchwd(const std::vector<std::size_t>& lengths,
+                                        SrcTensor& src,
+                                        DstTensor& dst,
+                                        const ElementOp& op)
+{
+    for(std::size_t n = 0; n < lengths[0]; ++n)
+        for(std::size_t c = 0; c < lengths[1]; ++c)
+            for(std::size_t d = 0; d < lengths[2]; ++d)
+                for(std::size_t h = 0; h < lengths[3]; ++h)
+                    for(std::size_t w = 0; w < lengths[4]; ++w)
+                        op(dst(n, c, h, w, d), src(n, c, d, h, w));
+}
+// Profiles every registered element-wise device instance on an NCDHW -> NCHWD transpose.
+//
+// do_verification: non-zero compares each instance's output with a host reference.
+// init_method:     0 leaves the input untouched, 1 fills it with integer values,
+//                  any other value fills it with decimal values.
+// do_log:          print the input and output tensors after verification.
+// time_kernel:     forwarded to StreamConfig for the measured run.
+// N, C, D, H, W:   lengths of the input tensor.
+//
+// Returns true unless a verified instance produced a result that differs from the reference.
+template <typename ADataType, typename BDataType>
+bool profile_transpose_impl(int do_verification,
+                            int init_method,
+                            bool do_log,
+                            bool time_kernel,
+                            int N,
+                            int C,
+                            int D,
+                            int H,
+                            int W)
+{
+    bool pass = true;
+    // Brace-initialization forbids the narrowing int -> std::size_t conversion, so cast.
+    const auto to_size = [](int v) { return static_cast<std::size_t>(v); };
+    const std::vector<std::size_t> ncdhw = {
+        to_size(N), to_size(C), to_size(D), to_size(H), to_size(W)};
+    const std::vector<std::size_t> nchwd = {
+        to_size(N), to_size(C), to_size(H), to_size(W), to_size(D)};
+    Tensor<ADataType> a(ncdhw);
+    Tensor<BDataType> b(nchwd);             // receives the device result
+    Tensor<BDataType> b_host_result(nchwd); // receives the host reference
+    // The problem is described in the output index order (N, C, H, W, D).
+    std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
+    // Strides of the packed NCDHW input, listed for (N, C, H, W, D).
+    std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, W, 1, H * W};
+    // Strides of the packed NCHWD output, listed for (N, C, H, W, D).
+    std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1};
+    std::cout << "A: " << a.mDesc << std::endl;
+    std::cout << "B: " << b.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
+    default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+    }
+    using ElementOp       = ck::tensor_operation::element_wise::PassThrough;
+    const auto element_op = ElementOp{};
+    // The device buffers must exist before their pointers are collected below.
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a.mData.data());
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+    // Instances are looked up through the generic 5-D element-wise interface.
+    using DeviceOp = ck::tensor_operation::device::
+        DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, 5>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    if(do_verification)
+    {
+        reference_transpose_ncdhw_to_nchwd(ncdhw, a, b_host_result, element_op);
+    }
+    const std::size_t num_elements = ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            ab_lengths, {a_strides}, {b_strides}, input, output, element_op);
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+            continue;
+        }
+        // re-init B to zero so a stale result from the previous instance cannot pass the check
+        b_device_buf.SetZero();
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        if(do_verification)
+        {
+            b_device_buf.FromDevice(b.mData.data());
+            pass = pass & ck::utils::check_err(b.mData, b_host_result.mData);
+            if(do_log)
+            {
+                LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
+            }
+        }
+        const std::string op_name = op_ptr->GetTypeString();
+        const float ave_time =
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+        const std::size_t flop      = std::size_t(2) * num_elements;
+        const std::size_t num_bytes = (sizeof(ADataType) + sizeof(BDataType)) * num_elements;
+        // ave_time is reported in milliseconds, hence the 1e9 / 1e6 scale factors.
+        const float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+        const float gb_per_sec = num_bytes / 1.E6 / ave_time;
+        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                  << gb_per_sec << " GB/s, " << op_name << std::endl;
+        if(tflops > best_tflops)
+        {
+            best_op_name    = op_name;
+            best_tflops     = tflops;
+            best_ave_time   = ave_time;
+            best_gb_per_sec = gb_per_sec;
+        }
+    }
+    if constexpr(is_same<BDataType, float>::value)
+    {
+        std::cout << "Best Perf for datatype = f32";
+    }
+    else if constexpr(is_same<BDataType, half_t>::value)
+    {
+        std::cout << "Best Perf for datatype = f16";
+    }
+    std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    return pass;
+}
+} // namespace profiler
+} // namespace ck
\ No newline at end of file
--- a/test/transpose/CMakeLists.txt
+++ b/test/transpose/CMakeLists.txt
+# GPU architectures for which the transpose test is built.
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+# Set to 1 once the test target has been created, so it is added at most once even when
+# several entries of GPU_TARGETS are in gpu_list.
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+   add_gtest_executable(test_transpose test_transpose.cpp)
+   target_link_libraries(test_transpose PRIVATE utility device_transpose_instance)
+   set(target 1)
+ endif()
+endforeach()
--- a/test/transpose/test_transpose.cpp
+++ b/test/transpose/test_transpose.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <tuple>
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_tranpose_util.hpp"
+// Element types exercised by the typed test suite.
+using F32 = float;
+using F16 = ck::half_t;
+// Tensor layouts involved in the transpose under test.
+enum class MatrixLayout
+{
+    NCDHW = 0,
+    NCHWD = 1,
+};
+// Typed-test fixture: forwards the (ADataType, BDataType) tuple to the shared fixture in
+// ck::test, which provides Run().
+template <typename Tuple>
+class TestTranspose : public ck::test::TestTranspose<Tuple>
+{
+};
+// (input type, output type) pairs the suite is instantiated for.
+// clang-format off
+using KernelTypes = ::testing::Types<
+    std::tuple<      F16,       F16>,
+    std::tuple<      F32,       F32>
+    >;
+// clang-format on
+// The suite must be registered under the name of the fixture defined in this file.
+TYPED_TEST_SUITE(TestTranspose, KernelTypes);
+//#include "test_transpose_ut_cases.inc"
\ No newline at end of file
--- a/test/transpose/test_transpose_interface.cpp
+++ b/test/transpose/test_transpose_interface.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "include/ck/utility/data_type.hpp"
+#include "profiler/profile_transpose_impl.hpp"
+namespace ck {
+namespace test {
+// GoogleTest fixture that runs the transpose profiler with verification enabled and timing
+// disabled, and expects it to report success.
+//
+// Tuple supplies the element types: element 0 is the input type, element 1 the output type.
+template <typename Tuple>
+class TestTranspose : public testing::Test
+{
+    protected:
+    // using ALayout   = std::tuple_element_t<0, Tuple>;
+    // using BLayout   = std::tuple_element_t<1, Tuple>;
+    using ADataType = std::tuple_element_t<0, Tuple>;
+    using BDataType = std::tuple_element_t<1, Tuple>;
+    public:
+    static constexpr bool verify_     = true;
+    static constexpr int init_method_ = 1; // integer value initialization
+    static constexpr bool log_        = false;
+    static constexpr bool bench_      = false; // measure kernel performance
+    // Runs the transpose for an input tensor with lengths N, C, D, H, W.
+    void Run(const int N, const int C, const int D, const int H, const int W)
+    {
+        // Forward the lengths in the same order as they were received.
+        RunSingle(N, C, D, H, W);
+    }
+    // Invokes the profiler once and records a test failure if it returns false.
+    void RunSingle(const int N, const int C, const int D, const int H, const int W)
+    {
+        const bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
+            verify_, init_method_, log_, bench_, N, C, D, H, W);
+        EXPECT_TRUE(pass);
+    }
+};
+} // namespace test
+} // namespace ck
\ No newline at end of file
--- a/test/transpose/test_transpose_ut_cases.inc
+++ b/test/transpose/test_transpose_ut_cases.inc
+#pragma once
+// Transposes a tensor with lengths N = 16, C = 8, D = 16, H = 32, W = 8.
+TYPED_TEST(TestTranspose, Test1)
+{
+    constexpr int N = 16;
+    constexpr int C = 8;
+    constexpr int D = 16;
+    constexpr int H = 32;
+    constexpr int W = 8;
+    this->Run(N, C, D, H, W);
+}
+// NOTE(review): uses the same lengths as Test1; presumably meant to cover a different
+// problem size - confirm with the author.
+TYPED_TEST(TestTranspose, Test2)
+{
+    constexpr int N = 16;
+    constexpr int C = 8;
+    constexpr int D = 16;
+    constexpr int H = 32;
+    constexpr int W = 8;
+    this->Run(N, C, D, H, W);
+}