"src/vscode:/vscode.git/clone" did not exist on "c33da3ec1893cbea879031e85fe77c497a0efadf"
Commit f3b6e205 authored by Astha Rai

fixed errors in test/profiler

parent 4dab86fe
@@ -50,14 +50,20 @@ int main()
     bool do_verification = true;
     bool time_kernel = true;
-    const int N = 4;
+    /**const int N = 4;
     const int C = 16;
     const int H = 32;
     const int W = 5;
-    const int D = 16;
+    const int D = 16;**/
-    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
-    std::vector<std::size_t> nchwd = {N, C, H, W, D};
+    ck::index_t N = 4;
+    ck::index_t C = 16;
+    ck::index_t H = 32;
+    ck::index_t W = 5;
+    ck::index_t D = 16;
+    std::vector<ck::index_t> ncdhw = {N, C, D, H, W};
+    std::vector<ck::index_t> nchwd = {N, C, H, W, D};
     Tensor<ADataType> a(ncdhw);
     Tensor<BDataType> b(nchwd);
@@ -9,44 +9,56 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
-#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
+#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
 namespace profiler {
-template <typename ADataType, typename BDataType>
-bool profile_gemm_splitk_impl(int do_verification,
-                              int init_method,
-                              bool do_log,
-                              bool time_kernel,
-                              int N,
-                              int C,
-                              int D,
-                              int H,
-                              int W)
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
+{
+    for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
+        for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
+            for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
+                for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
+                    for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
+                    {
+                        auto a_val = A_ncdhw(n, c, d, h, w);
+                        functor(B_nchwd(n, c, h, w, d), a_val);
+                    }
+}
+template <typename ADataType, typename BDataType, index_t NumDim>
+bool profile_transpose_impl(int do_verification,
+                            int init_method,
+                            bool do_log,
+                            bool time_kernel,
+                            ck::index_t N,
+                            ck::index_t C,
+                            ck::index_t D,
+                            ck::index_t H,
+                            ck::index_t W)
 {
     bool pass = true;
-    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
-    std::vector<std::size_t> ndhwc = {N, D, H, W, C};
+    std::vector<ck::index_t> ncdhw = {N, C, D, H, W};
+    std::vector<ck::index_t> ndhwc = {N, D, H, W, C};
     Tensor<ADataType> a(ncdhw);
     Tensor<BDataType> b(ndhwc);
+    Tensor<BDataType> host_b(ndhwc);
     // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
     std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
     std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
     std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
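
The two stride arrays are the crux of the approach: the elementwise kernel walks a single logical index space, and the differing input and output strides make a plain copy realize the transpose. Below is a standalone sketch of that idea in plain C++ (illustration only, not the CK API; the index-tuple order (n, d, h, w, c) and the general-form output strides are my assumptions; in the committed test C == D == 16, so its b_strides coincide numerically with the general form used here):

#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

int main()
{
    const int N = 2, C = 3, D = 4, H = 5, W = 6;

    std::vector<float> a(static_cast<std::size_t>(N) * C * D * H * W); // contiguous NCDHW
    std::iota(a.begin(), a.end(), 0.0f);
    std::vector<float> b(a.size()); // contiguous NDHWC

    // One logical index tuple (n, d, h, w, c), two views of it:
    const long a_strides[5] = {C * D * H * W, H * W, W, 1, D * H * W}; // reads NCDHW storage
    const long b_strides[5] = {D * H * W * C, H * W * C, W * C, C, 1}; // writes NDHWC storage

    for(long n = 0; n < N; ++n)
        for(long d = 0; d < D; ++d)
            for(long h = 0; h < H; ++h)
                for(long w = 0; w < W; ++w)
                    for(long c = 0; c < C; ++c)
                    {
                        const long idx[5] = {n, d, h, w, c};
                        long src = 0, dst = 0;
                        for(int i = 0; i < 5; ++i)
                        {
                            src += idx[i] * a_strides[i];
                            dst += idx[i] * b_strides[i];
                        }
                        b[dst] = a[src]; // pass-through copy == transpose
                    }

    // b[n][d][h][w][c] must equal a[n][c][d][h][w].
    assert(b[(((0 * D + 1) * H + 2) * W + 3) * C + 2] ==
           a[(((0 * C + 2) * D + 1) * H + 2) * W + 3]);
    return 0;
}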
@@ -63,25 +75,17 @@ bool profile_gemm_splitk_impl(int do_verification,
     using ElementOp = ck::tensor_operation::element_wise::PassThrough;
-    const auto element_op = ElementOp{};
+    // const auto element_op = ElementOp{};
     DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
     DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
     a_device_buf.ToDevice(a.mData.data());
-    using DeviceOp =
-        ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>,
-                                                              ck::Tuple<BDataType>,
-                                                              ElementOp,
-                                                              NumDim_m,
-                                                              NumDim_n,
-                                                              NumDim_k,
-                                                              MPerThread,
-                                                              NPerThread,
-                                                              KPerThread,
-                                                              ck::Sequence<InScalarPerVector>,
-                                                              ck::Sequence<OutScalarPerVector>>;
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
+    using DeviceOp = ck::tensor_operation::device::
+        DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, NumDim>;
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
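
For context, the instance factory returns type-erased device-operation pointers, each exercised through the argument/invoker pattern visible in the hunks below. A hedged sketch of that loop's shape (CK-style; the StreamConfig details and exact method signatures may vary between CK versions):

// Sketch of the profiler loop over factory-provided instances.
for(auto& op_ptr : op_ptrs)
{
    auto argument_ptr = op_ptr->MakeArgumentPointer(
        ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{});
    auto invoker_ptr = op_ptr->MakeInvokerPointer();

    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
    {
        // time_kernel toggles timed vs. single-shot execution.
        float ave_time =
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
        std::cout << op_ptr->GetTypeString() << ": " << ave_time << " ms" << std::endl;
    }
    else
    {
        std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
    }
}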
@@ -91,19 +95,7 @@ bool profile_gemm_splitk_impl(int do_verification,
     if(do_verification)
     {
-        using ReferenceTransposeInstance = ck::tensor_operation::host::ReferenceTranspose
-            << ck::Tuple<ADataType>,
-            ck::Tuple<BDataType>, ElementOp, NumDim_m, NumDim_n, NumDim_k, MPerThread, NPerThread,
-            KPerThread, ck::Sequence<InScalarPerVector>, ck::Sequence<OutScalarPerVector> > ;
-        auto ref_transpose = ReferenceTransposeInstance{};
-        auto ref_invoker   = ref_transpose.MakeInvoker();
-        auto ref_argument =
-            ref_transpose
-                .MakeArgument(ab_lengths, {a_strides}, {b_strides}, input, output, element_op{})
-        ref_invoker.Run(ref_argument);
+        host_elementwise4D(host_b, a, ElementOp{});
     }
     std::string best_op_name;
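
The broken device-side ReferenceTranspose block is replaced by the host loop host_elementwise4D defined above (despite the name, a five-deep loop nest): the CPU writes every input element to its transposed coordinate, producing the reference tensor that the device output is later checked against. The same pattern reduced to a minimal 2D plain-C++ form (hypothetical names, not the CK API):

#include <cstddef>
#include <vector>

struct PassThrough
{
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const { y = x; } // y <- x, no math
};

// The host reference writes b at the transposed coordinate for every
// element of a, so an elementwise functor suffices to express a transpose.
template <typename Functor>
void host_transpose_2d(std::vector<float>& b, const std::vector<float>& a,
                       std::size_t rows, std::size_t cols, Functor f)
{
    for(std::size_t r = 0; r < rows; ++r)
        for(std::size_t c = 0; c < cols; ++c)
            f(b[c * rows + r], a[r * cols + c]);
}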
@@ -114,7 +106,7 @@ bool profile_gemm_splitk_impl(int do_verification,
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr = op_ptr->MakeArgumentPointer(
-            ab_lengths, {a_strides}, {b_strides}, input, output, element_op{});
+            ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{});
         auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -128,9 +120,11 @@ bool profile_gemm_splitk_impl(int do_verification,
         if(do_verification)
         {
-            b_device_buf.FromDevice(b_device_result.mData.data());
+            b_device_buf.FromDevice(b.mData.data());
-            pass = pass & ck::utils::check_err(b_device_result, b_host_result);
+            // pass = pass & ck::utils::check_err(b_device_result, b_host_result);
+            pass &= ck::utils::check_err(
+                b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
             if(do_log)
             {
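
The new check compares the device and host buffers elementwise with relative and absolute tolerances of 1e-3, a reasonable bound for f16 data. A minimal sketch of such a tolerance test (my own reduction, not CK's check_err implementation):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Mixed relative/absolute tolerance comparison; ck::utils::check_err is
// more general (typed, counts and reports failures, prints the message).
bool check_close(const std::vector<float>& out, const std::vector<float>& ref,
                 double rtol = 1e-3, double atol = 1e-3)
{
    if(out.size() != ref.size())
        return false;
    for(std::size_t i = 0; i < out.size(); ++i)
    {
        const double err = std::abs(static_cast<double>(out[i]) - ref[i]);
        if(err > atol + rtol * std::abs(static_cast<double>(ref[i])))
        {
            std::cerr << "mismatch at " << i << ": " << out[i] << " vs " << ref[i] << '\n';
            return false;
        }
    }
    return true;
}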
@@ -158,7 +152,9 @@ bool profile_gemm_splitk_impl(int do_verification,
             std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                       << gb_per_sec << " GB/s, " << op_name << std::endl;
-            pass = pass & ck::utils::check_err(b_device_result, b_host_result);
+            // pass = pass & ck::utils::check_err(b_device_result, b_host_result);
+            pass &= ck::utils::check_err(
+                b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
             if(tflops > best_tflops)
             {
@@ -173,22 +169,12 @@ bool profile_gemm_splitk_impl(int do_verification,
                std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
            }
        }
    }
-    if constexpr(is_same<BDataType, float>::value)
-    {
-        std::cout << "Best Perf for datatype = f32";
-    }
-    else if constexpr(is_same<BDataType, half_t>::value)
-    {
-        std::cout << "Best Perf for datatype = f16";
-    }
-    std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W << " : "
-              << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
+    std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
-    return pass;
+    return pass;
 }
 } // namespace profiler
@@ -156,6 +156,7 @@ add_subdirectory(pool)
 add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
 add_subdirectory(conv_tensor_rearrange)
+add_subdirectory(transpose)
 if(GPU_TARGETS MATCHES "gfx11")
     add_subdirectory(wmma_op)
 endif()
@@ -5,19 +5,13 @@
 #include "gtest/gtest.h"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "test_tranpose_util.hpp"
+#include "test_transpose_util.hpp"
 using F16 = ck::half_t;
 using F32 = float;
-enum struct MatrixLayout
-{
-    NCDHW, // 0
-    NCHWD, // 1
-};
 template <typename Tuple>
-class TestTranspose : public ck::test::TestTranspose<typename MatrixLayout<NCDHW>::type>
+class TestTranspose : public ck::test::TestTranspose<Tuple>
 {
 };
@@ -28,6 +22,6 @@ using KernelTypes = ::testing::Types<
     >;
 // clang-format on
-TYPED_TEST_SUITE(TestGemmSplitK_MK_KN, KernelTypes);
+TYPED_TEST_SUITE(TestTranspose, KernelTypes);
-//#include "test_transpose_ut_cases.inc"
\ No newline at end of file
+//#include "test_transpose_ut_cases.inc"
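
The suite registration must name the fixture template declared above, which is what the TestGemmSplitK_MK_KN to TestTranspose fix restores. With GoogleTest typed tests, TYPED_TEST_SUITE binds a type list to a fixture template and each TYPED_TEST then runs once per listed type. A minimal self-contained sketch of the pattern (the hypothetical test body stands in for the real cases in test_transpose_ut_cases.inc):

#include "gtest/gtest.h"

template <typename T>
class TestTranspose : public ::testing::Test
{
};

using KernelTypes = ::testing::Types<float, double>;
TYPED_TEST_SUITE(TestTranspose, KernelTypes);

// Runs once for float and once for double; TypeParam is the current type.
TYPED_TEST(TestTranspose, CompilesForEachType)
{
    TypeParam zero{};
    EXPECT_EQ(zero, TypeParam{});
}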
@@ -24,8 +24,6 @@ class TestTranspose : public testing::Test
     using F32 = float;
     protected:
-    // using ALayout = std::tuple_element_t<0, Tuple>;
-    // using BLayout = std::tuple_element_t<1, Tuple>;
     using ADataType = std::tuple_element_t<0, Tuple>;
     using BDataType = std::tuple_element_t<1, Tuple>;
@@ -42,11 +40,11 @@ class TestTranspose : public testing::Test
     void RunSingle(const int N, const int C, const int D, const int H, const int W)
     {
-        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, >(
+        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
             verify_, init_method_, log_, bench_, N, C, D, H, W);
         EXPECT_TRUE(pass);
     }
 };
 } // namespace test
-} // namespace ck
\ No newline at end of file
+} // namespace ck
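
For reference, calling the profiler entry point directly mirrors RunSingle above; a sketch based on the signature introduced in this commit (argument values taken from the example's main, NumDim = 5 for the NCDHW rank):

// (do_verification, init_method, do_log, time_kernel, N, C, D, H, W)
bool pass = ck::profiler::profile_transpose_impl<ck::half_t, ck::half_t, 5>(
    1 /*do_verification*/, 1 /*init_method*/, false /*do_log*/, false /*time_kernel*/,
    4, 16, 16, 32, 5);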