Commit f3b6e205 authored by Astha Rai

fixed errors in test/profiler

parent 4dab86fe
@@ -50,14 +50,20 @@ int main()
     bool do_verification = true;
     bool time_kernel = true;

-    const int N = 4;
-    const int C = 16;
-    const int H = 32;
-    const int W = 5;
-    const int D = 16;
-    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
-    std::vector<std::size_t> nchwd = {N, C, H, W, D};
+    /**const int N = 4;
+    const int C = 16;
+    const int H = 32;
+    const int W = 5;
+    const int D = 16;**/
+    ck::index_t N = 4;
+    ck::index_t C = 16;
+    ck::index_t H = 32;
+    ck::index_t W = 5;
+    ck::index_t D = 16;
+    std::vector<ck::index_t> ncdhw = {N, C, D, H, W};
+    std::vector<ck::index_t> nchwd = {N, C, H, W, D};

     Tensor<ADataType> a(ncdhw);
     Tensor<BDataType> b(nchwd);
@@ -9,44 +9,56 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
-#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
+#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
 namespace profiler {

-template <typename ADataType, typename BDataType>
-bool profile_gemm_splitk_impl(int do_verification,
-                              int init_method,
-                              bool do_log,
-                              bool time_kernel,
-                              int N,
-                              int C,
-                              int D,
-                              int H,
-                              int W)
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
+{
+    for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
+        for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
+            for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
+                for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
+                    for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
+                    {
+                        auto a_val = A_ncdhw(n, c, d, h, w);
+                        functor(B_nchwd(n, c, h, w, d), a_val);
+                    }
+}
+
+template <typename ADataType, typename BDataType, index_t NumDim>
+bool profile_transpose_impl(int do_verification,
+                            int init_method,
+                            bool do_log,
+                            bool time_kernel,
+                            ck::index_t N,
+                            ck::index_t C,
+                            ck::index_t D,
+                            ck::index_t H,
+                            ck::index_t W)
 {
     bool pass = true;

-    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
-    std::vector<std::size_t> ndhwc = {N, D, H, W, C};
+    std::vector<ck::index_t> ncdhw = {N, C, D, H, W};
+    std::vector<ck::index_t> ndhwc = {N, D, H, W, C};
     Tensor<ADataType> a(ncdhw);
     Tensor<BDataType> b(ndhwc);
+    Tensor<BDataType> host_b(ndhwc);

     // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});

-    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
-    std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
     std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
     std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
     std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
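The ab_lengths, a_strides, and b_strides arrays above are what encode the transpose: both sides cover the same set of elements, but each stride array maps a 5-D coordinate to a different linear offset, so a plain elementwise copy between the two views permutes the data in memory. A minimal standalone sketch of that offset rule (flat_offset is a hypothetical helper, not a CK API):

#include <array>
#include <cstddef>
#include <iostream>

// Offset of a 5-D coordinate under per-dimension strides: a simple dot product.
static std::size_t flat_offset(const std::array<std::size_t, 5>& idx,
                               const std::array<std::size_t, 5>& strides)
{
    std::size_t off = 0;
    for(std::size_t i = 0; i < 5; ++i)
        off += idx[i] * strides[i];
    return off;
}

int main()
{
    const std::size_t N = 4, C = 16, D = 16, H = 32, W = 5;

    // The same stride values as in the profiler above: one coordinate space,
    // two different linearizations of it.
    const std::array<std::size_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W};
    const std::array<std::size_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1};

    // One sample in-bounds coordinate; the two offsets show where that element
    // lands in each view.
    const std::array<std::size_t, 5> idx = {1, 2, 3, 4, 5};
    std::cout << "offset in A view: " << flat_offset(idx, a_strides) << "\n"
              << "offset in B view: " << flat_offset(idx, b_strides) << "\n";
    return 0;
}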
@@ -63,25 +75,17 @@ bool profile_gemm_splitk_impl(int do_verification,
     using ElementOp = ck::tensor_operation::element_wise::PassThrough;
-    const auto element_op = ElementOp{};
+    // const auto element_op = ElementOp{};

     DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
     DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

     a_device_buf.ToDevice(a.mData.data());

-    using DeviceOp =
-        ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>,
-                                                              ck::Tuple<BDataType>,
-                                                              ElementOp,
-                                                              NumDim_m,
-                                                              NumDim_n,
-                                                              NumDim_k,
-                                                              MPerThread,
-                                                              NPerThread,
-                                                              KPerThread,
-                                                              ck::Sequence<InScalarPerVector>,
-                                                              ck::Sequence<OutScalarPerVector>>;
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
+
+    using DeviceOp = ck::tensor_operation::device::
+        DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, NumDim>;

     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -91,19 +95,7 @@ bool profile_gemm_splitk_impl(int do_verification,
     if(do_verification)
     {
-        using ReferenceTransposeInstance = ck::tensor_operation::host::ReferenceTranspose
-            << ck::Tuple<ADataType>,
-            ck::Tuple<BDataType>, ElementOp, NumDim_m, NumDim_n, NumDim_k, MPerThread, NPerThread,
-            KPerThread, ck::Sequence<InScalarPerVector>, ck::Sequence<OutScalarPerVector> > ;
-
-        auto ref_transpose = ReferenceTransposeInstance{};
-        auto ref_invoker   = ref_transpose.MakeInvoker();
-
-        auto ref_argument =
-            ref_transpose
-                .MakeArgument(ab_lengths, {a_strides}, {b_strides}, input, output, element_op{})
-
-        ref_invoker.Run(ref_argument);
+        host_elementwise4D(host_b, a, ElementOp{});
     }

     std::string best_op_name;
@@ -114,7 +106,7 @@ bool profile_gemm_splitk_impl(int do_verification,
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr = op_ptr->MakeArgumentPointer(
-            ab_lengths, {a_strides}, {b_strides}, input, output, element_op{});
+            ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{});

         auto invoker_ptr = op_ptr->MakeInvokerPointer();
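The lines between this hunk and the next are untouched by the commit; in CK profilers that part usually checks instance support and times the kernel through the invoker. A hedged sketch of that pattern (this is not the elided code itself, just the common shape of it):

        // Sketch: skip instances that cannot handle this shape, otherwise run
        // with kernel timing enabled via StreamConfig.
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::string op_name = op_ptr->GetTypeString();

            // With time_kernel == true the invoker returns an averaged kernel
            // time in milliseconds.
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

            // ... verification and perf reporting continue in the hunks below ...
        }
        else
        {
            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
        }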
@@ -128,9 +120,11 @@ bool profile_gemm_splitk_impl(int do_verification,
         if(do_verification)
         {
-            b_device_buf.FromDevice(b_device_result.mData.data());
+            b_device_buf.FromDevice(b.mData.data());

-            pass = pass & ck::utils::check_err(b_device_result, b_host_result);
+            // pass = pass & ck::utils::check_err(b_device_result, b_host_result);
+            pass &= ck::utils::check_err(
+                b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);

             if(do_log)
             {
@@ -158,7 +152,9 @@ bool profile_gemm_splitk_impl(int do_verification,
             std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                       << gb_per_sec << " GB/s, " << op_name << std::endl;

-            pass = pass & ck::utils::check_err(b_device_result, b_host_result);
+            // pass = pass & ck::utils::check_err(b_device_result, b_host_result);
+            pass &= ck::utils::check_err(
+                b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);

             if(tflops > best_tflops)
             {
@@ -173,22 +169,12 @@ bool profile_gemm_splitk_impl(int do_verification,
             std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
         }
     }
-    }
-
-    if constexpr(is_same<BDataType, float>::value)
-    {
-        std::cout << "Best Perf for datatype = f32";
-    }
-    else if constexpr(is_same<BDataType, half_t>::value)
-    {
-        std::cout << "Best Perf for datatype = f16";
-    }

-    std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W << " : "
-              << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
+    std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;

     return pass;
 }

 } // namespace profiler
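The perf line reports TFlops and GB/s, but the bookkeeping that produces ave_time, tflops, and gb_per_sec lies outside the changed hunks. For a pure transpose the FLOP count is nominal and bandwidth is the meaningful figure; a hedged sketch of how it could be derived, assuming every input element is read once and every output element written once (transpose_gb_per_sec is a hypothetical helper, not part of this commit):

#include <cstddef>

// Hypothetical helper: effective bandwidth of an N*C*D*H*W transpose given the
// averaged kernel time in milliseconds. Counts one read of ADataType and one
// write of BDataType per element.
template <typename ADataType, typename BDataType>
float transpose_gb_per_sec(std::size_t N,
                           std::size_t C,
                           std::size_t D,
                           std::size_t H,
                           std::size_t W,
                           float ave_time_ms)
{
    const std::size_t num_elements = N * C * D * H * W;
    const std::size_t num_bytes =
        sizeof(ADataType) * num_elements + sizeof(BDataType) * num_elements;

    // bytes / 1e6 / ms == GB/s (1 GB = 1e9 bytes, 1 ms = 1e-3 s)
    return num_bytes / 1.E6f / ave_time_ms;
}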
@@ -156,6 +156,7 @@ add_subdirectory(pool)
 add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
 add_subdirectory(conv_tensor_rearrange)
+add_subdirectory(transpose)
 if(GPU_TARGETS MATCHES "gfx11")
     add_subdirectory(wmma_op)
 endif()
@@ -5,19 +5,13 @@
 #include "gtest/gtest.h"

 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "test_tranpose_util.hpp"
+#include "test_transpose_util.hpp"

 using F16 = ck::half_t;
 using F32 = float;

-enum struct MatrixLayout
-{
-    NCDHW, // 0
-    NCHWD, // 1
-};
-
 template <typename Tuple>
-class TestTranspose : public ck::test::TestTranspose<typename MatrixLayout<NCDHW>::type>
+class TestTranspose : public ck::test::TestTranspose<Tuple>
 {
 };
@@ -28,6 +22,6 @@ using KernelTypes = ::testing::Types<
     >;
 // clang-format on

-TYPED_TEST_SUITE(TestGemmSplitK_MK_KN, KernelTypes);
+TYPED_TEST_SUITE(TestTranspose, KernelTypes);
 //#include "test_transpose_ut_cases.inc"
\ No newline at end of file
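With test_transpose_ut_cases.inc still commented out, the typed suite registers no test cases. A hypothetical smoke test (not part of this commit) that would exercise the fixture's RunSingle(N, C, D, H, W) shown further below, reusing the shape from the example:

// Hypothetical test case, not in this commit: runs one fixed 5-D shape through
// the profiler via the fixture helper.
TYPED_TEST(TestTranspose, SmokeSingleShape)
{
    this->RunSingle(4, 16, 16, 32, 5);
}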
@@ -24,8 +24,6 @@ class TestTranspose : public testing::Test
     using F32 = float;

     protected:
-    // using ALayout = std::tuple_element_t<0, Tuple>;
-    // using BLayout = std::tuple_element_t<1, Tuple>;
     using ADataType = std::tuple_element_t<0, Tuple>;
     using BDataType = std::tuple_element_t<1, Tuple>;
@@ -42,11 +40,11 @@ class TestTranspose : public testing::Test
     void RunSingle(const int N, const int C, const int D, const int H, const int W)
     {
-        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, >(
+        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
             verify_, init_method_, log_, bench_, N, C, D, H, W);
         EXPECT_TRUE(pass);
     }
 };

 } // namespace test
 } // namespace ck
\ No newline at end of file