Commit 244681cf authored by Astha Rai's avatar Astha Rai
Browse files

adding test files and profiler

parent 991ce41a
...@@ -59,21 +59,21 @@ int main() ...@@ -59,21 +59,21 @@ int main()
SimpleDeviceMem a_dev_buf(sizeof(ADataType) * size); SimpleDeviceMem a_dev_buf(sizeof(ADataType) * size);
SimpleDeviceMem b_dev_buf(sizeof(BDataType) * size); SimpleDeviceMem b_dev_buf(sizeof(BDataType) * size);
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()}; std::array<const void*, 1> input = {a_dev_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()}; std::array<void*, 1> output = {b_dev_buf.GetDeviceBuffer()};
using DeviceElementwisePermuteInstance = using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
ck::Tuple<BDataType>, ck::Tuple<BDataType>,
PassThrough, PassThrough,
2, 2,
2, 2,
1, 1,
8, 8,
8, 8,
8, 8,
ck::Sequence<8>, ck::Sequence<8>,
ck::Sequence<1>>; ck::Sequence<1>>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
...@@ -104,7 +104,7 @@ int main() ...@@ -104,7 +104,7 @@ int main()
{ {
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_btype = std::size_t num_byte =
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) + sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]); sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
...@@ -151,4 +151,4 @@ int main() ...@@ -151,4 +151,4 @@ int main()
} }
return 0; return 0;
} }
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_transpose_impl.hpp"
#include "profiler_operation_registry.hpp"
/// Tensor layouts that the integer passed as arg3 is cast to.
/// The numeric values are the ones a caller passes on the command line.
enum struct MatrixLayout
{
    NCDHW = 0,
    NCHWD = 1,
};
/// Element-type selections that the integer passed as arg2 is cast to.
/// The numeric values are the ones a caller passes on the command line.
enum struct DataType
{
    F32_F32_F32_F32_F32 = 0,
    F16_F16_F16_F16_F16 = 1,
};
// Operation name and description: printed in the usage text of profile_transpose
// and passed to REGISTER_PROFILER_OPERATION at the end of this file.
#define OP_NAME "transpose"
#define OP_DESC "Transpose"
int profile_transpose(int argc, char* argv[])
{
if(argc != 15)
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (NCDHW -> NDCHW);\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 13: N, C, D, H, W\n");
exit(1);
}
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int N = std::stoi(argv[8]);
const int C = std::stoi(argv[9]);
const int D = std::stoi(argv[10]);
const int H = std::stoi(argv[11]);
const int W = std::stoi(argv[12]);
using F32 = float;
using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type, auto b_type) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
// using ALayout = decltype(a_layout);
// using BLayout = decltype(b_layout);
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
do_verification, init_method, do_log, time_kernel, N, C, D, H, W);
return pass ? 0 : 1;
};
if(data_type == GemmDataType::F32_F32_F32_F32_F32)
{
return profile(F32{}, F32{});
}
else if(data_type == GemmDataType::F16_F16_F16_F16_F16)
{
return profile(F16{}, F16{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
// Register this file's entry point; profile_gemm_splitk is not declared in this file.
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose);
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType, typename BDataType>
bool profile_gemm_splitk_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
int N,
int C,
int D,
int H,
int W)
{
bool pass = true;
std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D};
Tensor<ADataType> a(ncdhw);
Tensor<BDataType> b(nchwd);
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, 1, D * H, D};
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1};
std::cout << "A: " << a.mDesc << std::endl;
std::cout << "B: " << b.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
}
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto element_op = ElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
using DeviceOp =
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
ElementOp,
NumDim_m,
NumDim_n,
NumDim_k,
MPerThread,
NPerThread,
KPerThread,
ck::Sequence<InScalarPerVector>,
ck::Sequence<OutScalarPerVector>>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
if(do_verification)
{
using ReferenceTransposeInstance = ck::tensor_operation::host::ReferenceTranspose
<< ck::Tuple<ADataType>,
ck::Tuple<BDataType>, ElementOp, NumDim_m, NumDim_n, NumDim_k, MPerThread, NPerThread,
KPerThread, ck::Sequence<InScalarPerVector>, ck::Sequence<OutScalarPerVector> > ;
auto ref_transpose = ReferenceTransposeInstance{};
auto ref_invoker = ref_transpose.MakeInvoker();
auto ref_argument =
ref_transpose
.MakeArgument(ab_lengths, {a_strides}, {b_strides}, input, output, element_op{})
ref_invoker.Run(ref_argument);
}
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, element_op{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init C to zero before profiling next kernel
b_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
if(do_verification)
{
b_device_buf.FromDevice(b_device_result.mData.data());
pass = pass & ck::utils::check_err(b_device_result, b_host_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
}
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop =
std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
std::size_t num_btype =
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
pass = pass & ck::utils::check_err(b_device_result, b_host_result);
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
}
if constexpr(is_same<BDataType, float>::value)
{
std::cout << "Best Perf for datatype = f32";
}
else if constexpr(is_same<BDataType, half_t>::value)
{
std::cout << "Best Perf for datatype = f16";
}
std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W << " : "
<< best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
\ No newline at end of file
# GPU architectures for which the transpose test is built (appended to gpu_list).
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
# Guard flag: becomes 1 once the test target has been added, so it is added at most once.
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
# Add the test for the first entry of GPU_TARGETS that appears in gpu_list.
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_gtest_executable(test_transpose test_transpose.cpp)
target_link_libraries(test_transpose PRIVATE utility device_transpose_instance)
set(target 1)
endif()
endforeach()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_tranpose_util.hpp"
// Element-type shorthands used to spell the KernelTypes list below.
using F16 = ck::half_t;
using F32 = float;
/// Tensor layouts; the explicit values match the numbering noted on the original
/// enumerators.
enum struct MatrixLayout
{
    NCDHW = 0,
    NCHWD = 1,
};
/// Typed-test fixture: forwards the (ADataType, BDataType) tuple to the shared
/// ck::test::TestTranspose helper. MatrixLayout is a plain enum, not a template, so it
/// cannot be used as the base-class argument.
template <typename Tuple>
class TestTranspose : public ck::test::TestTranspose<Tuple>
{
};
// (ADataType, BDataType) combinations the typed tests are instantiated for.
// clang-format off
using KernelTypes = ::testing::Types<
    std::tuple< F16, F16>,
    std::tuple< F32, F32>
    >;
// clang-format on
// The suite must be registered for the fixture declared in this file (TestTranspose).
TYPED_TEST_SUITE(TestTranspose, KernelTypes);
//#include "test_transpose_ut_cases.inc"
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <sstream>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/data_type.hpp"
#include "profiler/profile_transpose_impl.hpp"
namespace ck {
namespace test {
/// gtest fixture base that runs ck::profiler::profile_transpose_impl for the element
/// types carried by Tuple and expects it to report success.
///
/// @tparam Tuple std::tuple whose element 0 is the input type and element 1 the output type
template <typename Tuple>
class TestTranspose : public testing::Test
{
    protected:
    using ADataType = std::tuple_element_t<0, Tuple>;
    using BDataType = std::tuple_element_t<1, Tuple>;

    public:
    static constexpr bool verify_     = true;
    static constexpr int init_method_ = 1;     // integer value initialization
    static constexpr bool log_        = false;
    static constexpr bool bench_      = false; // measure kernel performance

    /// Runs one problem of the given extents; arguments are forwarded in N, C, D, H, W order.
    void Run(const int N, const int C, const int D, const int H, const int W)
    {
        RunSingle(N, C, D, H, W);
    }

    /// Invokes the profiler implementation with the fixture's fixed flags and records a
    /// test failure when it returns false.
    void RunSingle(const int N, const int C, const int D, const int H, const int W)
    {
        const bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
            verify_, init_method_, log_, bench_, N, C, D, H, W);
        EXPECT_TRUE(pass);
    }
};
} // namespace test
} // namespace ck
\ No newline at end of file
#pragma once
// Runs the fixture on a 16 x 8 x 16 x 32 x 8 (N, C, D, H, W) problem.
TYPED_TEST(TestTranspose, Test1)
{
    constexpr int N = 16;
    constexpr int C = 8;
    constexpr int D = 16;
    constexpr int H = 32;
    constexpr int W = 8;

    this->Run(N, C, D, H, W);
}
// Runs the fixture on a 16 x 8 x 16 x 32 x 8 (N, C, D, H, W) problem.
// NOTE(review): these extents are identical to Test1; pick a different shape if this case
// is meant to add coverage.
TYPED_TEST(TestTranspose, Test2)
{
    constexpr int N = 16;
    constexpr int C = 8;
    constexpr int D = 16;
    constexpr int H = 32;
    constexpr int W = 8;

    this->Run(N, C, D, H, W);
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment