Commit db775824 authored by Jehandad Khan's avatar Jehandad Khan
Browse files

Merge branch 'develop' into jd/dev_pkg

parents 74397984 7c0b1498
......@@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
vim \
zlib1g-dev \
openssh-server \
clang-format-10 \
kmod && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
......
......@@ -204,7 +204,7 @@ pipeline {
stage('Clang Format') {
agent{ label rocmnode("nogpu") }
environment{
execute_cmd = "find . -iname \'*.h\' \
execute_cmd = "find .. -iname \'*.h\' \
-o -iname \'*.hpp\' \
-o -iname \'*.cpp\' \
-o -iname \'*.h.in\' \
......
add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util)
add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util)
add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util)
add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util)
add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util)
add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util)
add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util)
......@@ -3,7 +3,6 @@
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
......@@ -27,10 +26,6 @@ using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
using HostInDataType = half_float::half;
using HostOutDataType = half_float::half;
using HostAccDataType = float;
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
......@@ -306,9 +301,9 @@ int main(int argc, char* argv[])
if(args.do_verification)
{
ReductionHost<HostInDataType,
HostAccDataType,
HostOutDataType,
ReductionHost<InDataType,
AccDataType,
OutDataType,
ReduceOpId,
Rank,
NumReduceDim,
......@@ -316,11 +311,8 @@ int main(int argc, char* argv[])
NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce.Run(alpha,
reinterpret_cast<const HostInDataType*>(in.mData.data()),
beta,
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
out_indices_ref.mData.data());
hostReduce.Run(
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
};
const auto i_inLengths = to_int_vector(args.inLengths);
......
add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp)
target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util)
......@@ -39,6 +39,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};
static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
......@@ -71,7 +73,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
return make_tuple(0, waveId_m, xdlops_a_idx[I1], Number<KPack>{} * xdlops_a_idx[I0]);
return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
}
__device__ static auto CalculateBThreadOriginDataIndex()
......@@ -82,7 +84,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();
return make_tuple(0, waveId_n, xdlops_b_idx[I1], Number<KPack>{} * xdlops_b_idx[I0]);
return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPerThread * xdlops_b_idx[I0]);
}
template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
......@@ -273,7 +275,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
make_tuple(I0, I0, I0, I0),
b_thread_buf);
static_for<0, KPerBlock, KPack * xdlops_gemm.K0PerXdlops>{}([&](auto k) {
static_for<0, KPerThread, KPack>{}([&](auto k) {
vector_type<FloatAB, KPack> a_thread_vec;
vector_type<FloatAB, KPack> b_thread_vec;
......@@ -300,13 +302,13 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
}
private:
// A[M0, M1, M2, KPerBlock]
// A[M0, M1, M2, KPerThread]
static constexpr auto a_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
// B[N0, N1, N2, KPerBlock]
// B[N0, N1, N2, KPerThread]
static constexpr auto b_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
// C[M, N, NumRegXdlops]
static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
......@@ -316,7 +318,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
FloatAB,
decltype(a_block_desc_m0_m1_m2_k),
decltype(a_thread_desc_),
Sequence<1, 1, 1, KPerBlock>,
Sequence<1, 1, 1, KPerThread>,
Sequence<0, 1, 2, 3>,
3,
A_K1,
......@@ -326,7 +328,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
FloatAB,
decltype(b_block_desc_n0_n1_n2_k),
decltype(b_thread_desc_),
Sequence<1, 1, 1, KPerBlock>,
Sequence<1, 1, 1, KPerThread>,
Sequence<0, 1, 2, 3>,
3,
B_K1,
......
......@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
// TODO make this logic more generic for more sub-dword datatype
if constexpr(SrcVectorDim != DstVectorDim &&
is_same<half_t, remove_cvref_t<SrcData>>::value &&
((is_same<half_t, remove_cvref_t<SrcData>>::value &&
is_same<half_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0)
SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
(is_same<int8_t, remove_cvref_t<SrcData>>::value &&
is_same<int8_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
{
// each transpose does
// DstScalarPerVector # of src vectors in src_thread_scratch_
......
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#include <cmath>
#include "data_type.hpp"
#include "half.hpp"
namespace ck {
namespace math {
static inline __device__ half_t abs(half_t x) { return __habs(x); };
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
static inline __host__ float abs(float x) { return std::abs(x); };
static inline __host__ double abs(double x) { return std::abs(x); };
static inline __host__ int8_t abs(int8_t x)
{
int8_t sgn = x >> (8 - 1);
return (x ^ sgn) - sgn;
};
static inline __host__ int32_t abs(int32_t x)
{
int32_t sgn = x >> (32 - 1);
return (x ^ sgn) - sgn;
};
static inline __host__ half_t abs(half_t x)
{
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
half_float::half abs_xx = half_float::abs(xx);
half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
return abs_x;
};
static inline __host__ float isnan(float x) { return std::isnan(x); };
static inline __host__ double isnan(double x) { return std::isnan(x); };
static inline __host__ int8_t isnan(int8_t x)
{
(void)x;
return false;
};
static inline __host__ int32_t isnan(int32_t x)
{
(void)x;
return false;
};
static inline __host__ bool isnan(half_t x)
{
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
return half_float::isnan(xx);
};
} // namespace math
} // namespace ck
......
......@@ -33,7 +33,7 @@ namespace ck {
struct float_equal_one
{
template <class T>
__device__ inline bool operator()(T x)
__host__ __device__ inline bool operator()(T x)
{
return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
};
......@@ -42,7 +42,7 @@ struct float_equal_one
struct float_equal_zero
{
template <class T>
__device__ inline bool operator()(T x)
__host__ __device__ inline bool operator()(T x)
{
return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
};
......
......@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
template <index_t NX, index_t NY>
struct transpose_vectors<half_t, NX, NY>
{
// we got [NY * NX] ammount of S data to be transposed
// we got [NY * NX] amount of S data to be transposed
static constexpr index_t s_per_x = NY;
static constexpr index_t s_per_y = NX;
......@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
}
};
// transpose int8 4x4
__device__ void transpose_int8_4x4(const int8x4_t& x0,
const int8x4_t& x1,
const int8x4_t& x2,
const int8x4_t& x3,
int8x4_t& y0,
int8x4_t& y1,
int8x4_t& y2,
int8x4_t& y3)
{
int32_t t0, t1;
int32_t z0, z1, z2, z3;
constexpr int32_t m0 = 0x05010400;
constexpr int32_t m1 = 0x05040100;
constexpr int32_t m2 = 0x07060302;
constexpr int32_t m3 = 0x07030602;
// ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
// -- -- -- -- -- -- -- -- - - - -
// index 7 6 5 4 3 2 1 0 33 77 44 88
// index is reversed because of little endianness (least significant bits first)
// clang-format off
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m0));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m0));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z0) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z1) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m3));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m3));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z2) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z3) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
// clang-format on
y0 = bit_cast<int8x4_t>(z0);
y1 = bit_cast<int8x4_t>(z1);
y2 = bit_cast<int8x4_t>(z2);
y3 = bit_cast<int8x4_t>(z3);
}
template <index_t NX, index_t NY>
struct transpose_vectors<int8_t, NX, NY>
{
// we got [NY * NX] amount of S data to be transposed
static constexpr index_t s_per_x = NY;
static constexpr index_t s_per_y = NX;
using S = int8_t;
using VX = vector_type<int8_t, s_per_x>;
using VY = vector_type<int8_t, s_per_y>;
__device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
StaticallyIndexedArray<VY&, NY>& vy_tuple)
{
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
// loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
static_for<0, NY, 4>{}([&](auto iy) {
static_for<0, NX, 4>{}([&](auto ix) {
// reference to 4 int8 data from vx_tuple
const auto& x_s4_0 = vx_tuple[ix].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_1 = vx_tuple[ix + I1].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_2 = vx_tuple[ix + I2].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_3 = vx_tuple[ix + I3].template AsType<int8x4_t>()[iy / I4];
// reference to 4 int8 data from vy_tuple
auto& y_s4_0 = vy_tuple(iy).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_1 = vy_tuple(iy + I1).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_2 = vy_tuple(iy + I2).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_3 = vy_tuple(iy + I3).template AsType<int8x4_t>()(ix / I4);
// transpose
transpose_int8_4x4(x_s4_0, x_s4_1, x_s4_2, x_s4_3, y_s4_0, y_s4_1, y_s4_2, y_s4_3);
});
});
}
};
} // namespace ck
#endif
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/utility)
......@@ -26,7 +26,6 @@
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#include <half.hpp>
#include <limits>
#include <cmath>
#include <cassert>
......@@ -34,6 +33,8 @@
#include <string>
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"
namespace ck {
......@@ -42,34 +43,10 @@ namespace host_reduce {
using ck::NanPropagation;
using ck::ReduceTensorOp;
template <typename T>
static inline bool float_equal_one(T);
static inline bool float_equal_one(float x) { return x == 1.0f; };
static inline bool float_equal_one(double x) { return x == 1.0; };
static inline bool float_equal_one(half_float::half x)
{
return x == static_cast<half_float::half>(1.0f);
};
template <typename T>
static inline bool float_equal_zero(T x);
static inline bool float_equal_zero(float x) { return x == 0.0f; };
static inline bool float_equal_zero(double x) { return x == 0.0; };
static inline bool float_equal_zero(half_float::half x)
{
return x == static_cast<half_float::half>(0.0f);
};
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
using std::abs;
using ck::math::abs;
if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
{
......@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
}
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return (std::numeric_limits<AccDataType>::max());
return (ck::NumericLimits<AccDataType>::Max());
}
else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
{
return (std::numeric_limits<AccDataType>::lowest());
return (ck::NumericLimits<AccDataType>::Lowest());
}
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
{
......@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
AccDataType& accuVal,
AccDataType currVal)
{
using std::isnan;
using ck::math::isnan;
if constexpr(!PropagateNan)
{
......@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
int& accuIndex,
int currIndex)
{
using std::isnan;
using ck::math::isnan;
if constexpr(!PropagateNan)
{
......
......@@ -32,6 +32,7 @@
#include <functional>
#include "reduction_enums.hpp"
#include "reduction_common.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
......@@ -196,10 +197,10 @@ struct ReductionHost
OutDataType* out_data,
IndexDataType* out_indices)
{
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
......@@ -227,10 +228,10 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
......@@ -263,13 +264,13 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
......@@ -303,10 +304,10 @@ struct ReductionHost
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
......@@ -330,10 +331,10 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
......@@ -361,13 +362,13 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
......
#pragma once
#include <algorithm>
#include <random>
#include "data_type.hpp"
namespace ck {
namespace utils {
// template <typename T, class Enable = void>
// struct FillUniform;
// TODO: what's wrong with this specialization???
// err: segmentation fault in mt19937 - infinite loop like.
// template <typename T>
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
// !std::is_same<T, bhalf_t>::value>::type>
// {
// int a_{0};
// int b_{5};
// // T a_ = T{0};
// // T b_ = T{5};
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen{11939};
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
// std::is_same<T, bhalf_t>::value>::type>
template <typename T>
struct FillUniform
{
float a_{0};
float b_{5};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen{11939};
std::uniform_real_distribution<> dis(a_, b_);
std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
}
};
template <typename T>
struct FillMonotonicSeq
{
T init_value_{0};
T step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, n = init_value_]() mutable {
auto tmp = n;
n += step_;
return tmp;
});
}
};
template <typename T>
struct FillConstant
{
T value_{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::fill(first, last, value_);
}
};
} // namespace utils
} // namespace ck
#pragma once
#include <cstdlib>
#include <limits>
#include <memory>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>
#include "check_err.hpp"
#include "device_base.hpp"
#include "functional2.hpp"
namespace ck {
namespace utils {
struct ProfileBestConfig
{
std::string best_op_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_tflops = std::numeric_limits<float>::max();
float best_gb_per_sec = std::numeric_limits<float>::max();
};
/**
* @brief This class describes an operation instance(s).
*
* Op instance defines a particular specializations of operator
* template. Thanks to this specific input/output data types, data
* layouts and modifying elementwise operations it is able to create
* it's input/output tensors, provide pointers to instances which
* can execute it and all operation specific parameters.
*/
template <typename OutDataType, typename... InArgTypes>
class OpInstance
{
public:
template <typename T>
using TensorPtr = std::unique_ptr<Tensor<T>>;
using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
using DeviceBuffers = std::vector<DeviceMemPtr>;
OpInstance() = default;
OpInstance(const OpInstance&) = default;
OpInstance& operator=(const OpInstance&) = default;
virtual ~OpInstance(){};
virtual InTensorsTuple GetInputTensors() const = 0;
virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;
virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;
virtual std::unique_ptr<tensor_operation::device::BaseArgument>
MakeArgumentPointer(tensor_operation::device::BaseOperator*,
const DeviceBuffers&,
const DeviceMemPtr&) const = 0;
virtual std::size_t GetFlops() const = 0;
virtual std::size_t GetBtype() const = 0;
};
/**
* @brief A generic operation instance run engine.
*/
template <typename OutDataType, typename... InArgTypes>
class OpInstanceRunEngine
{
public:
using OpInstanceT = OpInstance<InArgTypes..., OutDataType>;
template <typename T>
using TensorPtr = std::unique_ptr<Tensor<T>>;
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
using DeviceBuffers = std::vector<DeviceMemPtr>;
using InArgsTypesTuple = std::tuple<InArgTypes...>;
OpInstanceRunEngine() = delete;
template <typename ReferenceOp = std::function<void()>>
OpInstanceRunEngine(const OpInstanceT& op_instance,
const ReferenceOp& reference_op = ReferenceOp{})
: op_instance_{op_instance}
{
in_tensors_ = op_instance_.GetInputTensors();
out_tensor_ = op_instance_.GetOutputTensor();
if constexpr(std::is_invocable_v<ReferenceOp,
const Tensor<InArgTypes>&...,
Tensor<OutDataType>&>)
{
ref_output_ = op_instance_.GetOutputTensor();
CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
}
AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
out_device_buffer_ =
std::make_unique<DeviceMem>(sizeof(OutDataType) * out_tensor_->mDesc.GetElementSpace());
out_device_buffer_->SetZero();
}
virtual ~OpInstanceRunEngine(){};
template <typename OpInstancePtr>
bool Test(const std::vector<OpInstancePtr>& op_ptrs)
{
bool res{true};
for(auto& op_ptr : op_ptrs)
{
auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get());
auto argument = op_instance_.MakeArgumentPointer(
op_ptr.get(), in_device_buffers_, out_device_buffer_);
if(op_ptr->IsSupportedArgument(argument.get()))
{
invoker->Run(argument.get());
out_device_buffer_->FromDevice(out_tensor_->mData.data());
if(!ref_output_)
{
throw std::runtime_error(
"OpInstanceRunEngine::Test: Reference value not availabe."
" You have to provide reference function.");
}
// TODO: enable flexible use of custom check_error functions
res = res && check_err(out_tensor_->mData, ref_output_->mData);
out_device_buffer_->SetZero();
}
}
return res;
}
template <typename OpInstancePtr>
ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
int nrepeat = 100,
bool do_verification = false,
bool do_log = false)
{
bool res{true};
ProfileBestConfig best_config;
for(auto& op_ptr : op_ptrs)
{
auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get());
auto argument = op_instance_.MakeArgumentPointer(
op_ptr.get(), in_device_buffers_, out_device_buffer_);
if(op_ptr->IsSupportedArgument(argument.get()))
{
std::string op_name = op_ptr->GetTypeString();
float avg_time = invoker->Run(argument.get(), nrepeat);
std::size_t flops = op_instance_.GetFlops();
std::size_t num_btype = op_instance_.GetBtype();
float tflops = static_cast<float>(flops) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << op_name << std::endl;
if(tflops < best_config.best_tflops)
{
best_config.best_op_name = op_name;
best_config.best_tflops = tflops;
best_config.best_gb_per_sec = gb_per_sec;
best_config.best_avg_time = avg_time;
}
if(do_verification)
{
out_device_buffer_->FromDevice(out_tensor_->mData.data());
if(!ref_output_)
{
throw std::runtime_error(
"OpInstanceRunEngine::Profile: Reference value not availabe."
" You have to provide reference function.");
}
// TODO: enable flexible use of custom check_error functions
res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
if(do_log) {}
}
out_device_buffer_->SetZero();
}
}
return best_config;
}
void SetAtol(double a) { atol_ = a; }
void SetRtol(double r) { rtol_ = r; }
private:
template <typename F, std::size_t... Is>
void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
{
f(*std::get<Is>(in_tensors_)..., *ref_output_);
}
template <std::size_t... Is>
void AllocateDeviceInputTensors(std::index_sequence<Is...>)
{
(AllocateDeviceInputTensorsImpl<Is>(), ...);
}
template <std::size_t Index>
void AllocateDeviceInputTensorsImpl()
{
const auto& ts = std::get<Index>(in_tensors_);
in_device_buffers_
.emplace_back(
std::make_unique<DeviceMem>(sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
ts->mDesc.GetElementSpace()))
->ToDevice(ts->mData.data());
}
static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;
const OpInstanceT& op_instance_;
double rtol_{1e-5};
double atol_{1e-8};
InTensorsTuple in_tensors_;
TensorPtr<OutDataType> out_tensor_;
TensorPtr<OutDataType> ref_output_;
DeviceBuffers in_device_buffers_;
DeviceMemPtr out_device_buffer_;
template <typename T>
bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
{
return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
}
};
} // namespace utils
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment