Commit 5e6cca6f authored by carlushuang's avatar carlushuang
Browse files

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents afc7d431 3956085d
......@@ -45,6 +45,7 @@ __global__ void
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>(
......@@ -61,6 +62,20 @@ __global__ void
b_element_op,
c_element_op,
block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = p_c0_grid;
ignore = a_grid_desc_k0_m_k1;
ignore = b_grid_desc_k0_n_k1;
ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template <
......
......@@ -49,6 +49,7 @@ __global__ void
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>(
......@@ -67,6 +68,22 @@ __global__ void
b_element_op,
c_element_op,
block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = p_c0_grid;
ignore = p_c1_grid;
ignore = a_grid_desc_k0_m_k1;
ignore = b_grid_desc_k0_n_k1;
ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template <
......
......@@ -36,6 +36,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
DataType value)
{
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<DataType, DataType>;
constexpr auto I0 = Number<0>{};
......
......@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
// TODO make this logic more generic for more sub-dword datatype
if constexpr(SrcVectorDim != DstVectorDim &&
is_same<half_t, remove_cvref_t<SrcData>>::value &&
is_same<half_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0)
((is_same<half_t, remove_cvref_t<SrcData>>::value &&
is_same<half_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
(is_same<int8_t, remove_cvref_t<SrcData>>::value &&
is_same<int8_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
{
// each transpose does
// DstScalarPerVector # of src vectors in src_thread_scratch_
......
......@@ -13,6 +13,7 @@
#include "functional3.hpp"
#include "functional4.hpp"
#include "enable_if.hpp"
#include "ignore.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
......@@ -30,6 +31,7 @@
#include "debug.hpp"
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
#include "get_id.hpp"
#include "synchronization.hpp"
#include "amd_address_space.hpp"
......
......@@ -992,77 +992,6 @@ inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x)
return uint16_t(u.int32 >> 16);
}
// TODO: deprecate this
// Functor that multiplies two packed operands lane by lane, converting every
// lane to the accumulator type T first, and returns the sum of the products.
template <typename T>
struct inner_product_with_conversion
{
    // Generic packed overload. X and N appear only inside vector_type<X, N>::type,
    // so they cannot be deduced and have to be spelled out by the caller.
    template <typename X, index_t N>
    __device__ T operator()(typename vector_type<X, N>::type a,
                            typename vector_type<X, N>::type b) const
    {
        const vector_type<X, N> lhs{a};
        const vector_type<X, N> rhs{b};

        T sum = 0;

        static_for<0, N, 1>{}([&](auto lane) {
            sum += type_convert<T>(lhs.Scalars()[lane]) * type_convert<T>(rhs.Scalars()[lane]);
        });

        return sum;
    }

    // Scalar overload: a single converted product, no accumulation.
    __device__ T operator()(float_t a, float_t b) const
    {
        return type_convert<T>(a) * type_convert<T>(b);
    }

    // Packed int8 overloads for 4, 8 and 16 lanes.
    __device__ T operator()(int8x4_t a, int8x4_t b) const { return dot_int8<4>(a, b); }

    __device__ T operator()(int8x8_t a, int8x8_t b) const { return dot_int8<8>(a, b); }

    __device__ T operator()(int8x16_t a, int8x16_t b) const { return dot_int8<16>(a, b); }

    private:
    // Shared body of the int8 overloads: view both operands as N int8 lanes and
    // accumulate the converted lane products in T.
    template <index_t N, typename PackedInt8>
    __device__ T dot_int8(const PackedInt8& a, const PackedInt8& b) const
    {
        const vector_type<int8_t, N> lhs{a};
        const vector_type<int8_t, N> rhs{b};

        T sum = 0;

        static_for<0, N, 1>{}([&](auto lane) {
            sum += type_convert<T>(lhs.template AsType<int8_t>()[lane]) *
                   type_convert<T>(rhs.template AsType<int8_t>()[lane]);
        });

        return sum;
    }
};
template <typename T>
struct NumericLimits
{
......
#pragma once
#include "amd_buffer_addressing.hpp"
#include "c_style_pointer_cast.hpp"
#include "config.hpp"
#include "enable_if.hpp"
#include "c_style_pointer_cast.hpp"
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
namespace ck {
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
template <AddressSpaceEnum BufferAddressSpace,
typename T,
typename ElementSpaceSize,
......@@ -316,9 +321,7 @@ struct DynamicBuffer
{
if(is_valid_element)
{
// FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
// calling it
atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
atomic_add<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
}
}
}
......
#pragma once
#include "data_type.hpp"
namespace ck {
// Primary template for an atomic add of x onto *p_dst.
// Only declared here; every supported X is provided as an explicit specialization,
// so using an X without a specialization leaves the call without a definition.
template <typename X>
__device__ X atomic_add(X* p_dst, const X& x);
// Scalar specializations: each one forwards to atomicAdd and returns whatever
// atomicAdd returns.
//
// They are marked inline because an explicit (full) specialization of a function
// template is an ordinary function definition: without inline, including this
// header from more than one translation unit defines the same function several
// times.
template <>
inline __device__ int32_t atomic_add<int32_t>(int32_t* p_dst, const int32_t& x)
{
    return atomicAdd(p_dst, x);
}

template <>
inline __device__ uint32_t atomic_add<uint32_t>(uint32_t* p_dst, const uint32_t& x)
{
    return atomicAdd(p_dst, x);
}

template <>
inline __device__ float atomic_add<float>(float* p_dst, const float& x)
{
    return atomicAdd(p_dst, x);
}
// float2_t specialization.
//
// The two float lanes are updated by two separate atomicAdd calls, so each lane
// is updated atomically on its own but the pair is not updated as one unit.
// The result packs the two values returned by those calls.
//
// Marked inline because an explicit specialization defined in a header is
// otherwise a non-inline function definition repeated in every translation unit
// that includes the header.
template <>
inline __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    const vector_type<float, 2> vx{x};
    vector_type<float, 2> vy{0};

    // view the destination as two consecutive floats
    float* p_lane = c_style_pointer_cast<float*>(p_dst);

    vy.template AsType<float>()(I0) = atomicAdd(p_lane, vx.template AsType<float>()[I0]);
    vy.template AsType<float>()(I1) = atomicAdd(p_lane + 1, vx.template AsType<float>()[I1]);

    return vy.template AsType<float2_t>()[I0];
}
} // namespace ck
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#include <cmath>
#include "data_type.hpp"
#include "half.hpp"
namespace ck {
namespace math {
// Device-side half_t overloads; each one forwards to the matching half intrinsic.
static inline __device__ half_t abs(half_t x)
{
    return __habs(x);
}

static inline __device__ half_t sqrtf(half_t x)
{
    return hsqrt(x);
}

static inline __device__ bool isnan(half_t x)
{
    return __hisnan(x);
}
// Host-side absolute value for floating-point types: defer to the standard library.
static inline __host__ float abs(float x)
{
    return std::abs(x);
}

static inline __host__ double abs(double x)
{
    return std::abs(x);
}

// Host-side branch-free absolute value for integers.
// sign_mask is the operand shifted right by (bit width - 1): zero for a
// non-negative operand, all ones for a negative one (arithmetic shift).
// XOR-ing with the mask and subtracting it leaves non-negative values untouched
// and negates negative ones.
static inline __host__ int8_t abs(int8_t x)
{
    const int8_t sign_mask = x >> 7;
    return (x ^ sign_mask) - sign_mask;
}

static inline __host__ int32_t abs(int32_t x)
{
    const int32_t sign_mask = x >> 31;
    return (x ^ sign_mask) - sign_mask;
}
// Host-side absolute value for half_t: the bits are reinterpreted as
// half_float::half, half_float::abs is applied, and the result is reinterpreted
// back as half_t.
static inline __host__ half_t abs(half_t x)
{
    half_float::half magnitude = half_float::abs(*reinterpret_cast<half_float::half*>(&x));
    return *reinterpret_cast<half_t*>(&magnitude);
}
// Host-side NaN predicates.
//
// All overloads return bool, matching the half_t overloads of isnan in this
// header. The previous versions returned the argument type (float, double,
// int8_t, int32_t), which only worked through an implicit conversion of the
// boolean result.
static inline __host__ bool isnan(float x)
{
    return std::isnan(x);
}

static inline __host__ bool isnan(double x)
{
    return std::isnan(x);
}

// Integer values are never NaN.
static inline __host__ bool isnan(int8_t x)
{
    (void)x;
    return false;
}

static inline __host__ bool isnan(int32_t x)
{
    (void)x;
    return false;
}
// Host-side NaN test for half_t: reinterpret the bits as half_float::half and
// ask half_float.
static inline __host__ bool isnan(half_t x)
{
    return half_float::isnan(*reinterpret_cast<half_float::half*>(&x));
}
} // namespace math
} // namespace ck
......
......@@ -33,7 +33,7 @@ namespace ck {
struct float_equal_one
{
template <class T>
__device__ inline bool operator()(T x)
__host__ __device__ inline bool operator()(T x)
{
return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
};
......@@ -42,7 +42,7 @@ struct float_equal_one
struct float_equal_zero
{
template <class T>
__device__ inline bool operator()(T x)
__host__ __device__ inline bool operator()(T x)
{
return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
};
......
......@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
template <index_t NX, index_t NY>
struct transpose_vectors<half_t, NX, NY>
{
// we got [NY * NX] ammount of S data to be transposed
// we got [NY * NX] amount of S data to be transposed
static constexpr index_t s_per_x = NY;
static constexpr index_t s_per_y = NX;
......@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
}
};
// transpose int8 4x4
// Transposes a 4x4 tile of int8 values using byte permutes.
// x0..x3 are the four input rows (4 bytes each); on return yk holds byte k of
// x0, x1, x2 and x3, in that order from the least significant byte up.
__device__ void transpose_int8_4x4(const int8x4_t& x0,
const int8x4_t& x1,
const int8x4_t& x2,
const int8x4_t& x3,
int8x4_t& y0,
int8x4_t& y1,
int8x4_t& y2,
int8x4_t& y3)
{
int32_t t0, t1;
int32_t z0, z1, z2, z3;
// Byte selectors for v_perm_b32. Each selector byte picks one byte out of the
// 8-byte pool formed by the two data operands: indices 0..3 address the second
// data operand, 4..7 the first one (see the worked example below).
// m0: interleave bytes 0 and 1 of the two operands
constexpr int32_t m0 = 0x05010400;
// m1: low halves of both operands
constexpr int32_t m1 = 0x05040100;
// m2: high halves of both operands
constexpr int32_t m2 = 0x07060302;
// m3: interleave bytes 2 and 3 of the two operands
constexpr int32_t m3 = 0x07030602;
// ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
// -- -- -- -- -- -- -- -- - - - -
// index 7 6 5 4 3 2 1 0 33 77 44 88
// index is reversed because of little endianness (least significant bits first)
// Steps 1-2: t0/t1 = bytes 0,1 of (x0,x1) / (x2,x3) interleaved.
// Steps 3-4: z0 = column 0, z1 = column 1.
// Steps 5-6: t0/t1 are reused for bytes 2,3 of (x0,x1) / (x2,x3) interleaved.
// Steps 7-8: z2 = column 2, z3 = column 3.
// The statements depend on each other through t0/t1, so their order matters.
// clang-format off
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m0));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m0));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z0) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z1) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m3));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m3));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z2) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z3) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
// clang-format on
// reinterpret the packed 32-bit results as the output int8x4 vectors
y0 = bit_cast<int8x4_t>(z0);
y1 = bit_cast<int8x4_t>(z1);
y2 = bit_cast<int8x4_t>(z2);
y3 = bit_cast<int8x4_t>(z3);
}
// int8 specialization: NX source vectors holding NY scalars each are rearranged
// into NY destination vectors holding NX scalars each, one 4x4 tile at a time.
template <index_t NX, index_t NY>
struct transpose_vectors<int8_t, NX, NY>
{
    // [NY * NX] scalars take part in the transpose
    static constexpr index_t s_per_x = NY;
    static constexpr index_t s_per_y = NX;

    using S  = int8_t;
    using VX = vector_type<int8_t, s_per_x>;
    using VY = vector_type<int8_t, s_per_y>;

    __device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
                               StaticallyIndexedArray<VY&, NY>& vy_tuple)
    {
        static constexpr auto I1 = Number<1>{};
        static constexpr auto I2 = Number<2>{};
        static constexpr auto I3 = Number<3>{};
        static constexpr auto I4 = Number<4>{};

        // the tiling below only handles whole 4x4 tiles
        static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");

        // visit every 4x4 tile; (ix, iy) is the tile origin
        static_for<0, NY, 4>{}([&](auto iy) {
            static_for<0, NX, 4>{}([&](auto ix) {
                // which packed int8x4 group inside a source / destination vector
                const auto src_group = iy / I4;
                const auto dst_group = ix / I4;

                // four consecutive source vectors, one int8x4 group from each
                const auto& src0 = vx_tuple[ix].template AsType<int8x4_t>()[src_group];
                const auto& src1 = vx_tuple[ix + I1].template AsType<int8x4_t>()[src_group];
                const auto& src2 = vx_tuple[ix + I2].template AsType<int8x4_t>()[src_group];
                const auto& src3 = vx_tuple[ix + I3].template AsType<int8x4_t>()[src_group];

                // four consecutive destination vectors, one int8x4 group from each
                auto& dst0 = vy_tuple(iy).template AsType<int8x4_t>()(dst_group);
                auto& dst1 = vy_tuple(iy + I1).template AsType<int8x4_t>()(dst_group);
                auto& dst2 = vy_tuple(iy + I2).template AsType<int8x4_t>()(dst_group);
                auto& dst3 = vy_tuple(iy + I3).template AsType<int8x4_t>()(dst_group);

                transpose_int8_4x4(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
            });
        });
    }
};
} // namespace ck
#endif
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/tensor_operation_instance/cpu)
\ No newline at end of file
add_subdirectory(src/utility)
add_subdirectory(src/tensor_operation_instance/cpu)
......@@ -26,7 +26,6 @@
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#include <half.hpp>
#include <limits>
#include <cmath>
#include <cassert>
......@@ -34,6 +33,8 @@
#include <string>
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"
namespace ck {
......@@ -42,34 +43,10 @@ namespace host_reduce {
using ck::NanPropagation;
using ck::ReduceTensorOp;
template <typename T>
static inline bool float_equal_one(T);
static inline bool float_equal_one(float x) { return x == 1.0f; };
static inline bool float_equal_one(double x) { return x == 1.0; };
static inline bool float_equal_one(half_float::half x)
{
return x == static_cast<half_float::half>(1.0f);
};
template <typename T>
static inline bool float_equal_zero(T x);
static inline bool float_equal_zero(float x) { return x == 0.0f; };
static inline bool float_equal_zero(double x) { return x == 0.0; };
static inline bool float_equal_zero(half_float::half x)
{
return x == static_cast<half_float::half>(0.0f);
};
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
using std::abs;
using ck::math::abs;
if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
{
......@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
}
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return (std::numeric_limits<AccDataType>::max());
return (ck::NumericLimits<AccDataType>::Max());
}
else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
{
return (std::numeric_limits<AccDataType>::lowest());
return (ck::NumericLimits<AccDataType>::Lowest());
}
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
{
......@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
AccDataType& accuVal,
AccDataType currVal)
{
using std::isnan;
using ck::math::isnan;
if constexpr(!PropagateNan)
{
......@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
int& accuIndex,
int currIndex)
{
using std::isnan;
using ck::math::isnan;
if constexpr(!PropagateNan)
{
......
......@@ -32,6 +32,7 @@
#include <functional>
#include "reduction_enums.hpp"
#include "reduction_common.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
......@@ -196,10 +197,10 @@ struct ReductionHost
OutDataType* out_data,
IndexDataType* out_indices)
{
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
......@@ -227,10 +228,10 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
......@@ -263,13 +264,13 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
......@@ -303,10 +304,10 @@ struct ReductionHost
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
......@@ -330,10 +331,10 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
......@@ -361,13 +362,13 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
......
#pragma once
#include <algorithm>
#include <random>
#include "data_type.hpp"
namespace ck {
namespace utils {
// template <typename T, class Enable = void>
// struct FillUniform;
// TODO: what's wrong with this specialization???
// err: segmentation fault in mt19937 - infinite loop like.
// template <typename T>
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
// !std::is_same<T, bhalf_t>::value>::type>
// {
// int a_{0};
// int b_{5};
// // T a_ = T{0};
// // T b_ = T{5};
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen{11939};
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
// std::is_same<T, bhalf_t>::value>::type>
// Fills a range with pseudo-random values drawn from a real uniform distribution
// constructed from (a_, b_), each converted to T with ck::type_convert.
// The generator is re-created with the fixed seed 11939 on every call, so the
// generated sequence is the same for every invocation.
template <typename T>
struct FillUniform
{
    float a_{0};
    float b_{5};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 engine{11939};
        std::uniform_real_distribution<> sampler(a_, b_);

        for(; first != last; ++first)
        {
            *first = ck::type_convert<T>(sampler(engine));
        }
    }
};
// Fills a range with the arithmetic sequence
// init_value_, init_value_ + step_, init_value_ + 2 * step_, ...
template <typename T>
struct FillMonotonicSeq
{
    T init_value_{0};
    T step_{1};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        // running value; starts at init_value_ and advances after every store
        auto current = init_value_;

        for(; first != last; ++first)
        {
            auto emitted = current;
            current += step_;
            *first = emitted;
        }
    }
};
// Fills a range by assigning value_ to every element.
template <typename T>
struct FillConstant
{
    T value_{0};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        for(; first != last; ++first)
        {
            *first = value_;
        }
    }
};
} // namespace utils
} // namespace ck
#pragma once
#include <cstdlib>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>
#include "check_err.hpp"
#include "device_base.hpp"
#include "functional2.hpp"
namespace ck {
namespace utils {
// Result record of a profiling run: the name of the selected operation and its
// measured figures. The name starts empty and every numeric field starts at the
// largest finite float until a candidate is recorded.
struct ProfileBestConfig
{
    std::string best_op_name;
    float best_avg_time{std::numeric_limits<float>::max()};
    float best_tflops{std::numeric_limits<float>::max()};
    float best_gb_per_sec{std::numeric_limits<float>::max()};
};
/**
 * @brief Interface describing one operation instance.
 *
 * An implementation fixes the input/output data types of an operator and
 * is responsible for creating the input and output tensors, for creating
 * the invoker and argument objects of a given device operator, and for
 * reporting the operation's FLOP and byte counts.
 *
 * @tparam OutDataType  Element type of the output tensor.
 * @tparam InArgTypes   Element types of the input tensors, in order.
 */
template <typename OutDataType, typename... InArgTypes>
class OpInstance
{
    public:
    template <typename T>
    using TensorPtr = std::unique_ptr<Tensor<T>>;

    using DeviceMemPtr   = std::unique_ptr<DeviceMem>;
    using DeviceBuffers  = std::vector<DeviceMemPtr>;
    using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;

    OpInstance()                  = default;
    OpInstance(const OpInstance&) = default;
    OpInstance& operator=(const OpInstance&) = default;
    virtual ~OpInstance()                    = default;

    // Tensor factories: one tensor per input type, and the output tensor.
    virtual InTensorsTuple GetInputTensors() const         = 0;
    virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;

    // Factories for the invoker / argument objects of the given device operator.
    // The argument factory receives the device input buffers and the device
    // output buffer.
    virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
    MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;

    virtual std::unique_ptr<tensor_operation::device::BaseArgument>
    MakeArgumentPointer(tensor_operation::device::BaseOperator*,
                        const DeviceBuffers&,
                        const DeviceMemPtr&) const = 0;

    // Work metrics used by the profiler to derive TFlops and GB/s.
    virtual std::size_t GetFlops() const = 0;
    virtual std::size_t GetBtype() const = 0;
};
/**
 * @brief A generic operation instance run engine.
 *
 * On construction the engine asks the operation instance for its input and
 * output tensors, optionally computes a reference output, uploads the inputs
 * to device buffers and allocates a zeroed device output buffer. Test() and
 * Profile() then run a list of device operators against those buffers.
 *
 * The engine stores a reference to the operation instance, so the instance
 * must outlive the engine.
 *
 * @tparam OutDataType  Element type of the output tensor.
 * @tparam InArgTypes   Element types of the input tensors, in order.
 */
template <typename OutDataType, typename... InArgTypes>
class OpInstanceRunEngine
{
    public:
    // OpInstance is declared as <OutDataType, InArgTypes...>; the arguments have to be
    // passed in that order, otherwise the first input type is taken as the output type
    // as soon as the types differ.
    using OpInstanceT = OpInstance<OutDataType, InArgTypes...>;
    template <typename T>
    using TensorPtr        = std::unique_ptr<Tensor<T>>;
    using DeviceMemPtr     = std::unique_ptr<DeviceMem>;
    using InTensorsTuple   = std::tuple<TensorPtr<InArgTypes>...>;
    using DeviceBuffers    = std::vector<DeviceMemPtr>;
    using InArgsTypesTuple = std::tuple<InArgTypes...>;

    OpInstanceRunEngine() = delete;

    /**
     * @param op_instance   Operation instance providing tensors, invokers and arguments.
     * @param reference_op  Optional callable taking (const Tensor<In>&..., Tensor<Out>&).
     *                      When it is invocable with that signature it is run once to
     *                      fill the reference output; otherwise no reference is created.
     */
    template <typename ReferenceOp = std::function<void()>>
    OpInstanceRunEngine(const OpInstanceT& op_instance,
                        const ReferenceOp& reference_op = ReferenceOp{})
        : op_instance_{op_instance}
    {
        in_tensors_ = op_instance_.GetInputTensors();
        out_tensor_ = op_instance_.GetOutputTensor();

        if constexpr(std::is_invocable_v<ReferenceOp,
                                         const Tensor<InArgTypes>&...,
                                         Tensor<OutDataType>&>)
        {
            ref_output_ = op_instance_.GetOutputTensor();
            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
        }
        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
        out_device_buffer_ =
            std::make_unique<DeviceMem>(sizeof(OutDataType) * out_tensor_->mDesc.GetElementSpace());
        out_device_buffer_->SetZero();
    }

    virtual ~OpInstanceRunEngine() = default;

    /**
     * @brief Run every supported operator once and compare against the reference.
     *
     * @return true when every supported operator matched the reference output.
     * @throws std::runtime_error when a supported operator is found but no
     *         reference output exists.
     */
    template <typename OpInstancePtr>
    bool Test(const std::vector<OpInstancePtr>& op_ptrs)
    {
        bool res{true};
        for(auto& op_ptr : op_ptrs)
        {
            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
            auto argument = op_instance_.MakeArgumentPointer(
                op_ptr.get(), in_device_buffers_, out_device_buffer_);
            if(op_ptr->IsSupportedArgument(argument.get()))
            {
                invoker->Run(argument.get());
                out_device_buffer_->FromDevice(out_tensor_->mData.data());
                if(!ref_output_)
                {
                    throw std::runtime_error(
                        "OpInstanceRunEngine::Test: Reference value not availabe."
                        " You have to provide reference function.");
                }
                // TODO: enable flexible use of custom check_error functions
                // NOTE(review): this uses check_err with its default tolerances, so
                // SetAtol()/SetRtol() have no effect here, unlike in Profile() - confirm
                // whether that is intended.
                res = res && check_err(out_tensor_->mData, ref_output_->mData);
                // clear the output so the next operator starts from zeros
                out_device_buffer_->SetZero();
            }
        }
        return res;
    }

    /**
     * @brief Time every supported operator and report the fastest one.
     *
     * @param op_ptrs          Device operators to try.
     * @param nrepeat          Repeat count forwarded to the invoker.
     * @param do_verification  Also compare each result against the reference output.
     * @param do_log           Currently unused.
     * @return The configuration with the smallest average time; left at its
     *         initial values when no operator supports the argument.
     * @throws std::runtime_error when verification is requested but no
     *         reference output exists.
     */
    template <typename OpInstancePtr>
    ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
                              int nrepeat          = 100,
                              bool do_verification = false,
                              bool do_log          = false)
    {
        (void)do_log; // no logging is implemented yet

        ProfileBestConfig best_config;

        for(auto& op_ptr : op_ptrs)
        {
            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
            auto argument = op_instance_.MakeArgumentPointer(
                op_ptr.get(), in_device_buffers_, out_device_buffer_);
            if(op_ptr->IsSupportedArgument(argument.get()))
            {
                std::string op_name = op_ptr->GetTypeString();
                float avg_time      = invoker->Run(argument.get(), nrepeat);

                std::size_t flops     = op_instance_.GetFlops();
                std::size_t num_btype = op_instance_.GetBtype();
                // the divisors match avg_time being in milliseconds, as printed below
                float tflops     = static_cast<float>(flops) / 1.E9 / avg_time;
                float gb_per_sec = num_btype / 1.E6 / avg_time;

                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                          << " GB/s, " << op_name << std::endl;

                // The best configuration is the fastest one. best_avg_time starts at
                // float max, so the first supported operator is always recorded.
                // (Comparing "tflops < best_tflops" would select the slowest operator.)
                if(avg_time < best_config.best_avg_time)
                {
                    best_config.best_op_name    = op_name;
                    best_config.best_tflops     = tflops;
                    best_config.best_gb_per_sec = gb_per_sec;
                    best_config.best_avg_time   = avg_time;
                }

                if(do_verification)
                {
                    out_device_buffer_->FromDevice(out_tensor_->mData.data());
                    if(!ref_output_)
                    {
                        throw std::runtime_error(
                            "OpInstanceRunEngine::Profile: Reference value not availabe."
                            " You have to provide reference function.");
                    }
                    // TODO: enable flexible use of custom check_error functions
                    // The return type cannot carry the verification result, so a
                    // mismatch is at least reported instead of being dropped.
                    if(!CheckErr(out_tensor_->mData, ref_output_->mData))
                    {
                        std::cout << "Verification failed: " << op_name << std::endl;
                    }
                }
                // clear the output so the next operator starts from zeros
                out_device_buffer_->SetZero();
            }
        }
        return best_config;
    }

    // Tolerances forwarded by CheckErr().
    void SetAtol(double a) { atol_ = a; }
    void SetRtol(double r) { rtol_ = r; }

    private:
    // Calls f(input tensors..., reference output).
    template <typename F, std::size_t... Is>
    void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
    {
        f(*std::get<Is>(in_tensors_)..., *ref_output_);
    }

    // Creates one device buffer per input tensor, in input order.
    template <std::size_t... Is>
    void AllocateDeviceInputTensors(std::index_sequence<Is...>)
    {
        (AllocateDeviceInputTensorsImpl<Is>(), ...);
    }

    // Allocates the device buffer for input number Index and uploads the host data.
    template <std::size_t Index>
    void AllocateDeviceInputTensorsImpl()
    {
        const auto& ts = std::get<Index>(in_tensors_);
        in_device_buffers_
            .emplace_back(
                std::make_unique<DeviceMem>(sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
                                            ts->mDesc.GetElementSpace()))
            ->ToDevice(ts->mData.data());
    }

    static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;
    const OpInstanceT& op_instance_;
    double rtol_{1e-5};
    double atol_{1e-8};

    InTensorsTuple in_tensors_;
    TensorPtr<OutDataType> out_tensor_;
    TensorPtr<OutDataType> ref_output_;

    DeviceBuffers in_device_buffers_;
    DeviceMemPtr out_device_buffer_;

    // Compares device and reference output using the engine's tolerances.
    // NOTE(review): atol_ is passed before rtol_ - confirm this matches the
    // parameter order of ck::utils::check_err.
    template <typename T>
    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
    {
        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
    }
};
} // namespace utils
} // namespace ck
......@@ -28,19 +28,19 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances = std::tuple<
//#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 32, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 32, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 32, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>
// clang-format on
>;
......
# Build script for the conv_fwd_util shared library.

# Prepend (BEFORE) the listed project header directories to the include search path.
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
)
# Source files that make up the library.
set(CONV_FWD_UTIL_SOURCE
conv_fwd_util.cpp
)
# Build conv_fwd_util as a shared library from the sources listed above.
add_library(conv_fwd_util SHARED ${CONV_FWD_UTIL_SOURCE})
# host_tensor is linked PRIVATE, i.e. it is not added to conv_fwd_util's link interface.
target_link_libraries(conv_fwd_util PRIVATE host_tensor)
# NOTE(review): no feature names follow PUBLIC, so this call appears to request
# nothing - confirm whether a C++ standard feature was meant to be listed here.
target_compile_features(conv_fwd_util PUBLIC)
# Compile the library sources as position-independent code.
set_target_properties(conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
# Add HALF_INCLUDE_DIR (set outside this file) as a SYSTEM include directory;
# the BUILD_INTERFACE generator expression restricts it to the build tree.
target_include_directories(conv_fwd_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
# clang_tidy_check is not defined in this file; presumably it registers a
# clang-tidy run for the target - TODO confirm against the project's CMake modules.
clang_tidy_check(conv_fwd_util)
#include "conv_fwd_util.hpp"
namespace ck {
namespace utils {
namespace conv {
/**
 * @brief Compute the number of floating point operations of a convolution.
 *
 * The value returned is
 * 2 * N * K * C * prod(output_spatial_lengths) * prod(filter_spatial_lengths),
 * with every factor converted to std::size_t before it is multiplied in.
 *
 * @param[in] N                      Batch size.
 * @param[in] C                      Number of input channels.
 * @param[in] K                      Number of output channels.
 * @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
 * @param[in] output_spatial_lengths Convolution output spatial dimensions lengths.
 *
 * @return The number of flops.
 */
std::size_t get_flops(ck::index_t N,
                      ck::index_t C,
                      ck::index_t K,
                      const std::vector<ck::index_t>& filter_spatial_lengths,
                      const std::vector<ck::index_t>& output_spatial_lengths)
{
    // Product of all entries of a length vector; an empty vector yields 1.
    const auto volume_of = [](const std::vector<ck::index_t>& lengths) {
        std::size_t volume = 1;
        for(const ck::index_t len : lengths)
        {
            volume *= static_cast<std::size_t>(len);
        }
        return volume;
    };

    const std::size_t output_volume = volume_of(output_spatial_lengths);
    const std::size_t filter_volume = volume_of(filter_spatial_lengths);

    // One multiply and one add per (output element, input channel, filter tap).
    std::size_t flops = 2;
    flops *= static_cast<std::size_t>(N);
    flops *= static_cast<std::size_t>(K);
    flops *= output_volume;
    flops *= static_cast<std::size_t>(C);
    flops *= filter_volume;
    return flops;
}
/**
 * @brief Default-construct a 2D convolution problem description.
 *
 * The per-dimension members are initialised with the (count, value) form,
 * i.e. two entries holding the same value each.
 */
ConvParams::ConvParams()
: num_dim_spatial(2),           // two spatial dimensions
N(128),                         // batch size
K(256),                         // output channels
C(192),                         // input channels
filter_spatial_lengths(2, 3),   // filter length 3 in both dimensions
input_spatial_lengths(2, 71),   // input length 71 in both dimensions
conv_filter_strides(2, 2),      // stride 2 in both dimensions
conv_filter_dilations(2, 1),    // dilation 1 in both dimensions
input_left_pads(2, 1),          // left padding 1 in both dimensions
input_right_pads(2, 1)          // right padding 1 in both dimensions
{
}
/**
 * @brief Construct a convolution problem description from explicit values.
 *
 * @param[in] n_dim          Number of spatial dimensions.
 * @param[in] n_batch        Batch size (stored as N).
 * @param[in] n_out_channels Number of output channels (stored as K).
 * @param[in] n_in_channels  Number of input channels (stored as C).
 * @param[in] filters_len    Filter length per spatial dimension.
 * @param[in] input_len      Input length per spatial dimension.
 * @param[in] strides        Convolution stride per spatial dimension.
 * @param[in] dilations      Convolution dilation per spatial dimension.
 * @param[in] left_pads      Left input padding per spatial dimension.
 * @param[in] right_pads     Right input padding per spatial dimension.
 *
 * @throws std::runtime_error if any of the per-dimension vectors does not
 *         hold exactly @p n_dim entries.
 */
ConvParams::ConvParams(ck::index_t n_dim,
                       ck::index_t n_batch,
                       ck::index_t n_out_channels,
                       ck::index_t n_in_channels,
                       const std::vector<ck::index_t>& filters_len,
                       const std::vector<ck::index_t>& input_len,
                       const std::vector<ck::index_t>& strides,
                       const std::vector<ck::index_t>& dilations,
                       const std::vector<ck::index_t>& left_pads,
                       const std::vector<ck::index_t>& right_pads)
    : num_dim_spatial(n_dim),
      N(n_batch),
      K(n_out_channels),
      C(n_in_channels),
      filter_spatial_lengths(filters_len),
      input_spatial_lengths(input_len),
      conv_filter_strides(strides),
      conv_filter_dilations(dilations),
      input_left_pads(left_pads),
      input_right_pads(right_pads)
{
    // Same conversion the implicit signed/unsigned comparison performed, spelled
    // out; a negative dimension count becomes a huge value and never matches.
    const auto expected_size = static_cast<std::size_t>(num_dim_spatial);

    if(filter_spatial_lengths.size() != expected_size ||
       input_spatial_lengths.size() != expected_size ||
       conv_filter_strides.size() != expected_size ||
       conv_filter_dilations.size() != expected_size ||
       input_left_pads.size() != expected_size || input_right_pads.size() != expected_size)
    {
        // Report the constructor as the failing function (the message previously
        // named GetOutputSpatialLengths, which is misleading when debugging).
        throw(std::runtime_error(
            "ConvParams::ConvParams: "
            "parameter size is different from number of declared dimensions!"));
    }
}
/**
 * @brief Compute the convolution output length for every spatial dimension.
 *
 * For each dimension i:
 *   eff    = (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1
 *   out[i] = (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - eff)
 *            / conv_filter_strides[i] + 1
 *
 * @return Vector with num_dim_spatial output lengths.
 *
 * @throws std::runtime_error if any per-dimension member does not hold exactly
 *         num_dim_spatial entries, or if a convolution stride is zero.
 */
std::vector<ck::index_t> ConvParams::GetOutputSpatialLengths() const
{
    // Members are publicly writable (see parse_conv_params), so the sizes are
    // re-validated here and not only in the constructor.
    const auto expected_size = static_cast<std::size_t>(num_dim_spatial);

    if(filter_spatial_lengths.size() != expected_size ||
       input_spatial_lengths.size() != expected_size ||
       conv_filter_strides.size() != expected_size ||
       conv_filter_dilations.size() != expected_size ||
       input_left_pads.size() != expected_size || input_right_pads.size() != expected_size)
    {
        throw(std::runtime_error(
            "ConvParams::GetOutputSpatialLengths: "
            "parameter size is different from number of declared dimensions!"));
    }

    std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
    for(ck::index_t i = 0; i < num_dim_spatial; ++i)
    {
        // Integer division by zero is undefined behaviour; report it instead.
        if(conv_filter_strides[i] == 0)
        {
            throw(std::runtime_error("ConvParams::GetOutputSpatialLengths: "
                                     "convolution filter stride must not be zero!"));
        }

        // XEff = (X - 1) * conv_dilation_w + 1;
        // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
        const ck::index_t idx_eff =
            (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
        out_spatial_len[i] =
            (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
                conv_filter_strides[i] +
            1;
    }
    return out_spatial_len;
}
/**
 * @brief Read convolution parameters from consecutive command line arguments.
 *
 * Starting at argv[arg_idx], the arguments are consumed in this order:
 * N, K, C, then num_dim_spatial values each for filter lengths, input lengths,
 * strides, dilations, left pads and right pads
 * (3 + 6 * num_dim_spatial arguments in total).
 *
 * @param[in] num_dim_spatial Number of spatial dimensions.
 * @param[in] arg_idx         Index of the first argument to read.
 * @param[in] argv            Argument array; assumed to be the null-terminated
 *                            array handed to main().
 *
 * @return The parsed parameters.
 *
 * @throws std::runtime_error    if the argument list ends before all values
 *                               have been read.
 * @throws std::invalid_argument / std::out_of_range propagated from std::stoi
 *                               for malformed or out-of-range numbers.
 */
ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
    // Fetches the next argument as an integer. argv[argc] is a null pointer,
    // so hitting one means too few arguments were supplied; handing it to
    // std::stoi would be undefined behaviour.
    const auto next_int = [&]() -> int {
        const char* const arg = argv[arg_idx];
        if(arg == nullptr)
        {
            throw std::runtime_error("parse_conv_params: not enough command line arguments!");
        }
        ++arg_idx;
        return std::stoi(arg);
    };

    // Resizes a per-dimension member and fills it from the following arguments.
    const auto read_per_dim = [&](auto& values) {
        values.resize(num_dim_spatial);
        for(auto& value : values)
        {
            value = next_int();
        }
    };

    ck::utils::conv::ConvParams params;

    params.num_dim_spatial = num_dim_spatial;
    params.N               = next_int();
    params.K               = next_int();
    params.C               = next_int();

    read_per_dim(params.filter_spatial_lengths);
    read_per_dim(params.input_spatial_lengths);
    read_per_dim(params.conv_filter_strides);
    read_per_dim(params.conv_filter_dilations);
    read_per_dim(params.input_left_pads);
    read_per_dim(params.input_right_pads);

    return params;
}
/**
 * @brief Build the host tensor descriptor of a convolution output tensor.
 *
 * Selects the layout tag NWK, NHWK or NDHWK for 1, 2 or 3 spatial dimensions
 * and forwards it together with @p dims to get_host_tensor_descriptor.
 *
 * @param[in] dims            Tensor dimensions, passed through unchanged.
 * @param[in] num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 *
 * @throws std::runtime_error for any other number of spatial dimensions.
 */
HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                       int num_dim_spatial)
{
    namespace layout = ck::tensor_layout::convolution;

    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::NWK{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::NHWK{});
    }
    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::NDHWK{});
    }

    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
/**
 * @brief Build the host tensor descriptor of a convolution filter tensor.
 *
 * Selects the layout tag KXC, KYXC or KZYXC for 1, 2 or 3 spatial dimensions
 * and forwards it together with @p dims to get_host_tensor_descriptor.
 *
 * @param[in] dims            Tensor dimensions, passed through unchanged.
 * @param[in] num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 *
 * @throws std::runtime_error for any other number of spatial dimensions.
 */
HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                        int num_dim_spatial)
{
    namespace layout = ck::tensor_layout::convolution;

    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::KXC{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::KYXC{});
    }
    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::KZYXC{});
    }

    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
/**
 * @brief Build the host tensor descriptor of a convolution input tensor.
 *
 * Selects the layout tag NWC, NHWC or NDHWC for 1, 2 or 3 spatial dimensions
 * and forwards it together with @p dims to get_host_tensor_descriptor.
 *
 * @param[in] dims            Tensor dimensions, passed through unchanged.
 * @param[in] num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 *
 * @throws std::runtime_error for any other number of spatial dimensions.
 */
HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                      int num_dim_spatial)
{
    namespace layout = ck::tensor_layout::convolution;

    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::NWC{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::NHWC{});
    }
    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, layout::NDHWC{});
    }

    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
} // namespace conv
} // namespace utils
} // namespace ck
/**
 * @brief Write a multi-line, human-readable dump of convolution parameters.
 *
 * The output starts with "ConvParams {", lists one "name: value" pair per
 * line and ends with a closing "}" on its own line. No trailing newline is
 * written. Streaming the per-dimension members relies on an operator<< for
 * their container type being visible at this point.
 *
 * @param[in,out] os Stream to write to.
 * @param[in]     p  Parameters to print.
 *
 * @return @p os, to allow chaining.
 */
std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p)
{
    os << "ConvParams {"
       << "\nnum_dim_spatial: " << p.num_dim_spatial << "\nN: " << p.N << "\nK: " << p.K
       << "\nC: " << p.C << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths
       << "\ninput_spatial_lengths: " << p.input_spatial_lengths
       << "\nconv_filter_strides: " << p.conv_filter_strides
       << "\nconv_filter_dilations: " << p.conv_filter_dilations
       << "\ninput_left_pads: " << p.input_left_pads
       << "\ninput_right_pads: " << p.input_right_pads
       // Close the brace opened by the header so the dump is balanced.
       << "\n}";
    return os;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment