Commit d3405258 authored by Chao Liu

prototype dynamic descriptor

parent 834eb24c
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
namespace ck {
template <index_t BlockSize>
struct DummyDynamicTransform
{
__device__ void Run_(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads,
index_t k_block_num,
index_t c_block_num,
index_t y_block_num,
index_t x_block_num) const
{
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
p_wei_global[0] = wei_k_c_y_x_global_desc.GetElementSize();
p_wei_global[1] = wei_k_c_y_x_global_desc.GetElementSpace();
const index_t k_block_num_stride = c_block_num * y_block_num * x_block_num;
const index_t c_block_num_stride = y_block_num * x_block_num;
const index_t y_block_num_stride = x_block_num;
index_t tmp = get_block_1d_id();
#if 0
const index_t k_block = tmp / k_block_num_stride;
tmp -= k_block * k_block_num_stride;
const index_t c_block = tmp / c_block_num_stride;
tmp -= c_block * c_block_num_stride;
const index_t y_block = tmp / y_block_num_stride;
tmp -= y_block * y_block_num_stride;
const index_t x_block = tmp;
#else
const index_t k_block = __llvm_amdgcn_readfirstlane_i32(tmp / k_block_num_stride);
tmp -= k_block * k_block_num_stride;
const index_t c_block = __llvm_amdgcn_readfirstlane_i32(tmp / c_block_num_stride);
tmp -= c_block * c_block_num_stride;
const index_t y_block = __llvm_amdgcn_readfirstlane_i32(tmp / y_block_num_stride);
tmp -= y_block * y_block_num_stride;
const index_t x_block = __llvm_amdgcn_readfirstlane_i32(tmp);
#endif
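// readfirstlane presumably makes these block coordinates wave-uniform, so the compiler can
// keep them in scalar registers instead of replicating them in one VGPR per lane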
const index_t k_thread = p_in_global[get_thread_local_1d_id()];
const index_t c_thread = p_in_global[get_thread_local_1d_id() + 1];
const index_t y_thread = p_in_global[get_thread_local_1d_id() + 2];
const index_t x_thread = p_in_global[get_thread_local_1d_id() + 3];
p_wei_global[3] = wei_k_c_y_x_global_desc.CalculateOffset(
{k_block + k_thread, c_block + c_thread, y_block + y_thread, x_block + x_thread});
}
__device__ void Run(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads,
index_t,
index_t,
index_t,
index_t) const
{
#if 1
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
#else
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t Y = 3;
const index_t X = 3;
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
#endif
// define transform
// pass through
auto f_lower_idx_diff_passthrough = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
idx_low_diff = idx_up_diff;
};
// pad
auto f_lower_idx_diff_pad = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
idx_low_diff = idx_up_diff;
};
// embed
auto f_lower_idx_diff_embed = [](index_t& idx_low_diff,
const index_t& idx_up_diff_0,
const index_t& idx_up_diff_1,
const index_t coeff0,
const index_t coeff1) {
idx_low_diff = coeff0 * idx_up_diff_0 + coeff1 * idx_up_diff_1;
};
// unmerge
auto f_lower_idx_diff_unmerge = [](index_t& idx_low_diff,
const index_t& idx_up_diff_0,
const index_t& idx_up_diff_1,
const index_t up_length_1) {
idx_low_diff = up_length_1 * idx_up_diff_0 + idx_up_diff_1;
};
// merge
auto f_lower_idx_diff_merge_v1 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry + idx_low_diff_const;
#if 0 // positive
bool do_carry = idx_low_tmp >= idx_low_bound;
index_t idx_low_new = do_carry ? idx_low_tmp - idx_low_bound : idx_low_tmp;
carry = do_carry ? 1 : 0;
#else // negative
bool do_borrow = idx_low_tmp < 0;
index_t idx_low_new = do_borrow ? idx_low_tmp + idx_low_bound : idx_low_tmp;
carry = do_borrow ? -1 : 0;
#endif
idx_low_diff = idx_low_new - idx_low_old;
};
index_t carry = 0;
f_carry_arithmetic(
idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
f_carry_arithmetic(
idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 + carry;
};
auto f_lower_idx_diff_merge_v2 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry;
index_t idx_low_bound_minus_idx_low_diff_const = idx_low_bound - idx_low_diff_const;
#if 1 // positive
bool do_carry = idx_low_tmp >= idx_low_bound_minus_idx_low_diff_const;
idx_low_diff =
do_carry ? -idx_low_bound_minus_idx_low_diff_const : idx_low_diff_const;
idx_low_diff += carry;
carry = do_carry ? 1 : 0;
#else // negative
bool do_borrow = idx_low_tmp < -idx_low_diff_const;
idx_low_diff = do_borrow ? idx_low_diff_const + idx_low_bound : idx_low_diff_const;
idx_low_diff -= carry;
carry = do_borrow ? 1 : carry;
#endif
};
index_t carry = 0;
f_carry_arithmetic(
idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
f_carry_arithmetic(
idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 + carry;
};
auto f_lower_idx_diff_merge_v3 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& negative_carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t neg_idx_low_tmp = negative_carry - idx_low_old;
index_t idx_low_diff_const_minus_idx_low_bound = idx_low_diff_const - idx_low_bound;
#if 1
bool do_carry = neg_idx_low_tmp <= idx_low_diff_const_minus_idx_low_bound;
idx_low_diff =
do_carry ? idx_low_diff_const_minus_idx_low_bound : idx_low_diff_const;
idx_low_diff -= negative_carry;
negative_carry = do_carry ? -1 : 0;
#else
bool do_borrow = neg_idx_low_tmp > idx_low_diff_const;
idx_low_diff = do_borrow ? idx_low_diff_const + idx_low_bound : idx_low_diff_const;
idx_low_diff -= negative_carry;
negative_carry = do_borrow ? 1 : negative_carry;
#endif
};
index_t negative_carry = 0;
f_carry_arithmetic(idx_low_diff_2,
negative_carry,
idx_low_old_2,
idx_low_diff_const_2,
idx_low_bound_2);
f_carry_arithmetic(idx_low_diff_1,
negative_carry,
idx_low_old_1,
idx_low_diff_const_1,
idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 - negative_carry;
};
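// Worked example (illustrative only, values assumed): merging (C, Y, X) = (8, 3, 3) into
// GemmKTotal and stepping GemmKTotal by a constant +8 gives per-dimension constants
// (idx_low_diff_const_0/1/2) = (0, 2, 2), since 8 = 0*9 + 2*3 + 2. Starting from
// (c, y, x) = (0, 1, 2):
//   x: 2 + 2 = 4 >= 3      -> x becomes 1, carry = 1
//   y: 1 + 2 + 1 = 4 >= 3  -> y becomes 1, carry = 1
//   c: 0 + 0 + 1 = 1       -> c becomes 1
// so the lower index diff is (+1, 0, -1); v1/v2/v3 compute the same result with different
// arithmetic to help the compiler generate better code.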
index_t idx[20];
index_t idx_diff[20];
index_t const_tmp[6];
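// Apparent layout of idx[]/idx_diff[] (inferred from the transform calls below):
//   idx[0]      : linear offset into the input tensor
//   idx[1..4]   : N, C, Hi, Wi
//   idx[5..8]   : N, C, Hip, Wip (padded input)
//   idx[9..14]  : N, C, Y, Ho, X, Wo
//   idx[15..16] : GemmKTotal = merge(C, Y, X), GemmN = merge(N, Ho, Wo)
//   idx[17..19] : GemmK, GemmKPack, GemmN
// const_tmp[0..2] / const_tmp[3..5] appear to hold the precomputed per-dimension lower-index
// diffs fed to the two merge transforms.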
// populate const
const index_t GemmKPack = p_wei_global[0];
#if 1
for(index_t i = 0; i < 6; ++i)
{
const_tmp[i] = p_wei_global[i + 1];
}
#else
const_tmp[0] = 0;
const_tmp[1] = 2;
const_tmp[2] = 2;
#endif
// initialize idx
for(index_t i = 0; i < 20; ++i)
{
idx[i] = p_wei_global[get_thread_local_1d_id() + i];
}
// offset
idx[0] = idx[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
idx[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
idx[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
idx[4] * in_n_c_hi_wi_global_desc.GetStride(3);
// start lowering diff
#pragma unroll 1
for(index_t i = 0; i < 100; ++i)
{
for(index_t j = 0; j < 20; ++j)
{
idx_diff[j] = 0;
}
idx_diff[17] = 8;
// stage 4
// Unmerge(GemmKTotal) => GemmK, GemmKPack
f_lower_idx_diff_unmerge(idx_diff[15], idx_diff[17], idx_diff[18], GemmKPack);
// PassThrough GemmN => GemmN
f_lower_idx_diff_passthrough(idx_diff[16], idx_diff[19]);
// stage 3
// Merge(C, Y, X) => GemmKTotal
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp[0],
const_tmp[1],
const_tmp[2],
C,
Y,
X);
#if 0
// Merge(N, Ho, Wo) => GemmN
f_lower_idx_diff_merge(idx_diff[9],
idx_diff[12],
idx_diff[14],
idx_diff[16],
idx[9],
idx[12],
idx[14],
const_tmp[3],
const_tmp[4],
const_tmp[5],
N,
Ho,
Wo);
#endif
// stage 2
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[5], idx_diff[9]);
// PassThrough(C) => C
f_lower_idx_diff_passthrough(idx_diff[6], idx_diff[10]);
// Embed(Hip) => Y, Ho
f_lower_idx_diff_embed(
idx_diff[7], idx_diff[11], idx_diff[12], ConvDilationH, ConvStrideH);
// Embed(Wip) => X, Wo
f_lower_idx_diff_embed(
idx_diff[8], idx_diff[13], idx_diff[14], ConvDilationW, ConvStrideW);
// stage 1
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[1], idx_diff[5]);
// PassThrough(C) => C
f_lower_idx_diff_passthrough(idx_diff[2], idx_diff[6]);
// Pad(Hi) => Hip
f_lower_idx_diff_pad(idx_diff[3], idx_diff[7]);
// Pad(Wi) => Wip
f_lower_idx_diff_pad(idx_diff[4], idx_diff[8]);
// stage 0
// offset_diff
idx_diff[0] = idx_diff[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
idx_diff[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
idx_diff[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
idx_diff[4] * in_n_c_hi_wi_global_desc.GetStride(3);
#if 0
// update idx
for(index_t i = 0; i < 20; ++ i)
{
idx[i] += idx_diff[i];
}
// padding check
bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;
#elif 0 // no pad
// offset
idx[0] += idx_diff[0];
// C, Y, X
idx[10] += idx_diff[10];
idx[11] += idx_diff[11];
idx[13] += idx_diff[13];
// padding check
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
// C, Y, X
idx[10] += idx_diff[10];
idx[11] += idx_diff[11];
idx[13] += idx_diff[13];
// Hi, Wi
idx[3] += idx_diff[3];
idx[4] += idx_diff[4];
// padding check
bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;
#endif
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
idx[0],
is_in_bound,
out_n_k_ho_wo_global_desc.GetElementSpace());
}
}
};
} // namespace ck
#endif
@@ -96,13 +96,10 @@ struct DummyStaticTransform
 auto coord = typename TensorCoordinate<decltype(in_gemmk_gemmn_global_desc)>::type(k0, n0);
-if(get_block_1d_id() < coord.GetOffset())
+#pragma unroll 1
+for(index_t k = 0; k < 100; ++k)
 {
-for(index_t k = 0; k < 1; ++k)
-{
-for(index_t n = 0; n < 4; ++n)
-{
-auto tmp = coord + Array<index_t, 2>{k, n};
+coord += Array<index_t, 2>{8, 0};
 Float value = 1;
 transfer_data<Float,
@@ -116,13 +113,11 @@ struct DummyStaticTransform
 true,
 1,
 p_in_global,
-tmp.GetOffset(),
-tmp.IsOffsetValidAssumingUpperIndexIsValid(),
+coord.GetOffset(),
+coord.IsOffsetValidAssumingUpperIndexIsValid(),
 in_gemmk_gemmn_global_desc.GetElementSpace());
 }
 }
-}
-}
 };
 } // namespace ck
#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
namespace ck {
struct DynamicPassThrough
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
index_t low_length_;
__host__ __device__ constexpr DynamicPassThrough(index_t low_length) : low_length_(low_length)
{
}
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ constexpr auto GetUpperLengths() const { return Array<index_t, 1>{low_length_}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
};
template <index_t NDimLow>
struct DynamicMerge
{
static constexpr index_t ndim_low_ = NDimLow;
static constexpr index_t ndim_up_  = 1;
using LowerIndex = MultiIndex<ndim_low_>;
using UpperIndex = MultiIndex<ndim_up_>;
Array<index_t, NDimLow> low_lengths_;
index_t up_length_;
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
return Number<ndim_low_>{};
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
return Number<ndim_up_>{};
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
return Array<index_t, 1>{up_length_};
}
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
// pseudo strides of the lower dimensions, computed at run-time from low_lengths_
Array<index_t, NDimLow> pseudo_low_strides;
pseudo_low_strides(NDimLow - 1) = 1;
for(index_t i = NDimLow - 1; i > 0; --i)
{
pseudo_low_strides(i - 1) = pseudo_low_strides[i] * low_lengths_[i];
}
index_t itmp = idx_up[0];
for(index_t i = 0; i < NDimLow - 1; ++i)
{
idx_low(i) = itmp / pseudo_low_strides[i];
itmp -= idx_low[i] * pseudo_low_strides[i];
}
idx_low(NDimLow - 1) = itmp;
return idx_low;
}
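// Example (illustrative): with low_lengths_ = {2, 3, 4} the pseudo strides are {12, 4, 1},
// so an upper index of 17 maps to the lower index (17/12, (17-12)/4, 17-12-4) = (1, 1, 1).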
// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
__host__ __device__ constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& idx_low_old) const
{
if(idx_up_diff[0] == 0)
{
return make_zero_array<index_t, NDimLow>();
}
else
{
// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
// If idx_up_diff is known at compile-time, the calculation can
// be done at compile-time. However, if idx_up_diff is only known
// at run-time, then the calculation will also be computed at
// run-time, and can be very expensive.
LowerIndex idx_low_diff_tmp = CalculateLowerIndex(idx_up_diff);
// find out the last low dimension that changed
index_t last_changed_low_dim = 0;
static_for<0, NDimLow, 1>{}([&](auto i) {
if(idx_low_diff_tmp[i] != 0)
{
last_changed_low_dim = i;
}
});
LowerIndex idx_low_new = idx_low_old + idx_low_diff_tmp;
if(idx_up_diff[0] > 0)
{
// do carry check on each low dimension in reversed order
// starting from the first digit that changed
// don't check the highest dimension
bool carry = false;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
if(i <= last_changed_low_dim)
{
if(carry)
{
++idx_low_new(i);
}
carry = false;
if(idx_low_new[i] >= low_lengths_[i])
{
idx_low_new(i) -= low_lengths_[i];
carry = true;
}
}
});
// highest dimension, no out-of-bound check
if(carry)
{
++idx_low_new(0);
}
}
else
{
// do borrow check on each low dimension in reversed order
// starting from the first digit that changed
// don't check the highest dimension
bool borrow = false;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
if(i <= last_changed_low_dim)
{
if(borrow)
{
--idx_low_new(i);
}
borrow = false;
if(idx_low_new[i] < 0)
{
idx_low_new(i) += low_lengths_[i];
borrow = true;
}
}
});
// highest dimension, no out-of-bound check
if(borrow)
{
--idx_low_new(0);
}
}
return idx_low_new - idx_low_old;
}
}
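// Example (illustrative): with low_lengths_ = {2, 3, 4}, idx_low_old = (0, 2, 3) and an upper
// diff of +1, CalculateLowerIndex(+1) gives (0, 0, 1); adding it makes the last dimension 4,
// which overflows its length, so the carry loop turns the result into (1, 0, 0) and the
// returned diff is (+1, -2, -3).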
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
};
} // namespace ck
#endif
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
namespace ck {
template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
using Index = MultiIndex<NDim>;
Array<index_t, NDim> lengths_;
Array<index_t, NDim> strides_;
index_t element_size_;
index_t element_space_;
template <typename Lengths, typename Strides>
__host__ __device__ constexpr DynamicNativeTensorDescriptor(const Lengths& lengths,
const Strides& strides)
: lengths_(lengths), strides_(strides)
{
element_size_ = 1;
for(index_t i = 0; i < NDim; ++i)
{
element_size_ *= lengths_[i];
}
element_space_ = 1;
for(index_t i = 0; i < NDim; ++i)
{
element_space_ += (lengths_[i] - 1) * strides_[i];
}
}
__host__ __device__ static constexpr auto GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLength(const index_t& i) const { return lengths_[i]; }
__host__ __device__ constexpr auto GetStride(const index_t& i) const { return strides_[i]; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
__host__ __device__ constexpr auto GetStrides() const { return strides_; }
__host__ __device__ constexpr auto GetElementSize() const { return element_size_; }
__host__ __device__ constexpr auto GetElementSpace() const { return element_space_; }
__host__ __device__ constexpr auto CalculateOffset(const Index& idx) const
{
index_t offset = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset += idx[i] * strides_[i];
}
return offset;
}
__host__ __device__ constexpr auto CalculateOffsetDiff(const Index& idx_diff) const
{
index_t offset_diff = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset_diff += idx_diff[i] * strides_[i];
}
return offset_diff;
}
__host__ __device__ constexpr bool IsUpperIndexValid(const Index& idx) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
flag = flag && idx[i] >= 0 && idx[i] < lengths_[i];
}
return flag;
}
};
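// Usage sketch (illustrative values): an NCHW descriptor with lengths {N, C, H, W} and packed
// strides {C*H*W, H*W, W, 1}, e.g.
//
//   const auto desc = DynamicNativeTensorDescriptor<4>(Array<index_t, 4>{1, 2, 3, 3},
//                                                      Array<index_t, 4>{18, 9, 3, 1});
//   // desc.GetElementSize() == 18, desc.GetElementSpace() == 18
//   // desc.CalculateOffset({0, 1, 2, 1}) == 0*18 + 1*9 + 2*3 + 1*1 == 16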
#if 0
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor,
typename Transforms, // Tuple<DynamicMultIndexTransforms,...>
typename LowDimensions, // Tuple<Sequence<...>,...>
typename UpDimensions> // Tuple<Sequence<...>,...>
struct DynamicTransformedTensorDescriptor
{
using Type = DynamicTransformedTensorDescriptor;
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
// Here, we assume all lower-dimensions are active
// TODO: sanity-check all lower-dimension are indeed active
using duplicated_low_active_dims =
decltype(unpack(lambda_merge_sequences{}, LowDimensions{}));
using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return low_active_dims::Size();
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
using duplicated_up_active_dims =
decltype(unpack(lambda_merge_sequences{}, UpDimensions{}));
using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return up_active_dims::Size();
}
static constexpr index_t ndim_up_ = GetNumOfUpperDimension();
static constexpr index_t ndim_low_ = GetNumOfLowerDimension();
static constexpr index_t num_transform_ = Transforms::Size();
using UpperIndex = MultiIndex<ndim_up_>;
using LowerIndex = MultiIndex<ndim_low_>;
const LowTensorDescriptor low_tensor_desc_;
const Transforms transforms_;
const LowDimensions low_dims_;
const UpDimensions up_dims_;
__host__ __device__ constexpr DynamicTransformedTensorDescriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms)
: low_tensor_desc_(low_tensor_desc),
transforms_(transforms)
{
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const
{
return low_tensor_desc_;
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr auto GetLength(index_t i) const
{
return GetLengths()[i];
}
__host__ __device__ constexpr auto GetElementSize() const
{
index_t element_size = 1;
for(index_t i = 0; i < ndim_up_; ++i)
{
element_size *= GetLength(i);
}
return element_size;
}
__host__ __device__ constexpr auto GetElementSpace() const
{
return low_tensor_desc_.GetElementSpace();
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
static_for<0, num_transform_, 1>{}([&](auto itran) {
const auto tran = transforms_.At(itran);
const auto idx_up_part = pick_array_element(idx_up, UpDimensions{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensions{}.At(itran));
// this assumes each lower (single) index is only associated with one transformation,
// which is required for index transformation, and has been checked during construction
// of DynamicTransformedTensorDescriptor
idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
});
return idx_low;
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ constexpr LowerIndex CalculateLowerIndexDiff(
const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old) const
{
LowerIndex idx_low_diff;
static_for<0, num_transform_, 1>{}([&](auto itran) {
const auto tran = transforms_.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensions{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensions{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensions{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensions{}.At(itran));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during construction
// of DynamicTransformedTensorDescriptor
idx_low_diff_part = tran.CalculateLowerIndexDiff(
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
});
return idx_low_diff;
}
__host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
{
return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
}
};
#endif
} // namespace ck
#endif
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
namespace ck {
template <typename Lengths, typename Strides>
__host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const Lengths& lengths,
const Strides& strides)
{
static_assert(Lengths::GetSize() == Strides::GetSize(), "wrong! Size not the same");
return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensions,
typename UpDimensions>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms,
LowDimensions,
UpDimensions)
{
return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
Transforms,
LowDimensions,
UpDimensions>(low_tensor_desc, transforms);
}
} // namespace ck
#endif
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#include "float_type.hpp"
namespace ck {
__device__ int32_t __llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");
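// presumably returns the value held by the first active lane of the wavefront, so the result
// is wave-uniform and can be kept in a scalar register, e.g.
//   index_t k_block = __llvm_amdgcn_readfirstlane_i32(tmp / k_block_num_stride);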
} // namespace ck
#endif
@@ -20,6 +20,7 @@
 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
+#include "amd_llvm_intrinsic.hpp"
 #endif
 #if CK_USE_AMD_XDLOPS
@@ -172,7 +172,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 0
+#elif 1
 // cdata = 64, BlockSize = 256, 128x128x16
 constexpr index_t BlockSize = 256;
@@ -290,7 +290,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 1
+#elif 0
 // cdata = 64, BlockSize = 128, 64x128x8
 constexpr index_t BlockSize = 128;
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_dynamic_transform(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
const auto in_nchw_desc = make_dynamic_native_tensor_descriptor(to_array(InDesc::GetLengths()),
to_array(InDesc::GetStrides()));
const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor(
to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));
const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor(
to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));
const auto conv_strides = to_array(ConvStrides{});
const auto conv_dilations = to_array(ConvDilations{});
const auto in_left_pads = to_array(InLeftPads{});
const auto in_right_pads = to_array(InRightPads{});
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t BlockSize = 256;
constexpr index_t GridSize = 1;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
using dummy_transform = DummyDynamicTransform<BlockSize>;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<dummy_transform,
index_t* const,
index_t* const,
float* const,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>,
index_t,
index_t,
index_t,
index_t>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<index_t*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
10,
10,
10,
10);
}
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
@@ -12,7 +12,7 @@ template <class T,
 class ConvDilations,
 class InLeftPads,
 class InRightPads>
-void device_dummy_transform(InDesc,
+void device_dummy_static_transform(InDesc,
 const Tensor<T>& in_nchw,
 WeiDesc,
 const Tensor<T>& wei_kcyx,
@@ -14,7 +14,8 @@
 #include "device_tensor.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "device_dummy_transform.hpp"
+#include "device_dummy_static_transform.hpp"
+#include "device_dummy_dynamic_transform.hpp"
 int main(int argc, char* argv[])
 {
@@ -200,7 +201,7 @@ int main(int argc, char* argv[])
 using LeftPads = Sequence<0, 0>;
 using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
 // 3x3, 35x35, stride 2
 constexpr index_t N = 128;
 constexpr index_t C = 288;
@@ -572,8 +573,20 @@ int main(int argc, char* argv[])
 LeftPads{},
 RightPads{},
 nrepeat);
+#elif 0
+device_dummy_static_transform(in_nchw_desc,
+in_nchw,
+wei_kcyx_desc,
+wei_kcyx,
+out_nkhw_desc,
+out_nkhw_device,
+ConvStrides{},
+ConvDilations{},
+LeftPads{},
+RightPads{},
+nrepeat);
 #elif 1
-device_dummy_transform(in_nchw_desc,
+device_dummy_dynamic_transform(in_nchw_desc,
 in_nchw,
 wei_kcyx_desc,
 wei_kcyx,