Commit b808713a authored by Chao Liu

prototype dynamic descriptor

parent d3405258
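
The "dynamic descriptor" being prototyped here moves tensor lengths and strides from compile-time template parameters into runtime state, so one compiled kernel can serve shapes known only at launch time. A minimal host-side sketch of the distinction; the names `StaticDescriptor` and `DynamicDescriptor` are illustrative, not the library's actual API:

#include <array>
#include <cstdint>

using index_t = std::int32_t;

// Static flavor: lengths are template parameters, so element counts and
// offsets fold into compile-time constants, but every shape is a new kernel.
template <index_t... Lengths>
struct StaticDescriptor
{
    static constexpr index_t GetElementSize() { return (Lengths * ...); }
};

// Dynamic flavor: lengths and strides are runtime members, so one kernel
// binary can be reused across shapes at the cost of runtime arithmetic.
template <index_t NDim>
struct DynamicDescriptor
{
    std::array<index_t, NDim> lengths_;
    std::array<index_t, NDim> strides_;

    index_t GetElementSize() const
    {
        index_t size = 1;
        for(index_t i = 0; i < NDim; ++i)
            size *= lengths_[i];
        return size;
    }

    index_t CalculateOffset(const std::array<index_t, NDim>& idx) const
    {
        index_t offset = 0;
        for(index_t i = 0; i < NDim; ++i)
            offset += idx[i] * strides_[i];
        return offset;
    }
};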
......@@ -12,73 +12,6 @@ namespace ck {
template <index_t BlockSize>
struct DummyDynamicTransform
{
__device__ void Run_(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads,
index_t k_block_num,
index_t c_block_num,
index_t y_block_num,
index_t x_block_num) const
{
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
p_wei_global[0] = wei_k_c_y_x_global_desc.GetElementSize();
p_wei_global[1] = wei_k_c_y_x_global_desc.GetElementSpace();
const index_t k_block_num_stride = c_block_num * y_block_num * x_block_num;
const index_t c_block_num_stride = y_block_num * x_block_num;
const index_t y_block_num_stride = x_block_num;
index_t tmp = get_block_1d_id();
#if 0
const index_t k_block = tmp / k_block_num_stride;
tmp -= k_block * k_block_num_stride;
const index_t c_block = tmp / c_block_num_stride;
tmp -= c_block * c_block_num_stride;
const index_t y_block = tmp / y_block_num_stride;
tmp -= y_block * y_block_num_stride;
const index_t x_block = tmp;
#else
const index_t k_block = __llvm_amdgcn_readfirstlane_i32(tmp / k_block_num_stride);
tmp -= k_block * k_block_num_stride;
const index_t c_block = __llvm_amdgcn_readfirstlane_i32(tmp / c_block_num_stride);
tmp -= c_block * c_block_num_stride;
const index_t y_block = __llvm_amdgcn_readfirstlane_i32(tmp / y_block_num_stride);
tmp -= y_block * y_block_num_stride;
const index_t x_block = __llvm_amdgcn_readfirstlane_i32(tmp);
#endif
const index_t k_thread = p_in_global[get_thread_local_1d_id()];
const index_t c_thread = p_in_global[get_thread_local_1d_id() + 1];
const index_t y_thread = p_in_global[get_thread_local_1d_id() + 2];
const index_t x_thread = p_in_global[get_thread_local_1d_id() + 3];
p_wei_global[3] = wei_k_c_y_x_global_desc.CalculateOffset(
{k_block + k_thread, c_block + c_thread, y_block + y_thread, x_block + x_thread});
}
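
For reference, the deleted Run_ above decomposes a linear block id into (k, c, y, x) block coordinates by repeated division with precomputed strides; the #else branch additionally wraps each quotient in __llvm_amdgcn_readfirstlane_i32 so the compiler can treat the value as uniform across the wavefront and keep the index arithmetic in scalar registers. A plain-C++ sketch of the same decomposition (hypothetical helper, intrinsics omitted):

#include <array>

using index_t = int;

// Split a linear block id into 4D block coordinates, given the number of
// blocks along the c, y and x dimensions (k is whatever remains on top).
inline std::array<index_t, 4>
decompose_block_id(index_t bid, index_t c_num, index_t y_num, index_t x_num)
{
    const index_t k_stride = c_num * y_num * x_num;
    const index_t c_stride = y_num * x_num;
    const index_t y_stride = x_num;

    const index_t k = bid / k_stride;
    bid -= k * k_stride;
    const index_t c = bid / c_stride;
    bid -= c * c_stride;
    const index_t y = bid / y_stride;
    bid -= y * y_stride;

    return {k, c, y, bid};
}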
__device__ void Run(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
......@@ -88,11 +21,7 @@ struct DummyDynamicTransform
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads,
index_t,
index_t,
index_t,
index_t) const
const Array<index_t, 2> in_right_pads) const
{
#if 1
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
......@@ -189,7 +118,7 @@ struct DummyDynamicTransform
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry + idx_low_diff_const;
#if 0 // positive
#if 1 // positive
bool do_carry = idx_low_tmp >= idx_low_bound;
index_t idx_low_new = do_carry ? idx_low_tmp - idx_low_bound : idx_low_tmp;
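
This hunk flips the carry check to the "positive" path: instead of a modulo, each lower-dimension update adds the step plus the incoming carry, then does a single compare-and-subtract against the dimension bound. That is only valid when the summed step stays below twice the bound, i.e. at most one wrap per digit. A standalone sketch of one such carry step under that assumption:

using index_t = int;

// One digit of a multi-index increment: add step plus incoming carry, wrap
// once on overflow, and report the outgoing carry to the next-higher digit.
// Assumes 0 <= idx_low_old + carry + step < 2 * bound (single-wrap case).
inline index_t carry_step(index_t idx_low_old, index_t step, index_t bound, bool& carry)
{
    const index_t tmp = idx_low_old + (carry ? 1 : 0) + step;
    carry = tmp >= bound;
    return carry ? tmp - bound : tmp;
}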
......@@ -389,23 +318,6 @@ struct DummyDynamicTransform
Y,
X);
#if 0
// Merge(N, Ho, Wo) => GemmN
f_lower_idx_diff_merge(idx_diff[9],
idx_diff[12],
idx_diff[14],
idx_diff[16],
idx[9],
idx[12],
idx[14],
const_tmp[3],
const_tmp[4],
const_tmp[5],
N,
Ho,
Wo);
#endif
// stage 2
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[5], idx_diff[9]);
......@@ -496,6 +408,19 @@ struct DummyDynamicTransform
out_n_k_ho_wo_global_desc.GetElementSpace());
}
}
__device__ void Run_(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
}
};
} // namespace ck
......
......@@ -5,217 +5,18 @@
namespace ck {
struct DynamicPassThrough
class DynamicTransformation
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
index_t low_length_;
__host__ __device__ constexpr DynamicPassThrough(index_t low_length) : low_length_(low_length)
{
}
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ constexpr auto GetUpperLengths() const { return Array<index_t, 1>{low_length_}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
};
template <index_t NDimLow>
struct DynamicMerge
template<index_t N>
class DynamicEmbed : public DynamicTransformation
{
static constexpr index_t ndim_low_ = NDimLow;
static constexpr index_t ndim_up_ = 1;
using LowerIndex = MultiIndex<ndim_low_>;
using UpperIndex = MultiIndex<ndim_up_>;
Array<index_t, NDimLow> low_lengths_;
index_t up_length_;
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
return Number<ndim_low_>{};
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
return Number<ndim_up_>{};
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
return Array<index_t, 1>{up_length_};
}
// emulate constexpr lambda
template <typename PseudoLowStrides>
struct lambda_CalculateLowerIndex
{
index_t& itmp;
LowerIndex& idx_low;
__host__ __device__ explicit constexpr lambda_CalculateLowerIndex(index_t& itmp_,
LowerIndex& idx_low_)
: itmp(itmp_), idx_low(idx_low_)
{
}
template <typename IDim>
__host__ __device__ constexpr void operator()(IDim idim) const
{
constexpr index_t stride = PseudoLowStrides::At(idim);
idx_low(idim) = itmp / stride;
itmp -= idx_low[idim] * stride;
}
};
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low;
index_t itmp = idx_up[0];
constexpr auto pseudo_low_strides =
reverse_inclusive_scan_sequence(
LowerLengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
.PushBack(Number<1>{});
static_for<0, NDimLow - 1, 1>{}(
lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));
idx_low(NDimLow - 1) = itmp / pseudo_low_strides[NDimLow - 1];
return idx_low;
}
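
CalculateLowerIndex recovers the multi-dimensional digits from the single merged index using "pseudo strides": a reverse inclusive scan of the lower lengths with the leading length dropped and a 1 appended, e.g. lengths {N, Ho, Wo} yield strides {Ho*Wo, Wo, 1}. A runtime sketch of the same scan and decomposition (hypothetical helper; the real code does this with compile-time Sequence machinery):

#include <array>
#include <cstddef>

using index_t = int;

// Decompose a merged 1D index into NDim digits given the lower lengths.
// strides[i] is the product of lengths[i+1 .. NDim-1]; the last stride is 1.
template <std::size_t NDim>
std::array<index_t, NDim> merge_to_multi_index(index_t idx_up,
                                               const std::array<index_t, NDim>& lengths)
{
    std::array<index_t, NDim> strides{};
    strides[NDim - 1] = 1;
    for(std::size_t i = NDim - 1; i > 0; --i)
        strides[i - 1] = strides[i] * lengths[i];

    std::array<index_t, NDim> idx_low{};
    for(std::size_t i = 0; i < NDim; ++i)
    {
        idx_low[i] = idx_up / strides[i];
        idx_up -= idx_low[i] * strides[i];
    }
    return idx_low;
}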
// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date.
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by the compiler.
// This function assumes idx_low_old is not out of bounds.
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& idx_low_old)
{
if(idx_up_diff[0] == 0)
{
return make_zero_array<index_t, NDimLow>();
}
else
{
// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
// If idx_up_diff is known at compile-time, the calculation can
// be done at compile-time. However, if idx_up_diff is only known
// at run-time, then the calculation will also be computed at
// run-time, and can be very expensive.
LowerIndex idx_low_diff_tmp = CalculateLowerIndex(idx_up_diff);
// find out the last low dimension that changed
index_t last_changed_low_dim = 0;
static_for<0, NDimLow, 1>{}([&](auto i) {
if(idx_low_diff_tmp[i] != 0)
{
last_changed_low_dim = i;
}
});
LowerIndex idx_low_new = idx_low_old + idx_low_diff_tmp;
if(idx_up_diff[0] > 0)
{
// do carry check on each low dimension in reversed order
// starting from the first digit that changed
// don't check the highest dimension
bool carry = false;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
if(i <= last_changed_low_dim)
{
if(carry)
{
++idx_low_new(i);
}
carry = false;
if(idx_low_new[i] >= LowerLengths::At(i))
{
idx_low_new(i) -= LowerLengths::At(i);
carry = true;
}
}
});
// highest dimension, no out-of-bound check
if(carry)
{
++idx_low_new(0);
}
}
else
{
// do borrow check on each low dimension in reversed order
// starting from the first digit that changed
// don't check the highest dimension
bool borrow = false;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
if(i <= last_changed_low_dim)
{
if(borrow)
{
--idx_low_new(i);
}
borrow = false;
if(idx_low_new[i] < 0)
{
idx_low_new(i) += LowerLengths::At(i);
borrow = true;
}
}
});
// highest dimension, no out-of-bound check
if(borrow)
{
--idx_low_new(0);
}
}
return idx_low_new - idx_low_old;
}
}
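As a concrete instance of the carry pass: with lower lengths {4, 3, 5}, idx_low_old = {1, 2, 4} and idx_up_diff = +1, the raw digit sum gives {1, 2, 5}; the last digit wraps to 0 and carries, the middle digit becomes 3 and wraps to 0 with another carry, and the top digit absorbs it to become 2, so the function returns {2, 0, 0} - {1, 2, 4} = {1, -2, -4}.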
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
};
const Array<index_t, N + 1> coefficients_;
__host__ __device__ constexpr DynamicEmbed(const Array<index_t, N + 1>& coefficients)
: coefficients_(coefficients)
{
}
};
......
......@@ -15,21 +15,6 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}
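
A possible call site for the factory kept above, assuming the lengths type exposes GetSize() as the return statement implies and that the caller derives packed NCHW strides itself (the variable names are illustrative):

// 4D NCHW input descriptor with runtime extents; strides are for a
// packed layout, computed by the caller.
const auto lengths = Array<index_t, 4>{N, C, Hi, Wi};
const auto strides = Array<index_t, 4>{C * Hi * Wi, Hi * Wi, Wi, 1};

const auto in_n_c_hi_wi_global_desc = make_dynamic_native_tensor_descriptor(lengths, strides);
// in_n_c_hi_wi_global_desc.GetElementSize() == N * C * Hi * Wi for the packed case.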
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensions,
typename UpDimensions>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms,
LowDimensions,
UpDimensions)
{
return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
Transforms,
LowDimensions,
UpDimensions>(low_tensor_desc, transforms);
}
} // namespace ck
#endif
......@@ -75,11 +75,7 @@ void device_dummy_dynamic_transform(InDesc,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>,
index_t,
index_t,
index_t,
index_t>,
const Array<index_t, 2>>,
dim3(GridSize),
dim3(BlockSize),
0,
......@@ -93,11 +89,7 @@ void device_dummy_dynamic_transform(InDesc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
10,
10,
10,
10);
in_right_pads);
}
}
......