Commit 4388f572 authored by Chao Liu

prototyping dynamic tensor descriptor

parent b2098e70
@@ -12,16 +12,16 @@ namespace ck {
template <index_t BlockSize>
struct DummyDynamicTransform
{
__device__ void Run_v1(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
#if 1
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
@@ -48,8 +48,8 @@ struct DummyDynamicTransform
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
#else
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t Y = 3;
const index_t X = 3;
@@ -234,7 +234,7 @@ struct DummyDynamicTransform
idx_low_diff -= negative_carry;
negative_carry = do_borrow ? 1 : negative_carry;
#endif
};
@@ -267,9 +267,9 @@ struct DummyDynamicTransform
const_tmp[i] = p_wei_global[i + 1];
}
#else
const_tmp[0] = 0;
const_tmp[1] = 2;
const_tmp[2] = 2;
#endif
// initialize idx
@@ -302,8 +302,9 @@ struct DummyDynamicTransform
// PassThrough GemmN => GemmN
f_lower_idx_diff_passthrough(idx_diff[16], idx_diff[19]);
// stage 3
// Merge(C, Y, X) => GemmKTotal
#if 0
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
@@ -317,6 +318,47 @@ struct DummyDynamicTransform
C,
Y,
X);
#elif 0
index_t tmp = idx_diff[15];
const index_t const_tmp_0 = tmp / (Y * X);
tmp -= const_tmp_0 * (Y * X);
const index_t const_tmp_1 = tmp / X;
const index_t const_tmp_2 = tmp - const_tmp_1 * X;
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp_0,
const_tmp_1,
const_tmp_2,
C,
Y,
X);
#elif 1
index_t tmp = idx_diff[15];
const index_t const_tmp_0 = __llvm_amdgcn_readfirstlane_i32(tmp / (Y * X));
tmp -= const_tmp_0 * (Y * X);
const index_t const_tmp_1 = __llvm_amdgcn_readfirstlane_i32(tmp / X);
const index_t const_tmp_2 = __llvm_amdgcn_readfirstlane_i32(tmp - const_tmp_1 * X);
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp_0,
const_tmp_1,
const_tmp_2,
C,
Y,
X);
#endif
// stage 2
// PassThrough(N) => N
@@ -372,7 +414,7 @@ struct DummyDynamicTransform
idx[13] += idx_diff[13];
// padding check
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
@@ -409,7 +451,39 @@ struct DummyDynamicTransform
}
}
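Note on the `#elif 1` branch in Run_v1 above: `__llvm_amdgcn_readfirstlane_i32` (a CK-internal wrapper, presumably over clang's `__builtin_amdgcn_readfirstlane`) broadcasts lane 0's value to the whole wavefront. When `idx_diff[15]` is wave-uniform, this lets the compiler keep the quotient/remainder of the merge decomposition in scalar registers instead of recomputing them per lane. A minimal sketch of the pattern, under those assumptions (`uniform_div` is illustrative, not part of the commit):

__device__ inline index_t uniform_div(index_t numerator, index_t denominator)
{
    // integer division is expensive on GPU; if the operands are wave-uniform,
    // do it once and broadcast the result into a scalar register
    return __llvm_amdgcn_readfirstlane_i32(numerator / denominator);
}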
__device__ void Run_v2(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
Index idx_up;
idx_up(0) = in_n_c_hi_wi_global_desc.GetLength(0);
idx_up(1) = in_n_c_hi_wi_global_desc.GetLength(1);
idx_up(2) = in_n_c_hi_wi_global_desc.GetLength(2);
idx_up(3) = in_n_c_hi_wi_global_desc.GetLength(3);
#if 0
constexpr auto trans = GetTransforms();
auto idx_low = trans[0]->CalculateLowerIndex(idx_up);
#elif 1
constexpr DynamicCoordinateTransform* tran = &embed;
auto idx_low = tran->CalculateLowerIndex(idx_up);
#endif
p_out_global[get_thread_local_1d_id()] = idx_low[0];
}
__device__ void Run(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
@@ -420,6 +494,16 @@ struct DummyDynamicTransform
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
Run_v2(p_wei_global,
p_in_global,
p_out_global,
wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
}
};
...
@@ -5,18 +5,452 @@
namespace ck {
struct DynamicPassThrough
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t low_length_;
__host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
: low_length_(low_length)
{
}
__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex() { return UpperIndex({low_length_}); }
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
template <bool SkipIsValidCheck = false>
struct DynamicLeftPad
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t low_length_;
const index_t left_pad_;
__host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length, const index_t& left_pad)
: low_length_{low_length}, left_pad_{left_pad}
{
}
__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex()
{
return UpperIndex({low_length_ + left_pad_});
}
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
return LowerIndex({idx_up[0] - left_pad_});
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return SkipIsValidCheck;
}
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
return SkipIsValidCheck || (idx_up[0] >= left_pad_);
}
};
template <bool SkipIsValidCheck = false>
struct DynamicRightPad
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t low_length_;
const index_t right_pad_;
__host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length, const index_t& right_pad)
: low_length_{low_length}, right_pad_{right_pad}
{
}
__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex()
{
return UpperIndex({low_length_ + right_pad_});
}
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return SkipIsValidCheck;
}
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
return SkipIsValidCheck || (idx_up[0] < low_length_);
}
};
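For reference, the two pad transforms only shift and bounds-check a single index: LeftPad maps idx_up to idx_up - left_pad_ and marks indices inside the pad region invalid; RightPad keeps the index and invalidates anything at or past low_length_. A host-side sketch of that math (plain ints stand in for MultiIndex<1>; these are illustrations, not the CK types):

#include <cassert>

struct LeftPadRef
{
    int low_length, left_pad;
    int lower(int idx_up) const { return idx_up - left_pad; }
    bool valid(int idx_up) const { return idx_up >= left_pad; }
    int up_length() const { return low_length + left_pad; }
};

struct RightPadRef
{
    int low_length, right_pad;
    int lower(int idx_up) const { return idx_up; }
    bool valid(int idx_up) const { return idx_up < low_length; }
    int up_length() const { return low_length + right_pad; }
};

int main()
{
    LeftPadRef lp{8, 2}; // lower length 8, left pad 2 -> upper length 10
    assert(lp.up_length() == 10);
    assert(!lp.valid(1) && lp.valid(2));
    assert(lp.lower(2) == 0); // first valid upper index maps to lower 0
    RightPadRef rp{8, 2};
    assert(rp.valid(7) && !rp.valid(8)); // anything at or past low_length is pad
    return 0;
}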
// idx_low = coefficients[0, ..., NDimUp-1] * idx_up[0, ..., NDimUp-1] + coefficients[NDimUp]
template <index_t NDimUp>
struct DynamicEmbed
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const index_t low_length_;
const UpperIndex up_lengths_;
const Array<index_t, NDimUp + 1> coefficients_;
__host__ __device__ explicit constexpr DynamicEmbed(const index_t& low_length,
const UpperIndex& up_lengths,
const Array<index_t, NDimUp + 1>& coefficients)
: low_length_(low_length), up_lengths_(up_lengths), coefficients_(coefficients)
{
static_assert(up_lengths.GetSize() == NDimUp && coefficients.GetSize() == NDimUp + 1,
"wrong! # of dimensions not consistent");
}
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex() const { return up_lengths_; }
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
index_t idx_low = coefficients_[NDimUp];
for(index_t i = 0; i < NDimUp; ++i)
{
idx_low += idx_up[i] * coefficients_[i];
}
return LowerIndex({idx_low});
}
__host__ __device__ constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */) const
{
index_t idx_low_diff = 0;
for(index_t i = 0; i < NDimUp; ++i)
{
idx_low_diff += idx_up_diff[i] * coefficients_[i];
}
return LowerIndex({idx_low_diff});
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
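The Embed formula above is a dot product with the coefficient vector plus a trailing constant term. A standalone host-side sketch (std::array stands in for the CK containers; names are illustrative):

#include <array>
#include <cassert>

// idx_low = sum_i coefficients[i] * idx_up[i] + coefficients[NDimUp]
template <int NDimUp>
int embed_lower_index(const std::array<int, NDimUp>& idx_up,
                      const std::array<int, NDimUp + 1>& coefficients)
{
    int idx_low = coefficients[NDimUp]; // constant offset term
    for(int i = 0; i < NDimUp; ++i)
        idx_low += idx_up[i] * coefficients[i];
    return idx_low;
}

int main()
{
    // a 3x4 row-major view with base offset 5: {row stride, col stride, offset}
    std::array<int, 3> coeff{4, 1, 5};
    assert(embed_lower_index<2>({0, 0}, coeff) == 5);
    assert(embed_lower_index<2>({2, 3}, coeff) == 2 * 4 + 3 * 1 + 5);
    return 0;
}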
template <index_t NDimLow>
struct DynamicMerge
{
using LowerIndex = MultiIndex<NDimLow>;
using UpperIndex = MultiIndex<1>;
const LowerIndex low_lengths_;
const LowerIndex low_lengths_scan_;
const index_t up_length_;
__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_(low_lengths),
low_lengths_scan_(
reverse_exclusive_scan_on_array(low_lengths, math::multiplies<index_t>{}, index_t(1))),
up_length_(accumulate_on_array(low_lengths, math::multiplies<index_t>{}, index_t(1)))
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex() const
{
return UpperIndex({up_length_});
}
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
index_t itmp = idx_up[0];
#pragma unroll
for(index_t i = 0; i < NDimLow - 1; ++i)
{
idx_low(i) = itmp / low_lengths_scan_[i];
itmp -= idx_low[i] * low_lengths_scan_[i];
}
idx_low(NDimLow - 1) = itmp;
return idx_low;
}
// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date.
// If idx_up_diff is known at compile time, many of these calculations can be
// optimized away by the compiler.
// This function assumes idx_low_old is not out-of-bound.
__host__ __device__ constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& idx_low_old) const
{
LowerIndex idx_low_diff;
// CalculateLowerIndex(idx_up_diff) contains multiple integer divisions.
// 1) If idx_up_diff is known at compile time, then idx_low_diff_const
// can be calculated at compile time.
// 2) If idx_up_diff is not known at compile time, but its value
// doesn't change during the whole kernel execution, then idx_low_diff_const also
// doesn't change during the whole kernel execution. The compiler-generated ISA should
// calculate idx_low_diff_const only once and keep it for the whole kernel execution.
// If neither 1) nor 2) is satisfied, the calculation is done at run time
// each time this function is called, and can be very expensive.
LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
#pragma unroll
for(index_t i = NDimLow - 1; i > 0; --i)
{
// this value should be saved (and reused) as well
const index_t idx_low_length_minus_idx_low_diff_const =
low_lengths_[i] - idx_low_diff_const[i];
#if 0
const index_t idx_low_length_plus_idx_low_diff_const =
low_lengths_[i] + idx_low_diff_const[i];
#endif
const index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const;
#if 0
bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
#endif
idx_low_diff(i) =
do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const[i];
#if 0
idx_low_diff(i) =
do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
#endif
idx_low_diff(i) += carry;
carry = do_carry ? 1 : 0;
#if 0
carry = do_borrow ? -1 : carry;
#endif
}
idx_low_diff(0) = idx_low_diff_const[0] + carry;
return idx_low_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
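The carry update above is worth a worked example: the divisions run once on the step diff, and each subsequent move costs only compares and adds. A host-side reference for a fixed 3-d merge (std::array stands in for MultiIndex; this is an illustration, not the CK API):

#include <array>
#include <cassert>

// lengths = {C, Y, X}; diff_const is CalculateLowerIndex(idx_up_diff),
// computed once. Each step then needs no divisions.
std::array<int, 3> merge_lower_diff(const std::array<int, 3>& lengths,
                                    const std::array<int, 3>& idx_low_old,
                                    const std::array<int, 3>& diff_const)
{
    std::array<int, 3> diff{};
    int carry = 0;
    for(int i = 2; i > 0; --i)
    {
        const int head_room = lengths[i] - diff_const[i];
        const bool do_carry = idx_low_old[i] + carry >= head_room;
        diff[i] = (do_carry ? -head_room : diff_const[i]) + carry;
        carry = do_carry ? 1 : 0;
    }
    diff[0] = diff_const[0] + carry;
    return diff;
}

int main()
{
    // Merge(C=4, Y=3, X=3): moving by +1 in the merged dimension from
    // lower index (0, 2, 2) must wrap X and Y and carry into C.
    auto d = merge_lower_diff({4, 3, 3}, {0, 2, 2}, {0, 0, 1});
    assert(d[2] == -2 && d[1] == -2 && d[0] == 1); // (0,2,2) -> (1,0,0)
    return 0;
}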
template <index_t NDimUp>
struct DynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
__host__ __device__ explicit constexpr DynamicUnMerge(const UpperIndex& up_lengths)
: up_lengths_(up_lengths),
up_lengths_scan_(
reverse_exclusive_scan_on_array(up_lengths, math::multiplies<index_t>{}, index_t(1)))
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr auto GetUpperIndex() const { return up_lengths_; }
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
index_t idx_low = idx_up[NDimUp - 1];
#pragma unroll
for(index_t i = 0; i < NDimUp - 1; ++i)
{
idx_low += idx_up[i] * up_lengths_scan_[i];
}
return LowerIndex{idx_low};
}
__host__ __device__ constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */) const
{
return CalculateLowerIndex(idx_up_diff);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
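UnMerge is the linearization inverse of Merge: with the reverse-exclusive-scan strides it is plain row-major indexing. A standalone sketch for the 3-d case (illustrative names; not the CK API):

#include <array>
#include <cassert>

// with up lengths {A, B, C}, the scan strides are {B*C, C, 1} and
//   idx_low = idx_up[0]*B*C + idx_up[1]*C + idx_up[2]
int unmerge_lower_index(const std::array<int, 3>& idx_up,
                        const std::array<int, 3>& up_lengths)
{
    const std::array<int, 3> scan{up_lengths[1] * up_lengths[2], up_lengths[2], 1};
    int idx_low = idx_up[2]; // last dimension has stride 1
    for(int i = 0; i < 2; ++i)
        idx_low += idx_up[i] * scan[i];
    return idx_low;
}

int main()
{
    // lengths {4, 3, 3}: (1, 0, 0) linearizes to 9, (1, 2, 2) to 17
    assert(unmerge_lower_index({1, 0, 0}, {4, 3, 3}) == 9);
    assert(unmerge_lower_index({1, 2, 2}, {4, 3, 3}) == 1 * 9 + 2 * 3 + 2);
    return 0;
}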
struct DynamicFreeze
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<0>;
const index_t low_idx_;
const index_t low_length_;
__host__ __device__ explicit constexpr DynamicFreeze(const index_t& low_idx, const index_t& low_length)
: low_idx_(low_idx), low_length_(low_length)
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; }
__host__ __device__ static constexpr auto GetUpperIndex() { return UpperIndex(); }
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& /* idx_up */) const
{
return LowerIndex({low_idx_});
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& /* idx_up_diff */,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return LowerIndex({0});
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
}; };
...
@@ -2,236 +2,13 @@
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
#include "dynamic_multi_index_transform.hpp"
namespace ck {
template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
using Index = MultiIndex<NDim>;
Array<index_t, NDim> lengths_;
Array<index_t, NDim> strides_;
index_t element_size_;
index_t element_space_;
template <typename Lengths, typename Strides>
__host__ __device__ constexpr DynamicNativeTensorDescriptor(const Lengths& lengths,
const Strides& strides)
: lengths_(lengths), strides_(strides)
{
element_size_ = 1;
for(index_t i = 0; i < NDim; ++i)
{
element_size_ *= lengths_[i];
}
element_space_ = 1;
for(index_t i = 0; i < NDim; ++i)
{
element_space_ += (lengths_[i] - 1) * strides_[i];
}
}
__host__ __device__ static constexpr auto GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLength(const index_t& i) const { return lengths_[i]; }
__host__ __device__ constexpr auto GetStride(const index_t& i) const { return strides_[i]; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
__host__ __device__ constexpr auto GetStrides() const { return strides_; }
__host__ __device__ constexpr auto GetElementSize() const { return element_size_; }
__host__ __device__ constexpr auto GetElementSpace() const { return element_space_; }
__host__ __device__ constexpr auto CalculateOffset(const Index& idx) const
{
index_t offset = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset += idx[i] * strides_[i];
}
return offset;
}
__host__ __device__ constexpr auto CalculateOffsetDiff(const Index& idx_diff) const
{
index_t offset_diff = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset_diff += idx_diff[i] * strides_[i];
}
return offset_diff;
}
__host__ __device__ constexpr bool IsUpperIndexValid(const Index& idx) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
flag = flag && idx[i] >= 0 && idx[i] < lengths_[i];
}
return flag;
}
};
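For reference, the offset and element-space arithmetic of the native descriptor above, as a standalone host-side sketch (std::array stands in for ck::Array; this is an illustration, not the CK class):

#include <array>
#include <cassert>

struct NativeDesc4
{
    std::array<int, 4> lengths, strides;

    int offset(const std::array<int, 4>& idx) const
    {
        int off = 0;
        for(int i = 0; i < 4; ++i)
            off += idx[i] * strides[i];
        return off;
    }

    int element_space() const
    {
        int space = 1; // offset of the last element, plus one
        for(int i = 0; i < 4; ++i)
            space += (lengths[i] - 1) * strides[i];
        return space;
    }
};

int main()
{
    // packed NCHW 2x3x4x5: strides {60, 20, 5, 1}
    NativeDesc4 d{{2, 3, 4, 5}, {60, 20, 5, 1}};
    assert(d.offset({1, 2, 3, 4}) == 60 + 40 + 15 + 4);
    assert(d.element_space() == 2 * 3 * 4 * 5); // packed case
    return 0;
}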
#if 0
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor,
typename Transforms, // Tuple<DynamicMultIndexTransforms,...>
typename LowDimensions, // Tuple<Sequence<...>,...>
typename UpDimensions> // Tuple<Sequence<...>,...>
struct DynamicTransformedTensorDescriptor
{
using Type = DynamicTransformedTensorDescriptor;
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
// Here, we assume all lower-dimensions are active
// TODO: sanity-check all lower-dimension are indeed active
using duplicated_low_active_dims =
decltype(unpack(lambda_merge_sequences{}, LowDimensions{}));
using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return low_active_dims::Size();
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
using duplicated_up_active_dims =
decltype(unpack(lambda_merge_sequences{}, UpDimensions{}));
using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return up_active_dims::Size();
}
static constexpr index_t ndim_up_ = GetNumOfUpperDimension();
static constexpr index_t ndim_low_ = GetNumOfLowerDimension();
static constexpr index_t num_transform_ = Transforms::Size();
using UpperIndex = MultiIndex<ndim_up_>;
using LowerIndex = MultiIndex<ndim_low_>;
const LowTensorDescriptor low_tensor_desc_;
const Transforms transforms_;
const LowDimensions low_dims_;
const UpDimensions up_dims_;
__host__ __device__ constexpr DynamicTransformedTensorDescriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms)
: low_tensor_desc_(low_tensor_desc),
transforms_(transforms)
{
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const
{
return low_tensor_desc_;
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr auto GetLength(index_t i) const
{
return GetLengths()[i];
}
__host__ __device__ constexpr auto GetElementSize() const
{
index_t element_size = 1;
for(index_t i = 0; i < ndim_up_; ++i)
{
element_size *= GetLength(i);
}
return element_size;
}
__host__ __device__ constexpr auto GetElementSpace() const
{
return low_tensor_desc_.GetElementSpace();
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
static_for<0, num_transform_, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, UpDimensions{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensions{}.At(itran));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
});
return idx_low;
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
{
LowerIndex idx_low_diff;
static_for<0, num_transform_, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensions{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensions{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensions{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensions{}.At(itran));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_diff_part = tran.CalculateLowerIndexDiff(
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
});
return idx_low_diff;
}
__host__ __device__ static constexpr index_t CalculateOffset(const UpperIndex& idx_up)
{
return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
}
};
#endif
} // namespace ck
#endif
@@ -15,6 +15,5 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}
} // namespace ck
#endif
@@ -13,7 +13,7 @@ struct Array
using data_type = TData;
// TODO: implement empty Array
TData mData[NSize] = {0};
__host__ __device__ explicit constexpr Array() {}
@@ -159,262 +159,5 @@ struct ArrayElementPicker
Arr& mArray;
};
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_array_element(Arr& a, Picks)
{
return ArrayElementPicker<Arr, Picks>(a);
}
template <typename T>
__host__ __device__ constexpr auto to_array(const T& x)
{
Array<typename T::data_type, T::Size()> y;
static_for<0, T::Size(), 1>{}([&](auto i) { y.At(i) = x.At(i); });
return y;
}
// TODO: remove this
template <index_t... Is>
__host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
{
return Array<index_t, sizeof...(Is)>{Is...};
}
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto make_zero_array()
{
constexpr auto zero_sequence = typename uniform_sequence_gen<NSize, 0>::type{};
constexpr auto zero_array = sequence2array(zero_sequence);
return zero_array;
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Array<TData, NSize>{old_array[IRs]...};
}
template <typename TData, index_t NSize, typename MapOld2New>
struct lambda_reorder_array_given_old2new
{
const Array<TData, NSize>& old_array;
Array<TData, NSize>& new_array;
__host__ __device__ constexpr lambda_reorder_array_given_old2new(
const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
: old_array(old_array_), new_array(new_array_)
{
}
template <index_t IOldDim>
__host__ __device__ constexpr void operator()(Number<IOldDim>) const
{
TData old_data = old_array[IOldDim];
constexpr index_t INewDim = MapOld2New::At(Number<IOldDim>{});
new_array(Number<INewDim>{}) = old_data;
}
};
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*old2new*/)
{
Array<TData, NSize> new_array;
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
static_for<0, NSize, 1>{}(
lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));
return new_array;
}
template <typename TData, index_t NSize, typename ExtractSeq>
__host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
{
Array<TData, ExtractSeq::GetSize()> new_array;
constexpr index_t new_size = ExtractSeq::GetSize();
static_assert(new_size <= NSize, "wrong! too many extract");
static_for<0, new_size, 1>{}([&](auto I) { new_array(I) = old_array[ExtractSeq::At(I)]; });
return new_array;
}
// emulate constexpr lambda for array
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
{
const F& f;
const X& x;
const Y& y;
Z& z;
__host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
: f(f_), x(x_), y(y_), z(z_)
{
}
template <index_t IDim_>
__host__ __device__ constexpr void operator()(Number<IDim_>) const
{
constexpr auto IDim = Number<IDim_>{};
z(IDim) = f(x[IDim], y[IDim]);
}
};
// Array = Array + Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array - Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array += Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
a = a + b;
return a;
}
// Array -= Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
a = a - b;
return a;
}
// Array = Array + Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array - Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array * Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::multiplies<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Sequence - Array
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = TData * Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
{
Array<TData, NSize> result;
for(index_t i = 0; i < NSize; ++i)
{
result(i) = a[i] * v;
}
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
static_assert(NSize > 0, "wrong");
static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); });
return result;
}
} // namespace ck
#endif
#ifndef CK_ARRAY_HELPER_HPP
#define CK_ARRAY_HELPER_HPP
#include "array.hpp"
namespace ck {
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_array_element(Arr& a, Picks)
{
return ArrayElementPicker<Arr, Picks>(a);
}
template <typename T>
__host__ __device__ constexpr auto to_array(const T& x)
{
Array<typename T::data_type, T::Size()> y;
static_for<0, T::Size(), 1>{}([&](auto i) { y.At(i) = x.At(i); });
return y;
}
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto make_zero_array()
{
constexpr auto zero_sequence = typename uniform_sequence_gen<NSize, 0>::type{};
constexpr auto zero_array = to_array(zero_sequence);
return zero_array;
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Array<TData, NSize>{old_array[IRs]...};
}
template <typename TData, index_t NSize, typename MapOld2New>
struct lambda_reorder_array_given_old2new
{
const Array<TData, NSize>& old_array;
Array<TData, NSize>& new_array;
__host__ __device__ constexpr lambda_reorder_array_given_old2new(
const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
: old_array(old_array_), new_array(new_array_)
{
}
template <index_t IOldDim>
__host__ __device__ constexpr void operator()(Number<IOldDim>) const
{
TData old_data = old_array[IOldDim];
constexpr index_t INewDim = MapOld2New::At(Number<IOldDim>{});
new_array(Number<INewDim>{}) = old_data;
}
};
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*old2new*/)
{
Array<TData, NSize> new_array;
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
static_for<0, NSize, 1>{}(
lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));
return new_array;
}
template <typename TData, index_t NSize, typename ExtractSeq>
__host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
{
Array<TData, ExtractSeq::GetSize()> new_array;
constexpr index_t new_size = ExtractSeq::GetSize();
static_assert(new_size <= NSize, "wrong! too many extract");
static_for<0, new_size, 1>{}([&](auto I) { new_array(I) = old_array[ExtractSeq::At(I)]; });
return new_array;
}
// emulate constexpr lambda for array
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
{
const F& f;
const X& x;
const Y& y;
Z& z;
__host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
: f(f_), x(x_), y(y_), z(z_)
{
}
template <index_t IDim_>
__host__ __device__ constexpr void operator()(Number<IDim_>) const
{
constexpr auto IDim = Number<IDim_>{};
z(IDim) = f(x[IDim], y[IDim]);
}
};
// Array = Array + Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array - Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array += Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
a = a + b;
return a;
}
// Array -= Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
a = a - b;
return a;
}
// Array = Array + Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array - Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array * Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::multiplies<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Sequence - Array
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = TData * Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
{
Array<TData, NSize> result;
for(index_t i = 0; i < NSize; ++i)
{
result(i) = a[i] * v;
}
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
static_assert(NSize > 0, "wrong");
static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); });
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
#pragma unroll
for(index_t i = NSize - 1; i >= 0; --i)
{
r = f(r, x[i]);
y(i) = r;
}
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
#pragma unroll
for(index_t i = NSize - 1; i > 0; --i)
{
y(i) = r;
r = f(r, x[i]);
}
y(0) = r;
return y;
}
} // namespace ck
#endif
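The difference between the two reverse scans above matters for the transforms: on lengths {C, Y, X} the exclusive variant with init 1 produces the stride vector {Y*X, X, 1} used by Merge/UnMerge, while the inclusive variant folds in each element's own length. A standalone check (std::array stands in for ck::Array; this is an illustration, not the CK code):

#include <array>
#include <cassert>
#include <functional>

template <std::size_t N, typename F>
std::array<int, N> reverse_inclusive_scan(std::array<int, N> x, F f, int init)
{
    std::array<int, N> y{};
    int r = init;
    for(int i = static_cast<int>(N) - 1; i >= 0; --i)
    {
        r = f(r, x[i]); // y[i] includes x[i]
        y[i] = r;
    }
    return y;
}

template <std::size_t N, typename F>
std::array<int, N> reverse_exclusive_scan(std::array<int, N> x, F f, int init)
{
    std::array<int, N> y{};
    int r = init;
    for(int i = static_cast<int>(N) - 1; i > 0; --i)
    {
        y[i] = r; // y[i] excludes x[i]
        r = f(r, x[i]);
    }
    y[0] = r;
    return y;
}

int main()
{
    std::array<int, 3> lengths{4, 3, 3}; // {C, Y, X}
    auto inc = reverse_inclusive_scan(lengths, std::multiplies<int>{}, 1);
    auto exc = reverse_exclusive_scan(lengths, std::multiplies<int>{}, 1);
    assert(inc == (std::array<int, 3>{36, 9, 3})); // {C*Y*X, Y*X, X}
    assert(exc == (std::array<int, 3>{9, 3, 1}));  // {Y*X, X, 1}: strides
    return 0;
}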
@@ -197,7 +197,7 @@ int main(int argc, char* argv[])
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<2, 2>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
@@ -244,7 +244,7 @@ int main(int argc, char* argv[])
#endif
}
#if 0
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 0
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
@@ -253,17 +253,17 @@ int main(int argc, char* argv[])
#elif 1
device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
#endif
(in_nchw_desc,
in_nchw_device,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
if(do_verification)
{
...