Commit 9535f806 authored by Chao Liu
Browse files

refactoring array

parent c98cbea0
......@@ -18,29 +18,34 @@ map_convolution_into_gemm_v1(const WeiDesc& wei_k_c_y_x_global_desc,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads)
{
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
constexpr auto i0 = Number<0>{};
constexpr auto i1 = Number<1>{};
constexpr auto i2 = Number<2>{};
constexpr auto i3 = Number<3>{};
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t N = in_n_c_hi_wi_global_desc.GetLength(i0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(i1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(i1);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(i2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(i3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(i2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(i3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(i2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(i3);
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t ConvStrideH = conv_strides[i0];
const index_t ConvStrideW = conv_strides[i1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
const index_t ConvDilationH = conv_dilations[i0];
const index_t ConvDilationW = conv_dilations[i1];
const index_t InLeftPadH = in_left_pads[i0];
const index_t InLeftPadW = in_left_pads[i1];
const index_t InRightPadH = in_right_pads[i0];
const index_t InRightPadW = in_right_pads[i1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
......@@ -59,22 +64,23 @@ map_convolution_into_gemm_v1(const WeiDesc& wei_k_c_y_x_global_desc,
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(i2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(i3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc =
transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......
......@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
AddressSpace::Vgpr,
AddressSpace::Lds,
InMemoryDataOperation::Set>(
MultiIndex<4>{0, 0, b_block_data_on_global, 0}, MultiIndex<4>{0, 0, 0, 0});
MultiIndex<4>{{0, 0, b_block_data_on_global, 0}}, MultiIndex<4>{{0, 0, 0, 0}});
// weight tensor
// global tensor in global memory, src of blockwise copy
......
......@@ -12,9 +12,8 @@ using MultiIndex = Array<index_t, N>;
// Build a MultiIndex (Array<index_t, N> in this configuration) from the given
// values. Delegates to make_array<index_t>, which static_casts each trailing
// argument to index_t, so mixed integral argument types are accepted.
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    // NOTE(review): the diff rendering had left the pre-commit
    // `return MultiIndex<sizeof...(Xs)>{{...}};` line interleaved above this
    // one (unreachable duplicate); only the post-commit body is kept.
    return make_array<index_t>(xs...);
}
#else
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;
......@@ -33,5 +32,57 @@ __host__ __device__ constexpr auto make_zero_multi_index()
typename uniform_sequence_gen<NSize, 0>::type{});
}
// Convert any statically-sized, unpackable container into a MultiIndex by
// expanding its elements and forwarding them to make_multi_index.
template <typename T>
__host__ __device__ constexpr auto to_multi_index(const T& x)
{
    return unpack([](auto... elements) { return make_multi_index(elements...); }, x);
}
// Element-wise in-place addition: y[i] += x[i] for every i.
// X may be any indexable type exposing a static Size() of the same length.
template <index_t NSize, typename X>
__host__ __device__ constexpr MultiIndex<NSize>& operator+=(MultiIndex<NSize>& y, const X& x)
{
    static_assert(X::Size() == NSize, "wrong! size not the same");

    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });

    // Return a reference rather than a deduced value copy, matching the
    // conventional compound-assignment contract (the result aliases y).
    return y;
}
// Element-wise in-place subtraction: y[i] -= x[i] for every i.
// X may be any indexable type exposing a static Size() of the same length.
template <index_t NSize, typename X>
__host__ __device__ constexpr MultiIndex<NSize>& operator-=(MultiIndex<NSize>& y, const X& x)
{
    static_assert(X::Size() == NSize, "wrong! size not the same");

    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });

    // Return a reference rather than a deduced value copy, matching the
    // conventional compound-assignment contract (the result aliases y).
    return y;
}
// Element-wise sum of a MultiIndex and any same-sized indexable type.
template <index_t NSize, typename T>
__host__ __device__ constexpr auto operator+(const MultiIndex<NSize>& lhs, const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    MultiIndex<NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] + rhs[i]; });

    return result;
}
// Element-wise difference of a MultiIndex and any same-sized indexable type.
template <index_t NSize, typename T>
__host__ __device__ constexpr auto operator-(const MultiIndex<NSize>& lhs, const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    MultiIndex<NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] - rhs[i]; });

    return result;
}
// Element-wise (Hadamard) product of a MultiIndex and any same-sized
// indexable type.
template <index_t NSize, typename T>
__host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& lhs, const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    MultiIndex<NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] * rhs[i]; });

    return result;
}
} // namespace ck
#endif
......@@ -234,7 +234,7 @@ struct Merge
{
if(idx_up_diff[0] == 0)
{
return make_zero_array<index_t, nDimLow>();
return make_zero_multi_index<nDimLow>();
}
else
{
......@@ -459,7 +459,7 @@ struct Embed
for(index_t icorner = 0; icorner < ncorner; ++icorner)
{
// generate upper index for each corner
auto idx_up = make_zero_array<index_t, nDimUp>();
auto idx_up = make_zero_multi_index<nDimUp>();
index_t itmp = icorner;
......@@ -512,7 +512,7 @@ struct Freeze
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return make_zero_array<index_t, nDimLow>();
return make_zero_multi_index<nDimLow>();
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......
......@@ -270,7 +270,7 @@ struct TensorCoordinate
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
{
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
make_zero_multi_index<TensorDesc::GetNumOfDimension()>());
}
template <typename... Ts>
......@@ -278,7 +278,7 @@ struct TensorCoordinate
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
{
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
make_zero_multi_index<TensorDesc::GetNumOfDimension()>());
}
public:
......
......@@ -317,7 +317,7 @@ struct TransformedTensorDescriptor
// this assume each lower (single) index is only assocaited with one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
idx_low_part = tran.CalculateLowerIndex(to_multi_index(idx_up_part));
});
return idx_low;
......@@ -345,8 +345,9 @@ struct TransformedTensorDescriptor
// this assume each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_diff_part = tran.CalculateLowerIndexDiff(
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
idx_low_diff_part = tran.CalculateLowerIndexDiff(to_multi_index(idx_up_diff_part),
to_multi_index(idx_up_old_part),
to_multi_index(idx_low_old_part));
});
return idx_low_diff;
......@@ -506,7 +507,8 @@ struct TransformedTensorDescriptor
constexpr auto low_dims_part = LowDimensionIds{}.At(itran);
constexpr auto low_lengths_part =
GetLowerTensorDescriptor().GetLengths(low_dims_part);
const auto idx_low_part = to_array(pick_array_element(idx_low, low_dims_part));
const auto idx_low_part =
to_multi_index(pick_array_element(idx_low, low_dims_part));
for(index_t i = 0; i < low_dims_part.Size(); ++i)
{
......
......@@ -68,9 +68,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
mThreadwiseLoad.SetDstSliceOrigin(make_zero_multi_index<nDim>());
mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
mThreadwiseStore.SetSrcSliceOrigin(make_zero_multi_index<nDim>());
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
}
}
......
......@@ -54,8 +54,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}
// Default constructor: delegates to the two-origin constructor with both the
// src and dst slice origins at the all-zero index.
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
    // NOTE(review): the diff rendering showed the pre-commit
    // make_zero_array-based initializer interleaved with this one; only the
    // post-commit delegating initializer is kept.
    : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_multi_index<nDim>(),
                                            make_zero_multi_index<nDim>())
{
}
......@@ -104,7 +104,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
auto scalar_id = make_zero_multi_index<nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
......@@ -143,7 +143,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
auto scalar_id = make_zero_multi_index<nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
......@@ -177,9 +177,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Shift the source slice window by step_sizes_, forward when
// PositiveDirection is true and backward otherwise. The direction is a
// compile-time flag so only one branch is instantiated via static_if.
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
                                   integral_constant<bool, PositiveDirection>)
{
    const auto step_sizes = to_multi_index(step_sizes_);

    // step_sizes is already a MultiIndex — wrapping it in to_multi_index
    // again (as the interleaved diff line did) is redundant; this now matches
    // MoveDstSliceWindow exactly.
    static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
        .Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
......@@ -187,7 +187,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
__device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_array(step_sizes_);
const auto step_sizes = to_multi_index(step_sizes_);
static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
.Else([&](auto) { mDstSliceOrigin -= step_sizes; });
......
......@@ -9,49 +9,20 @@ namespace ck {
template <typename TData, index_t NSize>
struct Array
{
using type = Array<TData, NSize>;
using type = Array;
using data_type = TData;
// hack: add extra element to allow empty array
// TODO: implement empty Array
TData mData[NSize + 1] = {0};
TData mData[NSize] = {0};
__host__ __device__ static constexpr index_t Size() { return NSize; }
// TODO: remove
__host__ __device__ static constexpr index_t GetSize() { return Size(); }
template <index_t I>
__host__ __device__ constexpr const TData& At(Number<I>) const
{
static_assert(I < NSize, "wrong!");
return mData[I];
}
template <index_t I>
__host__ __device__ constexpr TData& At(Number<I>)
{
static_assert(I < NSize, "wrong!");
return mData[I];
}
__host__ __device__ constexpr const TData& At(index_t i) const { return mData[i]; }
__host__ __device__ constexpr TData& At(index_t i) { return mData[i]; }
template <typename I>
__host__ __device__ constexpr const TData& operator[](I i) const
{
return At(i);
}
__host__ __device__ constexpr const TData& operator[](index_t i) const { return At(i); }
template <typename I>
__host__ __device__ constexpr TData& operator()(I i)
{
return At(i);
}
__host__ __device__ constexpr TData& operator()(index_t i) { return At(i); }
template <typename T>
__host__ __device__ constexpr auto operator=(const T& a)
......@@ -62,127 +33,34 @@ struct Array
return *this;
}
};
template <typename T>
__host__ __device__ constexpr auto operator+=(const T& a)
{
static_assert(T::Size() == Size(), "wrong! size not the same");
static_for<0, Size(), 1>{}([&](auto i) { operator()(i) += a[i]; });
return *this;
}
template <typename T>
__host__ __device__ constexpr auto operator-=(const T& a)
{
static_assert(T::Size() == Size(), "wrong! size not the same");
static_for<0, Size(), 1>{}([&](auto i) { operator()(i) -= a[i]; });
return *this;
}
template <typename T>
__host__ __device__ constexpr auto operator+(const T& a) const
{
static_assert(T::Size() == Size(), "wrong! size not the same");
type r;
static_for<0, Size(), 1>{}([&](auto i) { r(i) = operator[](i) + a[i]; });
return r;
}
template <typename T>
__host__ __device__ constexpr auto operator-(const T& a) const
{
static_assert(T::Size() == Size(), "wrong! size not the same");
type r;
static_for<0, Size(), 1>{}([&](auto i) { r(i) = operator[](i) - a[i]; });
return r;
}
template <typename T>
__host__ __device__ constexpr auto operator*(const T& a) const
{
static_assert(T::Size() == Size(), "wrong! size not the same");
type r;
static_for<0, Size(), 1>{}([&](auto i) { r(i) = operator[](i) * a[i]; });
return r;
}
struct lambda_PushBack // emulate constexpr lambda
{
const Array<TData, NSize>& old_array;
Array<TData, NSize + 1>& new_array;
__host__ __device__ constexpr lambda_PushBack(const Array<TData, NSize>& old_array_,
Array<TData, NSize + 1>& new_array_)
: old_array(old_array_), new_array(new_array_)
{
}
template <index_t I>
__host__ __device__ constexpr void operator()(Number<I>) const
{
new_array(Number<I>{}) = old_array[I];
}
};
__host__ __device__ constexpr auto PushBack(TData x) const
{
Array<TData, NSize + 1> new_array;
static_for<0, NSize, 1>{}(lambda_PushBack(*this, new_array));
new_array(Number<NSize>{}) = x;
return new_array;
}
template <index_t NAppend>
__host__ __device__ constexpr auto Append(const Array<TData, NAppend>& xs) const
{
Array<TData, NSize + NAppend> r;
static_for<0, NSize, 1>{}([&r, this ](auto i) constexpr { r(i) = (*this)[i]; });
static_for<0, NAppend, 1>{}([&r, &xs ](auto i) constexpr { r(NSize + i) = xs[i]; });
// empty Array
template <typename TData>
struct Array<TData, 0>
{
using type = Array;
using data_type = TData;
return r;
}
__host__ __device__ static constexpr index_t Size() { return 0; }
};
// Build an Array<X, N> from one or more values. The element type X is taken
// from (or explicitly given for) the first argument; every trailing argument
// is static_cast to X so mixed argument types are accepted.
template <typename X, typename... Xs>
__host__ __device__ constexpr auto make_array(const X& x, const Xs&... xs)
{
    // NOTE(review): the diff rendering had left the pre-commit return line
    // (using sizeof...(xs) on the value pack and no casts) interleaved above
    // this one as an unreachable duplicate; only the post-commit body is kept.
    return Array<X, sizeof...(Xs) + 1>{{x, static_cast<X>(xs)...}};
}
// Return a copy of 'a' with 'x' appended as the last element.
// NOTE(review): this span of the diff interleaved the deleted to_array() and
// make_zero_array() helpers with the newly added push_back(); the commit's
// other hunks replace their call sites with to_multi_index() /
// make_zero_multi_index(), so only the post-commit push_back is kept here.
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto push_back(Array<TData, NSize>& a, const TData& x)
{
    Array<TData, NSize + 1> r;

    // copy the original NSize elements
    static_for<0, NSize, 1>{}([&r, &a ](auto i) constexpr { r(i) = a[i]; });

    // append the new element at the back
    r(Number<NSize>{}) = x;

    return r;
}
} // namespace ck
......
......@@ -63,7 +63,7 @@ struct ford_impl
for(index_t i = 0; i < RemainLengths::Front(); ++i)
{
ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, current_ordered_id.PushBack(i));
f, push_back(current_ordered_id, i));
}
}
};
......
......@@ -386,43 +386,5 @@ struct StaticallyIndexedArray<TData, 22> : Tuple<TData,
using data_type = TData;
};
// Element-wise in-place addition for StaticallyIndexedArray: y[i] += x[i].
// X may be any indexable type exposing a static Size() of the same length.
template <typename TData, index_t NSize, typename X>
__host__ __device__ constexpr StaticallyIndexedArray<TData, NSize>&
operator+=(StaticallyIndexedArray<TData, NSize>& y, const X& x)
{
    static_assert(X::Size() == NSize, "wrong! size not the same");

    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });

    // Return a reference rather than a deduced value copy, matching the
    // conventional compound-assignment contract (the result aliases y).
    return y;
}
// Element-wise in-place subtraction for StaticallyIndexedArray: y[i] -= x[i].
// X may be any indexable type exposing a static Size() of the same length.
template <typename TData, index_t NSize, typename X>
__host__ __device__ constexpr StaticallyIndexedArray<TData, NSize>&
operator-=(StaticallyIndexedArray<TData, NSize>& y, const X& x)
{
    static_assert(X::Size() == NSize, "wrong! size not the same");

    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });

    // Return a reference rather than a deduced value copy, matching the
    // conventional compound-assignment contract (the result aliases y).
    return y;
}
// Element-wise sum of a StaticallyIndexedArray and any same-sized indexable
// type.
template <typename TData, index_t NSize, typename T>
__host__ __device__ constexpr auto operator+(const StaticallyIndexedArray<TData, NSize>& lhs,
                                             const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    StaticallyIndexedArray<TData, NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] + rhs[i]; });

    return result;
}
// Element-wise difference of a StaticallyIndexedArray and any same-sized
// indexable type.
template <typename TData, index_t NSize, typename T>
__host__ __device__ constexpr auto operator-(const StaticallyIndexedArray<TData, NSize>& lhs,
                                             const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    StaticallyIndexedArray<TData, NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] - rhs[i]; });

    return result;
}
} // namespace ck
#endif
......@@ -29,16 +29,16 @@ void device_dummy_dynamic_transform_v2(InDesc,
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
const auto in_nchw_desc = make_dynamic_native_tensor_descriptor_v2(
to_array(InDesc::GetLengths()), to_array(InDesc::GetStrides()));
to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor_v2(
to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));
to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor_v2(
to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));
to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
const auto conv_strides = to_array(ConvStrides{});
const auto conv_dilations = to_array(ConvDilations{});
const auto in_left_pads = to_array(InLeftPads{});
const auto in_right_pads = to_array(InRightPads{});
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
const auto tensor_descs = map_convolution_into_gemm_v2(wei_kcyx_desc,
in_nchw_desc,
......
......@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
#endif
}
#if 0
#if 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment