Commit b2439ec9 authored by Chao Liu

adding implicit gemm v4 (nchw, kcyx)

parent 0a265731
......@@ -44,11 +44,7 @@ struct GeneratorTensor_3
{
std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
#if 0
auto f_acc = std::plus<index_t>{};
#else
auto f_acc = [](auto a, auto b) { return 100 * a + b; };
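// e.g. multi-index (1, 2, 3) accumulates to 10203, so each generated element encodes
// its own coordinates in decimal digits (assuming every index is below 100)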
#endif
return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
}
......@@ -447,7 +443,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
#elif 0
// 3x3 filter, 28x28 image
constexpr index_t N = 128;
constexpr index_t C = 256;
......@@ -543,7 +539,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
#elif 1
// 1x1 filter, 14x14 image
constexpr index_t N = 128;
constexpr index_t C = 512;
......@@ -553,6 +549,18 @@ int main(int argc, char* argv[])
constexpr index_t Y = 1;
constexpr index_t X = 1;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
// 1x1 filter, 73x73 image
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 73;
constexpr index_t WI = 73;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#endif
......@@ -609,8 +617,6 @@ int main(int argc, char* argv[])
};
wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
#endif
// out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
}
#if 1
......@@ -649,7 +655,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
#if 1
#if 0
if(Y == 3 && X == 3)
{
host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
......
......@@ -105,6 +105,7 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
return new_array;
}
// Array = Array + Array
template <class TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
......@@ -119,6 +120,55 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
return result;
}
// Array = Array - Array
template <class TData, index_t NSize>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a[i] - b[i];
});
return result;
}
// Array = Array + Sequence
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a[i] + b.Get(I);
});
return result;
}
// Array = Array - Sequence
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a[i] - b.Get(I);
});
return result;
}
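// usage sketch (illustration only):
//   Array{5, 7, 9} + Sequence<1, 2, 3>{}  ->  Array{6, 9, 12}
//   Array{5, 7, 9} - Sequence<1, 2, 3>{}  ->  Array{4, 5, 6}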
// Array = Array * Sequence
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
......@@ -136,15 +186,119 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
return result;
}
template <class TData, index_t NSize, class F>
__host__ __device__ constexpr TData reduce_on_array(Array<TData, NSize> a, F f)
// Array = Sequence - Array
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
TData result = a[0];
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_for<1, NSize, 1>{}([&](auto I) {
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a.Get(I) - b[i];
});
return result;
}
template <class TData, index_t NSize, class Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
static_assert(NSize > 0, "wrong");
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result = f(result, a[i]);
});
return result;
}
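// usage sketch (illustration only), with an Array<index_t, 3> named a holding {2, 3, 4}:
//   accumulate_on_array(a, mod_conv::plus<index_t>{}, index_t(0));       // 2 + 3 + 4 = 9
//   accumulate_on_array(a, mod_conv::multiplies<index_t>{}, index_t(1)); // 2 * 3 * 4 = 24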
template <class T, index_t NSize>
__host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
{
constexpr index_t nsize = a.GetSize();
static_assert(nsize > 0 && nsize <= 10, "wrong!");
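// note: the "%u" conversions below assume T is an unsigned 32-bit integer type such as index_t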
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
......@@ -99,15 +99,7 @@ struct ConstantMergedTensorDescriptor
return original_multi_id;
}
#if 0 // not needed
__host__ __device__ static index_t
GetOffsetFromOriginalMultiIndex(Array<index_t, nOriginalDim> original_multi_id)
{
return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
}
#endif
__host__ __device__ static index_t GetOffsetFromMultiIndexA(Array<index_t, nDim> multi_id)
__host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
{
const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
......@@ -126,38 +118,6 @@ struct ConstantMergedTensorDescriptor
return dummy_desc.GetMultiIndexFrom1dIndex(id);
}
#if 0 // not needed
template <index_t IDim>
__host__ __device__ static index_t GetNewOriginalMultiIndexAfterMovingAlongOneDimension(
Array<index_t, nOriginalDim> old_original_multi_id, Number<IDim>, index_t step_size)
{
auto new_original_multi_id = old_original_multi_id;
// get partial-original-multi-id corresponding to this merged dimension
constexpr auto original_partial_dims = std::get<IDim>(mOriginalDimMergeSeqs);
constexpr auto original_partial_tensor_desc =
OriginalTensorDesc::Extract(original_partial_dims);
auto old_original_partial_multi_id =
extract_array(old_original_multi_id, original_partial_dims);
auto new_original_partial_multi_id =
original_partial_tensor_desc.GetNewMultiIndexGivenStepSizeOf1dIndex(
old_original_partial_multi_id, step_size);
// update original-multi-id
static_for<0, original_partial_dims.GetSize(), 1>{}([&](auto I_) {
constexpr auto I = decltype(I_){};
constexpr index_t idim_original = original_partial_dims.Get(I);
new_original_multi_id[idim_original] = new_original_partial_multi_id[I.Get()];
});
return new_original_multi_id;
}
#endif
};
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
......
......@@ -4,7 +4,7 @@
template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
{
return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{})
.PushBack(Number<1>{});
}
......@@ -95,7 +95,7 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr index_t GetElementSize()
{
return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
}
// WRONG! ReorderGivenOld2New is broken
......@@ -107,10 +107,10 @@ struct ConstantTensorDescriptor
constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus<index_t>{}, Number<1>{});
(lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
#else // WRONG! align should be applied to the last memory rank, not the last tensor dimension
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{});
(GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
#endif
return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
......@@ -144,7 +144,8 @@ struct ConstantTensorDescriptor
constexpr auto multi_id = Sequence<Is...>{};
return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{});
return accumulate_on_sequence(
multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
}
#if 0 // ReorderGivenOld2new is broken
......@@ -197,32 +198,70 @@ struct ConstantTensorDescriptor
// This function doesn't do a carry check on the highest dimension, for performance reasons.
// It is the user's responsibility to make sure the result "new_multi_id" is not out of bounds
// on the highest dimension
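// Worked example (illustration only): with lengths {2, 3, 4}, old_multi_id {0, 1, 3} and
// step_size_of_1d_index = 3 in the positive direction, the step decomposes to {0, 0, 3};
// adding gives {0, 1, 6}, dim 2 wraps to 2 with a carry into dim 1, and the result is
// {0, 2, 2} (1d index 7 + 3 = 10). The negative direction mirrors this with borrows on a
// temporarily shifted-up multi-id.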
template <bool PositiveDirection>
__host__ __device__ static Array<index_t, nDim>
UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
index_t step_size_of_1d_index)
index_t step_size_of_1d_index,
integral_constant<bool, PositiveDirection>)
{
auto new_multi_id = old_multi_id + GetMultiIndexFrom1dIndex(step_size_of_1d_index);
bool carry = false;
// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
constexpr auto IDim = Number<idim>{};
if(carry)
{
++new_multi_id[idim];
}
carry = false;
if(new_multi_id[idim] >= GetLength(IDim))
{
new_multi_id[idim] -= GetLength(IDim);
carry = true;
}
Array<index_t, nDim> new_multi_id;
const auto step_sizes = GetMultiIndexFrom1dIndex(step_size_of_1d_index);
static_if<PositiveDirection>{}([&](auto) {
new_multi_id = old_multi_id + step_sizes;
bool carry = false;
// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
constexpr auto IDim = Number<idim>{};
if(carry)
{
++new_multi_id[idim];
}
carry = false;
if(new_multi_id[idim] >= GetLength(IDim))
{
new_multi_id[idim] -= GetLength(IDim);
carry = true;
}
});
}).Else([&](auto) {
// shift up multi-id to avoid unsigned integer underflow during intermediate
// calculations. After the shift, should have new_multi_id[...] >= 1
new_multi_id = old_multi_id + (GetLengths() - step_sizes);
bool borrow = false;
// do borrow check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
constexpr auto IDim = Number<idim>{};
if(borrow)
{
--new_multi_id[idim];
}
borrow = false;
if(new_multi_id[idim] < GetLength(IDim))
{
new_multi_id[idim] += GetLength(IDim);
borrow = true;
}
});
// shift back down multi-id
// here, should have new_multi_id[...] >= GetLengths()
new_multi_id = new_multi_id - GetLengths();
});
return new_multi_id;
......@@ -255,7 +294,7 @@ struct ConstantTensorDescriptor
}
template <class... Ts>
__host__ __device__ static constexpr auto Inject(ConstantTensorDescriptor<Ts...>)
__host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>)
{
using leaf_tensor = ConstantTensorDescriptor<Ts...>;
......@@ -290,7 +329,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
constexpr index_t fold_intervals_product =
accumulate_on_sequence(fold_intervals, std::multiplies<index_t>{}, Number<1>{});
accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});
constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{});
......@@ -309,7 +348,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_strides =
Number<unfold_stride>{} *
reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
std::multiplies<index_t>{});
mod_conv::multiplies<index_t>{});
// folded_ranks
constexpr auto fold_ranks =
......@@ -389,7 +428,7 @@ struct ConstantTensorDescriptor
// unfolded length, stride and rank
constexpr index_t unfold_length = accumulate_on_sequence(
GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{});
GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
......@@ -472,7 +511,20 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
{
constexpr index_t ndim = TDesc::GetNumOfDimension();
static_assert(ndim >= 2 && ndim <= 10, "wrong!");
static_assert(ndim >= 1 && ndim <= 10, "wrong!");
static_if<ndim == 1>{}([&](auto fwd) {
constexpr auto I0 = Number<0>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n",
s,
desc.GetNumOfDimension(),
desc.GetLength(I0),
desc.GetStride(I0),
desc.GetMemoryRank(I0));
});
static_if<ndim == 2>{}([&](auto fwd) {
constexpr auto I0 = Number<0>{};
......
......@@ -495,3 +495,39 @@ __host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
return seq_left.PushBack(Number<X>{}).Append(seq_right);
}
template <index_t... Xs>
__host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
{
constexpr index_t nsize = Sequence<Xs...>::GetSize();
static_assert(nsize <= 10, "wrong!");
static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 5>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 6>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 7>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 8>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 9>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
......@@ -158,7 +158,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread)
{
......
......@@ -500,7 +500,7 @@ struct Blockwise4dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread)
{
......
......@@ -3,6 +3,7 @@
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst
// For now, only support SubLengths == 1 on a merged dimension
template <index_t BlockSize,
class Float,
class SrcDesc,
......@@ -47,7 +48,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
Array<index_t, nDim> dst_block_data_multi_id_begin)
{
// check NDim consistent
// check NDim consistency
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
......@@ -55,7 +56,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
"wrong");
// check
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcAccessOrder>::value &&
is_valid_sequence_map<DstAccessOrder>::value,
......@@ -140,10 +141,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
// complete offset
mThreadSrcOffset = reduce_on_array(mThreadSrcPartialOffsets, std::plus<index_t>{});
mThreadDstOffset = reduce_on_array(mThreadDstPartialOffsets, std::plus<index_t>{});
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
#if 0
if(get_block_1d_id() == 0)
{
printf("id %5u %5u: "
"src_block_data_multi_id_begin: %u %u %u %u, "
......@@ -279,13 +284,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
// the boundary of the tensor being sliced. This function doesn't do a runtime sanity
// check on an out-of-bound slicing window, for performance reasons
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(Number<IDim_>,
Number<StepSize>,
integral_constant<bool, PositiveDirection>)
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
static_assert(PositiveDirection,
"wrong! only support movement in positive direction for now");
constexpr auto IDim = Number<IDim_>{};
constexpr index_t idim = IDim.Get();
......@@ -306,7 +307,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
auto new_src_partial_original_multi_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_multi_id, StepSize);
old_src_partial_original_multi_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, src_partial_original_dims.GetSize(), 1>{}([&](auto I_) {
......@@ -328,7 +329,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
mThreadSrcPartialOffsets[idim] = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = mThreadSrcOffset + new_src_partial_offset - old_src_partial_offset;
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto fwd) {
// Logic for non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
......@@ -336,13 +337,25 @@ struct BlockwiseGenericTensorSliceCopy_v1
// should be able to remove these calculations.
// TODO: make sure compiler would actually remove them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazards like
// unsigned integer underflow. There is NO runtime sanity check to prevent the hazard
constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
static_if<PositiveDirection>{}([&](auto) {
mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
mThreadSrcOriginalMultiId[idim_original] += StepSize;
mThreadSrcOriginalMultiId[idim_original] += StepSize;
mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
}).Else([&](auto) {
mThreadSrcOffset -= StepSize * SrcDesc::GetStride(IDim);
mThreadSrcOriginalMultiId[idim_original] -= StepSize;
mThreadSrcPartialOffsets[idim] -= StepSize * SrcDesc::GetStride(IDim);
});
});
}
};
......@@ -39,6 +39,18 @@ struct scales
__host__ __device__ constexpr T operator()(T a) const { return s * a; }
};
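// device-side counterparts of std::plus / std::multiplies; defined here so the functors
// carry explicit __host__ __device__ qualifiers and can be used inside kernels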
template <class T>
struct plus
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
};
template <class T>
struct multiplies
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
};
template <class T>
struct integer_divide_ceiler
{
......
......@@ -58,6 +58,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = integral_constant<bool, false>{};
constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{};
constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};
......@@ -123,7 +126,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
BlockSize,
Float,
decltype(in_c_n1_b_n2_global_merged_desc),
......@@ -150,20 +153,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// this copy operator already has blockwise offset built-in
const auto blockwise_wei_copy =
#if 0
auto blockwise_wei_copy =
#if 1
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
Float,
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_C_K,
WeiBlockCopyClusterLengths_C_K,
Sequence<0, 1>, // thread_arrange_order [C, K]
Sequence<0, 1>, // src_access_order [C, K]
Sequence<0, 1>, // dst_access_order [C, K]
WeiBlockCopyDataPerAccess_K,
WeiBlockCopyDataPerAccess_K>(
Float,
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_C_K,
WeiBlockCopyClusterLengths_C_K,
Sequence<0, 1>, // thread_arrange_order [C, K]
Sequence<0, 1>, // src_access_order [C, K]
Sequence<0, 1>, // dst_access_order [C, K]
WeiBlockCopyDataPerAccess_K,
WeiBlockCopyDataPerAccess_K>(
{0, k_block_data_on_global}, {0, 0});
#else
Blockwise2dTensorCopy3<BlockSize,
......@@ -175,16 +178,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
{0, 0});
#endif
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[CPerBlock, KPerBlock] is in LDS
// b_mtx[CPerBlock, N1 * BPerBlock * N2] is in LDS
// c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
// register
constexpr auto a_c_k_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<KPerBlock>{},
Number<wei_c_k_block_desc.GetStride(I0)>{});
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[CPerBlock, KPerBlock] is in LDS
// b_mtx[CPerBlock, N1 * BPerBlock * N2] is in LDS
// c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
// register
constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});
constexpr auto b_c_n1bn2_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
......@@ -239,6 +240,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// zero out threadwise output
threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);
#if 0
// do work
for(index_t y = 0; y < Y; ++y)
{
......@@ -269,6 +271,45 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
}
}
}
#else
for(index_t y = 0; y < Y; ++y)
{
for(index_t x = 0; x < X; ++x)
{
// calculate origin of block input and weight tensor on global memory
const Float* p_in_block_on_global =
p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
const Float* p_wei_block_on_global =
p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
for(index_t c_block_data_on_global = 0; c_block_data_on_global < C;
c_block_data_on_global += CPerBlock)
{
blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
__syncthreads();
blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
__syncthreads();
// move on C: C_N1_B_N2, C_K
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(
I0, Number<CPerBlock>{}, True);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(
I0, Number<CPerBlock>{}, True);
}
// reset C
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
}
}
#endif
// copy output: register to global memory
{
......
......@@ -59,7 +59,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
constexpr auto TRUE = integral_constant<bool, true>{};
constexpr auto True = integral_constant<bool, true>{};
constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{};
constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
......@@ -102,9 +102,9 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
// input tensor
// tensor descriptor in device memory [N0, N1, N2, H, W]
constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Hi>{})
.Slice(I3, Number<Wi>{})
// tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Ho>{})
.Slice(I3, Number<Wo>{})
.Fold(I0, Number<N1>{}, Number<N2>{})
.Extract(Sequence<0, 1, 2, 4, 5>{});
......@@ -115,12 +115,23 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
in_c_y_x_global_desc.Inject(in_n0_n1_n2_h_w_global_desc),
in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc),
Sequence<0, 1, 2>{},
Sequence<4>{},
Sequence<3, 6, 7>{},
Sequence<5>{});
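// here E merges the embedded dims (C, Y, X), N1 is dim 4, B merges (N0, Ho, Wo), and N2 is
// dim 5 of the embedded [C, Y, X, N0, N1, N2, Ho, Wo] descriptor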
#if 0
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
"in_n0_n1_n2_h_w_global_desc: ");
print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
"in_e_n1_b_n2_global_merged_desc: ");
}
#endif
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment
constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
......@@ -243,6 +254,31 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// do work
for(index_t e = 0; e < E; e += EPerBlock)
{
#if 0
if(e == 1 * EPerBlock && get_block_1d_id() == 0)
{
printf("id %5u %5u: "
"mThreadSrcOriginalMultiId %u %u %u %u %u %u %u %u, "
"mThreadSrcPartialOffsets %u %u %u %u, "
"mThreadSrcOffset %u, mThreadDstOffset %u \n",
get_block_1d_id(),
get_thread_local_1d_id(),
blockwise_in_copy.mThreadSrcOriginalMultiId[0],
blockwise_in_copy.mThreadSrcOriginalMultiId[1],
blockwise_in_copy.mThreadSrcOriginalMultiId[2],
blockwise_in_copy.mThreadSrcOriginalMultiId[3],
blockwise_in_copy.mThreadSrcOriginalMultiId[4],
blockwise_in_copy.mThreadSrcOriginalMultiId[5],
blockwise_in_copy.mThreadSrcOriginalMultiId[6],
blockwise_in_copy.mThreadSrcOriginalMultiId[7],
blockwise_in_copy.mThreadSrcPartialOffsets[0],
blockwise_in_copy.mThreadSrcPartialOffsets[1],
blockwise_in_copy.mThreadSrcPartialOffsets[2],
blockwise_in_copy.mThreadSrcPartialOffsets[3],
blockwise_in_copy.mThreadSrcOffset,
blockwise_in_copy.mThreadDstOffset);
}
#endif
// marching slicing window
blockwise_in_copy.Run(p_in_global, p_in_block);
blockwise_wei_copy.Run(p_wei_global, p_wei_block);
......@@ -253,8 +289,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
__syncthreads();
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
}
// copy output: register to global memory
......