Commit b2439ec9 authored by Chao Liu

adding implicit gemm v4 (nchw, kcyx)

parent 0a265731
@@ -44,11 +44,7 @@ struct GeneratorTensor_3
{
std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
#if 0
auto f_acc = std::plus<index_t>{};
#else
auto f_acc = [](auto a, auto b) { return 100 * a + b; };
#endif
return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
}
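// Note: the 100 * a + b accumulator appears to be chosen so that each index of the
// multi-index occupies its own "digit group" of the generated value (as long as every
// index stays below 100), which makes it easy to read indices back out of the data when
// debugging.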
@@ -447,7 +443,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 3x3 filter, 28x28 image
constexpr index_t N = 128;
constexpr index_t C = 256;
@@ -543,7 +539,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
// 1x1 filter, 14x14 image
constexpr index_t N = 128;
constexpr index_t C = 512;
@@ -553,6 +549,18 @@ int main(int argc, char* argv[])
constexpr index_t Y = 1;
constexpr index_t X = 1;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
// 1x1 filter, 73x73 image
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 73;
constexpr index_t WI = 73;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#endif
@@ -609,8 +617,6 @@ int main(int argc, char* argv[])
};
wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
#endif
// out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
}
#if 1
@@ -649,7 +655,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
#if 0
if(Y == 3 && X == 3)
{
host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
......
@@ -105,6 +105,7 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
return new_array;
}
// Array = Array + Array
template <class TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
@@ -119,6 +120,55 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
return result;
}
// Array = Array - Array
template <class TData, index_t NSize>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a[i] - b[i];
});
return result;
}
// Array = Array + Sequence
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a[i] + b.Get(I);
});
return result;
}
// Array = Array - Sequence
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a[i] - b.Get(I);
});
return result;
}
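// The element-wise Array/Sequence operators above require matching sizes (enforced by the
// static_asserts) and appear to exist mainly for multi-index arithmetic, e.g. adding a step
// vector to a tensor coordinate.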
// Array = Array * Sequence
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
@@ -136,15 +186,119 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
return result;
}
// Array = Sequence - Array
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a.Get(I) - b[i];
});
return result;
}
template <class TData, index_t NSize, class Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
static_assert(NSize > 0, "wrong");
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result = f(result, a[i]);
});
return result;
}
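// accumulate_on_array is a left fold with an explicit init value, e.g.
// accumulate_on_array(offsets, mod_conv::plus<index_t>{}, index_t(0)) returns the sum of
// all elements of "offsets". It replaces the old reduce_on_array, which seeded the fold
// with a[0] instead.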
template <class T, index_t NSize>
__host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
{
constexpr index_t nsize = a.GetSize();
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
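// print_Array is a printf-based debug helper marked __host__ __device__, so it can be
// called from device code as well as the host, e.g. print_Array("multi_id: ", some_multi_id);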
@@ -99,15 +99,7 @@ struct ConstantMergedTensorDescriptor
return original_multi_id;
}
__host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
{
const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
@@ -126,38 +118,6 @@ struct ConstantMergedTensorDescriptor
return dummy_desc.GetMultiIndexFrom1dIndex(id);
}
#if 0 // not needed
template <index_t IDim>
__host__ __device__ static index_t GetNewOriginalMultiIndexAfterMovingAlongOneDimension(
Array<index_t, nOriginalDim> old_original_multi_id, Number<IDim>, index_t step_size)
{
auto new_original_multi_id = old_original_multi_id;
// get partial-original-multi-id corresponding to this merged dimension
constexpr auto original_partial_dims = std::get<IDim>(mOriginalDimMergeSeqs);
constexpr auto original_partial_tensor_desc =
OriginalTensorDesc::Extract(original_partial_dims);
auto old_original_partial_multi_id =
extract_array(old_original_mutli_id, original_paritial_dims);
auto new_original_partial_multi_id =
original_partial_tensor_desc.GetNewMultiIndexGivenStepSizeOf1dIndex(
old_original_partial_multi_id, step_size);
// update original-mutli-id
static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
constexpr auto I = decltype(I_){};
constexpr index_t idim_original = original_dims_partial.Get(I);
new_original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
});
return new_original_multi_id;
}
#endif
};
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
......
@@ -4,7 +4,7 @@
template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
{
return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{})
.PushBack(Number<1>{});
}
@@ -95,7 +95,7 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr index_t GetElementSize()
{
return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
}
// WRONG! ReorderGivenOld2New is broken
@@ -107,10 +107,10 @@ struct ConstantTensorDescriptor
constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
#else // WRONG! align should be applied to the last memory rank, not the last tensor dimension
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
#endif
return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
@@ -144,7 +144,8 @@ struct ConstantTensorDescriptor
constexpr auto multi_id = Sequence<Is...>{};
return accumulate_on_sequence(
multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
}
#if 0 // ReorderGivenOld2new is broken
@@ -197,32 +198,70 @@ struct ConstantTensorDescriptor
// This function doesn't do carry check on the highest dimension, for performance reasons.
// It is the user's responsibility to make sure the result "new_multi_id" is not out-of-bound
// on the highest dimension
template <bool PositiveDirection>
__host__ __device__ static Array<index_t, nDim>
UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
index_t step_size_of_1d_index,
integral_constant<bool, PositiveDirection>)
{
Array<index_t, nDim> new_multi_id;
const auto step_sizes = GetMultiIndexFrom1dIndex(step_size_of_1d_index);
static_if<PositiveDirection>{}([&](auto) {
new_multi_id = old_multi_id + step_sizes;
bool carry = false;
// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
constexpr auto IDim = Number<idim>{};
if(carry)
{
++new_multi_id[idim];
}
carry = false;
if(new_multi_id[idim] >= GetLength(IDim))
{
new_multi_id[idim] -= GetLength(IDim);
carry = true;
}
});
}).Else([&](auto) {
// shift up multi-id to avoid unsigned integer underflow during intermediate
// calculations. After the shift, should have new_multi_id[...] >= 1
new_multi_id = old_multi_id + (GetLengths() - step_sizes);
bool borrow = false;
// do borrow check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
constexpr auto IDim = Number<idim>{};
if(borrow)
{
--new_multi_id[idim];
}
borrow = false;
if(new_multi_id[idim] < GetLength(IDim))
{
new_multi_id[idim] += GetLength(IDim);
borrow = true;
}
});
// shift back down multi-id
// here, should have new_multi_id[...] >= GetLengths()
new_multi_id = new_multi_id - GetLengths();
});
return new_multi_id;
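// Worked example (a sketch, assuming lengths {2, 3, 4}): old_multi_id {0, 1, 3} is 1d index 7;
// a positive step of 2 yields {0, 2, 1}, with the overflow in the last dimension carried into
// the middle one, and stepping {0, 2, 1} back by 2 in the negative direction returns {0, 1, 3}
// via the shift-up, borrow, shift-down logic. The highest dimension is never adjusted here,
// as the comment above warns.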
@@ -255,7 +294,7 @@ struct ConstantTensorDescriptor
}
template <class... Ts>
__host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>)
{
using leaf_tensor = ConstantTensorDescriptor<Ts...>;
@@ -290,7 +329,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
constexpr index_t fold_intervals_product =
accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});
constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -309,7 +348,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_strides =
Number<unfold_stride>{} *
reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
mod_conv::multiplies<index_t>{});
// folded_ranks
constexpr auto fold_ranks =
@@ -389,7 +428,7 @@ struct ConstantTensorDescriptor
// unfolded length, stride and rank
constexpr index_t unfold_length = accumulate_on_sequence(
GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
@@ -472,7 +511,20 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
{
constexpr index_t ndim = TDesc::GetNumOfDimension();
static_assert(ndim >= 1 && ndim <= 10, "wrong!");
static_if<ndim == 1>{}([&](auto fwd) {
constexpr auto I0 = Number<0>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n",
s,
desc.GetNumOfDimension(),
desc.GetLength(I0),
desc.GetStride(I0),
desc.GetMemoryRank(I0));
});
static_if<ndim == 2>{}([&](auto fwd) {
constexpr auto I0 = Number<0>{};
......
@@ -495,3 +495,39 @@ __host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
return seq_left.PushBack(Number<X>{}).Append(seq_right);
}
template <index_t... Xs>
__host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
{
constexpr index_t nsize = Sequence<Xs...>::GetSize();
static_assert(nsize <= 10, "wrong!");
static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 5>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 6>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 7>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 8>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 9>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
@@ -158,7 +158,7 @@ struct Blockwise3dTensorCopy3
"wrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread)
{
......
@@ -500,7 +500,7 @@ struct Blockwise4dTensorCopy3
"wrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread)
{
......
@@ -3,6 +3,7 @@
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst
// For now, only support SubLengths == 1 on a merged dimension
template <index_t BlockSize,
class Float,
class SrcDesc,
@@ -47,7 +48,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
Array<index_t, nDim> dst_block_data_multi_id_begin)
{
// check NDim consistency
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
@@ -55,7 +56,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcAccessOrder>::value &&
is_valid_sequence_map<DstAccessOrder>::value,
@@ -140,10 +141,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
#if 0
if(get_block_1d_id() == 0)
{
printf("id %5u %5u: "
"src_block_data_multi_id_begin: %u %u %u %u, "
@@ -279,13 +284,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
// the boundary of the tensor being sliced. This function doesn't do runtime sanity
// check on out-of-bound slicing window, for performance reasons
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
constexpr index_t idim = IDim.Get();
@@ -306,7 +307,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
auto new_src_partial_original_multi_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_multi_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, src_partial_original_dims.GetSize(), 1>{}([&](auto I_) {
@@ -328,7 +329,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
mThreadSrcPartialOffsets[idim] = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto fwd) {
// Logic for non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
@@ -336,13 +337,25 @@ struct BlockwiseGenericTensorSliceCopy_v1
// should be able to remove these calculations.
// TODO: make sure compiler would actually remove them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazards like
// unsigned integer underflow. There is NO runtime sanity check to prevent these hazards.
constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto) {
mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
mThreadSrcOriginalMultiId[idim_original] += StepSize;
mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
}).Else([&](auto) {
mThreadSrcOffset -= StepSize * SrcDesc::GetStride(IDim);
mThreadSrcOriginalMultiId[idim_original] -= StepSize;
mThreadSrcPartialOffsets[idim] -= StepSize * SrcDesc::GetStride(IDim);
});
});
}
}; };
@@ -39,6 +39,18 @@ struct scales
__host__ __device__ constexpr T operator()(T a) const { return s * a; }
};
template <class T>
struct plus
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
};
template <class T>
struct multiplies
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
};
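// mod_conv::plus and mod_conv::multiplies mirror std::plus and std::multiplies but are
// marked __host__ __device__, which is presumably why the std:: functors are replaced
// throughout this commit: their call operators are generally not callable from device code.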
template <class T>
struct integer_divide_ceiler
{
......
@@ -58,6 +58,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = integral_constant<bool, false>{};
constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{};
constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};
@@ -123,7 +126,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
BlockSize,
Float,
decltype(in_c_n1_b_n2_global_merged_desc),
@@ -150,20 +153,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// this copy operator already has blockwise offset built-in
auto blockwise_wei_copy =
#if 1
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
Float,
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_C_K,
WeiBlockCopyClusterLengths_C_K,
Sequence<0, 1>, // thread_arrange_order [C, K]
Sequence<0, 1>, // src_access_order [C, K]
Sequence<0, 1>, // dst_access_order [C, K]
WeiBlockCopyDataPerAccess_K,
WeiBlockCopyDataPerAccess_K>(
{0, k_block_data_on_global}, {0, 0});
#else
Blockwise2dTensorCopy3<BlockSize, Blockwise2dTensorCopy3<BlockSize,
@@ -175,16 +178,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
{0, 0});
#endif
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[CPerBlock, KPerBlock] is in LDS
// b_mtx[CPerBlock, N1 * BPerBlock * N2] is in LDS
// c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
// register
constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});
constexpr auto b_c_n1bn2_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
@@ -239,6 +240,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// zero out threadwise output
threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);
#if 0
// do work
for(index_t y = 0; y < Y; ++y)
{
@@ -269,6 +271,45 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
}
}
}
#else
for(index_t y = 0; y < Y; ++y)
{
for(index_t x = 0; x < X; ++x)
{
// calculate origin of block input and weight tensor on global memory
const Float* p_in_block_on_global =
p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
const Float* p_wei_block_on_global =
p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
for(index_t c_block_data_on_global = 0; c_block_data_on_global < C;
c_block_data_on_global += CPerBlock)
{
blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
__syncthreads();
blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
__syncthreads();
// move on C: C_N1_B_N2, C_K
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(
I0, Number<CPerBlock>{}, True);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(
I0, Number<CPerBlock>{}, True);
}
// reset C
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
}
}
#endif
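// Note on the loop above: for each (y, x) filter position the slicing windows are advanced
// along C by CPerBlock per iteration, then moved back by a full C in the negative direction
// (using the bidirectional MoveSlicingWindowOnSourceTensor) before the next filter position.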
// copy output: register to global memory
{
......
@@ -59,7 +59,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
constexpr auto True = integral_constant<bool, true>{};
constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{};
constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
@@ -102,9 +102,9 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
// input tensor
// tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Ho>{})
.Slice(I3, Number<Wo>{})
.Fold(I0, Number<N1>{}, Number<N2>{})
.Extract(Sequence<0, 1, 2, 4, 5>{});
@@ -115,12 +115,23 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc),
Sequence<0, 1, 2>{},
Sequence<4>{},
Sequence<3, 6, 7>{},
Sequence<5>{});
#if 0
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
"in_n0_n1_n2_h_w_global_desc: ");
print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
"in_e_n1_b_n2_global_merged_desc: ");
}
#endif
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment
constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
@@ -243,6 +254,31 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// do work
for(index_t e = 0; e < E; e += EPerBlock)
{
#if 0
if(e == 1 * EPerBlock && get_block_1d_id() == 0)
{
printf("id %5u %5u: "
"mThreadSrcOriginalMultiId %u %u %u %u %u %u %u %u, "
"mThreadSrcPartialOffsets %u %u %u %u, "
"mThreadSrcOffset %u, mThreadDstOffset %u \n",
get_block_1d_id(),
get_thread_local_1d_id(),
blockwise_in_copy.mThreadSrcOriginalMultiId[0],
blockwise_in_copy.mThreadSrcOriginalMultiId[1],
blockwise_in_copy.mThreadSrcOriginalMultiId[2],
blockwise_in_copy.mThreadSrcOriginalMultiId[3],
blockwise_in_copy.mThreadSrcOriginalMultiId[4],
blockwise_in_copy.mThreadSrcOriginalMultiId[5],
blockwise_in_copy.mThreadSrcOriginalMultiId[6],
blockwise_in_copy.mThreadSrcOriginalMultiId[7],
blockwise_in_copy.mThreadSrcPartialOffsets[0],
blockwise_in_copy.mThreadSrcPartialOffsets[1],
blockwise_in_copy.mThreadSrcPartialOffsets[2],
blockwise_in_copy.mThreadSrcPartialOffsets[3],
blockwise_in_copy.mThreadSrcOffset,
blockwise_in_copy.mThreadDstOffset);
}
#endif
// marching slicing window
blockwise_in_copy.Run(p_in_global, p_in_block);
blockwise_wei_copy.Run(p_wei_global, p_wei_block);
@@ -253,8 +289,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
__syncthreads();
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
}
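// The main loop above marches the input and weight slicing windows along the merged E
// dimension (which appears to combine C, Y and X) by EPerBlock per iteration, in the
// positive direction only.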
// copy output: register to global memory
......