"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "32a4328b16b85aae26d08d81157ab74b58edcdb1"
Commit a7c587ee authored by Chao Liu

overhauling DynamicTensorDescriptor and dynamic multi-index transform in preparation for partially compile-time and partially run-time tensor descriptor
parent cb7ed650
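
For context, "partially compile-time and partially run-time" means a descriptor's lengths may mix Number<> (compile-time) and index_t (run-time) entries in one Tuple. A minimal standard-C++17 sketch of the idea, with std::integral_constant and std::tuple standing in for ck's Number<> and Tuple (an assumption about their roles, not the library's code):

#include <cstdio>
#include <tuple>
#include <type_traits>

template <int V>
using Num = std::integral_constant<int, V>; // stand-in for ck's Number<>

// Multiply all lengths. Each factor may be a plain int (run-time) or a
// Num<> (compile-time); Num<> converts to int, so a mixed product is a
// run-time int. ck's Number<> instead overloads arithmetic so that an
// all-constant product stays a compile-time constant.
template <typename... Ls>
constexpr int element_size(const std::tuple<Ls...>& lengths)
{
    return std::apply([](auto... l) { return (1 * ... * int{l}); }, lengths);
}

int main()
{
    int n = 4; // run-time batch size

    // N is run-time, the other lengths are compile-time: a "partially dynamic" shape
    auto lengths = std::make_tuple(n, Num<8>{}, Num<16>{});

    std::printf("element size = %d\n", element_size(lengths)); // 4 * 8 * 16 = 512
}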
@@ -80,7 +80,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
         // weight tensor
         const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
-            make_dynamic_naive_tensor_descriptor_packed<2>(make_multi_index(K, C * Y * X)),
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(K, C * Y * X)),
             make_tuple(DynamicPassThrough{K}, DynamicPassThrough{C * Y * X}),
             make_tuple(Sequence<0>{}, Sequence<1>{}),
             make_tuple(Sequence<1>{}, Sequence<0>{}));
@@ -118,7 +118,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
         // output tensor
         const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
-            make_dynamic_naive_tensor_descriptor_packed<3>(make_multi_index(N, K, Ho * Wo)),
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, K, Ho * Wo)),
             make_tuple(DynamicPassThrough{K}, DynamicMerge<2>{make_multi_index(N, Ho * Wo)}),
             make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}));
@@ -781,7 +781,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
             make_tuple(Sequence<1>{}, Sequence<0>{}));
 #else
         const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
-            make_dynamic_naive_tensor_descriptor_packed<2>(make_multi_index(K, C * Y * X)),
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(K, C * Y * X)),
             make_tuple(DynamicPassThrough{K}, DynamicPassThrough{C * Y * X}),
             make_tuple(Sequence<0>{}, Sequence<1>{}),
             make_tuple(Sequence<1>{}, Sequence<0>{}));
@@ -822,7 +822,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
             make_tuple(Sequence<0>{}, Sequence<1>{}));
 #else
         const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
-            make_dynamic_naive_tensor_descriptor_packed<3>(make_multi_index(N, K, Ho * Wo)),
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, K, Ho * Wo)),
             make_tuple(DynamicPassThrough{K}, DynamicMerge<2>{make_multi_index(N, Ho * Wo)}),
             make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}));
@@ -1461,7 +1461,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
         // weight tensor
         const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
-            make_dynamic_naive_tensor_descriptor_packed<2>(make_multi_index(K, C)),
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(K, C)),
             make_tuple(DynamicPassThrough{K}, DynamicPassThrough{C}),
             make_tuple(Sequence<0>{}, Sequence<1>{}),
             make_tuple(Sequence<1>{}, Sequence<0>{}));
@@ -1475,7 +1475,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
         // output tensor
         const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
-            make_dynamic_naive_tensor_descriptor_packed<3>(make_multi_index(N, K, Ho * Wo)),
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, K, Ho * Wo)),
             make_tuple(DynamicPassThrough{K}, DynamicMerge<2>{make_multi_index(N, Ho * Wo)}),
             make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}));
...
@@ -347,8 +347,8 @@ struct DynamicEmbed
     __host__ __device__ constexpr DynamicEmbed() = default;

-    __host__ __device__ constexpr DynamicEmbed(const UpperIndex& up_lengths,
-                                               const UpperIndex& coefficients)
+    __host__ __device__ constexpr DynamicEmbed(const UpLengths& up_lengths,
+                                               const Coefficients& coefficients)
         : up_lengths_{up_lengths}, coefficients_{coefficients}
     {
     }
@@ -415,9 +415,9 @@ struct DynamicEmbed
         printf("{");
         printf("DynamicEmbed, ");
         printf("up_lengths_ ");
-        // print_multi_index(up_lengths_);
+        print_multi_index(up_lengths_);
         printf("coefficients_ ");
-        // print_multi_index(coefficients_);
+        print_multi_index(coefficients_);
         printf("}");
     }
 };
@@ -1017,8 +1017,8 @@ struct DynamicUnMerge
     {
         printf("{");
         printf("DynamicUnMerge, ");
-        // print_multi_index(up_lengths_);
-        // print_multi_index(up_lengths_scan_);
+        print_multi_index(up_lengths_);
+        print_multi_index(up_lengths_scan_);
         printf("}");
     }
 };
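
For reference, what these two transforms compute, as used by the naive-descriptor makers later in this commit. This is an illustrative standard-C++ sketch of the index arithmetic, not the library's implementation: Embed maps a multi-index to a linear offset through per-dimension coefficients, and UnMerge relates a packed multi-index to a single linear index.

#include <array>
#include <cstdio>

// Embed: multi-index -> offset via coefficients (e.g. strides)
template <std::size_t N>
constexpr int embed_offset(const std::array<int, N>& idx, const std::array<int, N>& coeff)
{
    int off = 0;
    for(std::size_t i = 0; i < N; ++i)
        off += idx[i] * coeff[i];
    return off;
}

// UnMerge: split one linear index into the multi-index of a packed shape
template <std::size_t N>
constexpr std::array<int, N> unmerge(int linear, const std::array<int, N>& lengths)
{
    std::array<int, N> idx{};
    for(std::size_t i = N; i-- > 0;)
    {
        idx[i] = linear % lengths[i];
        linear /= lengths[i];
    }
    return idx;
}

int main()
{
    const std::array<int, 3> lengths{2, 3, 4};
    const std::array<int, 3> strides{12, 4, 1}; // packed strides of a 2x3x4 tensor
    const std::array<int, 3> idx{1, 2, 3};

    std::printf("offset = %d\n", embed_offset(idx, strides)); // 1*12 + 2*4 + 3*1 = 23

    const auto back = unmerge(23, lengths);
    std::printf("unmerge(23) = {%d %d %d}\n", back[0], back[1], back[2]); // {1 2 3}
}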
...
@@ -39,8 +39,7 @@ template <typename Transforms,
           typename LowerDimensionIdss,
           typename UpperDimensionIdss,
           typename VisibleDimensionIds,
-          typename ElementSize      = index_t,
-          typename ElementSpaceSize = index_t>
+          typename ElementSpaceSize>
 struct DynamicTensorDescriptor
 {
     // TODO make these private
@@ -70,6 +69,57 @@ struct DynamicTensorDescriptor
         return unique_sort_all_dim_ids::Size();
     }

+    __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms)
+    {
+        const auto lengths = generate_tuple(
+            [&](auto idim_visible) {
+                constexpr auto tmp = GetTransformAndItsUpperDimension(idim_visible);
+
+                constexpr index_t itran   = tmp[Number<0>{}];
+                constexpr index_t idim_up = tmp[Number<1>{}];
+                constexpr bool found      = tmp[Number<2>{}];
+
+                static_assert(found == true,
+                              "wrong! not found matching transformation and upper-dimension");
+
+                const auto length =
+                    transforms[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
+
+                return length;
+            },
+            Number<ndim_visible_>{});
+
+        // TODO: make container_reduce support tuple of Number and index_t
+        return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+    }
+
+    template <index_t IDim>
+    __host__ __device__ static constexpr auto GetTransformAndItsUpperDimension(Number<IDim>)
+    {
+        constexpr auto idim_visible = Number<IDim>{};
+
+        constexpr index_t idim_hidden = VisibleDimensionIds::At(idim_visible);
+
+        index_t itran_found   = 0;
+        index_t idim_up_found = 0;
+        bool found            = false;
+
+        static_for<0, ntransform_, 1>{}([&](auto itran) {
+            constexpr auto up_dim_ids = UpperDimensionIdss{}[itran];
+
+            static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) {
+                if constexpr(up_dim_ids[idim_up] == idim_hidden)
+                {
+                    itran_found   = itran;
+                    idim_up_found = idim_up;
+                    found         = true;
+                }
+            });
+        });
+
+        return make_tuple(itran_found, idim_up_found, found);
+    }
+
     constexpr static index_t ntransform_   = GetNumOfTransform();
     constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension();
     constexpr static index_t ndim_hidden_  = GetNumOfHiddenDimension();
@@ -78,6 +128,9 @@ struct DynamicTensorDescriptor
     using HiddenIndex = MultiIndex<ndim_hidden_>;
     using Coordinate  = DynamicTensorCoordinate<ndim_hidden_, VisibleDimensionIds>;

+    // may be index_t or Number<>
+    using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
+
     public:
     __host__ __device__ constexpr DynamicTensorDescriptor() = default;
@@ -148,57 +201,6 @@ struct DynamicTensorDescriptor
         return VisibleDimensionIds{};
     }

-    __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms)
-    {
-        const auto lengths = generate_tuple(
-            [&](auto idim_visible) {
-                constexpr auto tmp = GetTransformAndItsUpperDimension(idim_visible);
-
-                constexpr index_t itran   = tmp[Number<0>{}];
-                constexpr index_t idim_up = tmp[Number<1>{}];
-                constexpr bool found      = tmp[Number<2>{}];
-
-                static_assert(found == true,
-                              "wrong! not found matching transformation and upper-dimension");
-
-                const auto length =
-                    transforms[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
-
-                return length;
-            },
-            Number<ndim_visible_>{});
-
-        // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
-    }
-
-    template <index_t IDim>
-    __host__ __device__ static constexpr auto GetTransformAndItsUpperDimension(Number<IDim>)
-    {
-        constexpr auto idim_visible = Number<IDim>{};
-
-        constexpr index_t idim_hidden = VisibleDimensionIds::At(idim_visible);
-
-        index_t itran_found   = 0;
-        index_t idim_up_found = 0;
-        bool found            = false;
-
-        static_for<0, ntransform_, 1>{}([&](auto itran) {
-            constexpr auto up_dim_ids = UpperDimensionIdss{}[itran];
-
-            static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) {
-                if constexpr(up_dim_ids[idim_up] == idim_hidden)
-                {
-                    itran_found   = itran;
-                    idim_up_found = idim_up;
-                    found         = true;
-                }
-            });
-        });
-
-        return make_tuple(itran_found, idim_up_found, found);
-    }
-
     __host__ __device__ void Print() const
     {
         printf("{");
@@ -367,11 +369,14 @@ transform_dynamic_tensor_descriptor(const OldTensorDescriptor& old_tensor_desc,
     constexpr auto all_up_dim_hidden_idss =
         container_cat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss);

+    const auto element_space_size = old_tensor_desc.GetElementSpaceSize();
+
     return DynamicTensorDescriptor<remove_cv_t<decltype(all_transforms)>,
                                    remove_cv_t<decltype(all_low_dim_hidden_idss)>,
                                    remove_cv_t<decltype(all_up_dim_hidden_idss)>,
-                                   remove_cv_t<decltype(new_visible_dim_hidden_ids)>>{
-        all_transforms, old_tensor_desc.GetElementSpaceSize()};
+                                   remove_cv_t<decltype(new_visible_dim_hidden_ids)>,
+                                   remove_cv_t<decltype(element_space_size)>>{all_transforms,
+                                                                              element_space_size};
 }

 template <typename TensorDesc, typename VisibleIndex>
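
The shape of this change: the element space size is computed first, and its type (which may be index_t or Number<>) is forwarded as the descriptor's new template parameter, so a compile-time size stays compile-time across transformations. A hedged standard-C++ sketch of the pattern (Desc and make_desc are illustrative names, not the library's):

#include <cstdio>
#include <type_traits>

template <typename ElementSpaceSize>
struct Desc
{
    ElementSpaceSize element_space_size_; // plain integer or integral_constant
};

template <typename Size>
constexpr auto make_desc(Size size)
{
    // mirrors remove_cv_t<decltype(element_space_size)> in the diff
    return Desc<std::remove_cv_t<Size>>{size};
}

int main()
{
    auto d1 = make_desc(64);                                // run-time size
    auto d2 = make_desc(std::integral_constant<int, 64>{}); // compile-time size

    // d2's size is carried in its type, so it survives as a constant expression
    static_assert(decltype(d2.element_space_size_)::value == 64, "");

    std::printf("%d %d\n", d1.element_space_size_, int{d2.element_space_size_});
}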
...
@@ -14,59 +14,59 @@ namespace ck {
  * functions on GPU without worrying about scratch memory usage.
  */
-template <index_t N>
+template <typename... Lengths,
+          typename... Strides,
+          typename std::enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
 __host__ __device__ constexpr auto
-make_dynamic_naive_tensor_descriptor(const MultiIndex<N>& lengths, const MultiIndex<N>& strides)
+make_dynamic_naive_tensor_descriptor_v2(const Tuple<Lengths...>& lengths,
+                                        const Tuple<Strides...>& strides)
 {
-    const auto transforms = make_tuple(DynamicEmbed<N>{lengths, strides});
+    constexpr index_t N = sizeof...(Lengths);
+
+    const auto transforms =
+        make_tuple(DynamicEmbed<N, Tuple<Lengths...>, Tuple<Strides...>>{lengths, strides});

     constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});

     constexpr auto up_dim_hidden_idss =
         make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});

     constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};

-    index_t element_space_size = 1;
-
-    static_for<0, N, 1>{}([&](auto i) { element_space_size += (lengths[i] - 1) * strides[i]; });
-
-    return DynamicTensorDescriptor<remove_cv_t<decltype(transforms)>,
-                                   remove_cv_t<decltype(low_dim_hidden_idss)>,
-                                   remove_cv_t<decltype(up_dim_hidden_idss)>,
-                                   remove_cv_t<decltype(visible_dim_hidden_ids)>>{
-        transforms, element_space_size};
-}
-
-template <index_t N>
-__host__ __device__ constexpr auto
-make_dynamic_naive_tensor_descriptor_packed(const MultiIndex<N>& lengths)
-{
-    const auto transforms = make_tuple(DynamicUnMerge<N>{lengths});
-
-    constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
-
-    constexpr auto up_dim_hidden_idss =
-        make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
-
-    constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
-
-    const index_t element_space_size =
-        container_reduce(lengths, math::multiplies<index_t>{}, index_t{1});
+    // recursive function for reduction
+    auto f = [&](auto fs, auto i, auto acc_old) {
+        auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i];
+
+        if constexpr(i.value < N - 1)
+        {
+            return fs(fs, i + Number<1>{}, acc_new);
+        }
+        else
+        {
+            return acc_new;
+        }
+    };
+
+    const auto element_space_size = f(f, Number<0>{}, Number<1>{});

     return DynamicTensorDescriptor<remove_cv_t<decltype(transforms)>,
                                    remove_cv_t<decltype(low_dim_hidden_idss)>,
                                    remove_cv_t<decltype(up_dim_hidden_idss)>,
-                                   remove_cv_t<decltype(visible_dim_hidden_ids)>>{
-        transforms, element_space_size};
+                                   remove_cv_t<decltype(visible_dim_hidden_ids)>,
+                                   remove_cv_t<decltype(element_space_size)>>{transforms,
+                                                                              element_space_size};
 }
-// Is... can be:
+// Lengths... can be:
 // 1) index_t, which is known at run-time
 // 2) Number<>, which is known at compile-time
-template <typename... Is>
+template <typename... Lengths>
 __host__ __device__ constexpr auto
-make_dynamic_naive_tensor_descriptor_packed_v2(const Tuple<Is...>& lengths)
+make_dynamic_naive_tensor_descriptor_packed_v2(const Tuple<Lengths...>& lengths)
 {
-    constexpr index_t N = sizeof...(Is);
-
-    using Lengths = remove_cv_t<remove_reference_t<decltype(lengths)>>;
+    constexpr index_t N = sizeof...(Lengths);

-    const auto transforms = make_tuple(DynamicUnMerge<N, false, Lengths>{lengths});
+    const auto transforms = make_tuple(DynamicUnMerge<N, false, Tuple<Lengths...>>{lengths});

     constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
@@ -75,34 +75,45 @@ make_dynamic_naive_tensor_descriptor_packed_v2(const Tuple<Is...>& lengths)
     constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};

-    const auto element_size       = container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
-    const auto element_space_size = element_size;
+    const auto element_space_size = container_reduce(lengths, math::multiplies_v2{}, Number<1>{});

     return DynamicTensorDescriptor<remove_cv_t<decltype(transforms)>,
                                    remove_cv_t<decltype(low_dim_hidden_idss)>,
                                    remove_cv_t<decltype(up_dim_hidden_idss)>,
                                    remove_cv_t<decltype(visible_dim_hidden_ids)>,
-                                   remove_cv_t<decltype(element_size)>,
                                    remove_cv_t<decltype(element_space_size)>>{transforms,
                                                                               element_space_size};
 }
-template <index_t N>
+template <typename... Lengths, typename Align>
 __host__ __device__ constexpr auto
-make_dynamic_naive_tensor_descriptor_aligned(const MultiIndex<N>& lengths, index_t align)
+make_dynamic_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align align)
 {
-    auto strides = make_zero_multi_index<N>();
-
-    strides(Number<N - 1>{}) = 1;
-    strides(Number<N - 2>{}) = math::lcm(lengths[Number<N - 1>{}], align);
-
-    static_for<N - 3, -1, -1>{}([&](auto i) {
-        constexpr auto i_p1 = i + Number<1>{};
-        strides(i)          = strides(i_p1) * lengths(i_p1);
-    });
-
-    return make_dynamic_naive_tensor_descriptor<N>(lengths, strides);
+    constexpr index_t N = sizeof...(Lengths);
+
+    auto strides = generate_tuple(
+        [&](auto i) {
+            if constexpr(i.value == N - 1)
+            {
+                return Number<1>{};
+            }
+            else if constexpr(i.value == N - 2)
+            {
+                return math::lcm(lengths[Number<N - 1>{}], align);
+            }
+            else
+            {
+                return container_reduce(lengths,
+                                        math::multiplies_v2{},
+                                        math::lcm(lengths[Number<N - 1>{}], align),
+                                        i,
+                                        Number<N - 2>{},
+                                        Number<1>{});
+            }
+        },
+        Number<N>{});

+    return make_dynamic_naive_tensor_descriptor_v2(lengths, strides);
 }

 } // namespace ck
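
Numerically, the aligned layout this maker targets (as read from the removed loop above; values below are invented for illustration, and the sketch is not the library's code): the innermost stride is 1, the next-innermost is lcm(innermost length, align), and each outer stride multiplies in the lengths between.

#include <cstdio>
#include <numeric> // std::lcm

int main()
{
    const int lengths[3] = {4, 8, 6};
    const int align      = 4;

    int strides[3];
    strides[2] = 1;
    strides[1] = std::lcm(lengths[2], align); // lcm(6, 4) = 12: rows padded from 6 to 12
    strides[0] = strides[1] * lengths[1];     // 12 * 8 = 96

    // element space of the Embed-based descriptor: 1 + sum((length-1) * stride)
    int space = 1;
    for(int i = 0; i < 3; ++i)
        space += (lengths[i] - 1) * strides[i];

    std::printf("strides = {%d, %d, %d}, element space = %d\n",
                strides[0], strides[1], strides[2], space); // {96, 12, 1}, 378
}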
...
@@ -99,7 +99,7 @@ __host__ __device__ void print_multi_index(const Tuple<Xs...>& x)
     printf("{");
     printf("MultiIndex, ");
     printf("size %d,", index_t{sizeof...(Xs)});
-    static_for<0, sizeof...(Xs), 1>{}([&](auto i) { printf("%d ", x.At(i)); });
+    static_for<0, sizeof...(Xs), 1>{}([&](auto i) { printf("%d ", index_t{x.At(i)}); });
     printf("}");
 }
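
Why the index_t{...} wrapper matters: with mixed tuples, x.At(i) may now return a Number<>-like class type, and passing a class through printf's varargs is undefined behavior. A standard-C++ illustration, with std::integral_constant standing in for Number<> (an assumption):

#include <cstdio>
#include <type_traits>

int main()
{
    std::integral_constant<int, 7> seven;

    // std::printf("%d\n", seven);   // class type through varargs: undefined behavior
    std::printf("%d\n", int{seven}); // explicit conversion yields a plain int
}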
...
@@ -95,6 +95,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                            level1_n_id * NPerLevel0Cluster + level0_n_id * NPerThreadSubC};
     }

+#if 0
     __device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(index_t m_in_c,
                                                                       index_t n_in_c)
     {
@@ -114,6 +115,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         return MatrixIndex{m_repeat * MPerLevel1Cluster + m_in_sub_c,
                            n_repeat * NPerLevel1Cluster + n_in_sub_c};
     }
+#endif

     template <typename FloatA, typename FloatB, typename FloatC>
     __device__ void
@@ -336,9 +338,14 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
         constexpr index_t NRepeat = NPerThread / NPerThreadSubC;

-        static_if<MRepeat == 2 && NRepeat == 2>{}(
-            [&](auto) { Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread); })
-            .Else([&](auto) { Run_naive(p_a_block, p_b_block, p_c_thread); });
+        if constexpr(MRepeat == 2 && NRepeat == 2)
+        {
+            Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread);
+        }
+        else
+        {
+            Run_naive(p_a_block, p_b_block, p_c_thread);
+        }
 #else
         Run_naive(p_a_block, p_b_block, p_c_thread);
 #endif
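
This hunk replaces the library's static_if<>{}(...).Else(...) idiom with C++17 if constexpr, which selects between the two Run_* variants at compile time and does not instantiate the untaken branch. A hedged sketch of the dispatch (run and the printouts are illustrative stand-ins for the real member functions):

#include <cstdio>

template <int MRepeat, int NRepeat>
void run()
{
    if constexpr(MRepeat == 2 && NRepeat == 2)
        std::printf("Run_pipelined_2x2\n"); // fast path, compiled only when taken
    else
        std::printf("Run_naive\n");
}

int main()
{
    run<2, 2>(); // Run_pipelined_2x2
    run<4, 2>(); // Run_naive
}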
...
@@ -64,12 +64,12 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned<2>(
+        constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
             make_multi_index(KPerBlock, MPerBlock), max_lds_align);

         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned<2>(
+        constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
             make_multi_index(KPerBlock, NPerBlock), max_lds_align);

         // LDS allocation for A and B: be careful of alignment
@@ -129,12 +129,12 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned<2>(
+        constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
             make_multi_index(KPerBlock, MPerBlock), max_lds_align);

         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned<2>(
+        constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
             make_multi_index(KPerBlock, NPerBlock), max_lds_align);

         // A matrix blockwise copy
...
@@ -962,7 +962,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     private:
     static constexpr auto buffer_desc_ =
-        make_dynamic_naive_tensor_descriptor_packed<nDim>(to_multi_index(SliceLengths{}));
+        make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(SliceLengths{}));

     static constexpr index_t buffer_size_ = buffer_desc_.GetElementSpaceSize();
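
Note that buffer_desc_ is static constexpr: when every slice length is compile-time, the descriptor and its element space size are constant expressions, so the thread-local buffer can be sized at compile time. A minimal standard-C++ sketch of that property (std:: stand-ins for ck's types, an assumption):

#include <array>
#include <cstdio>
#include <type_traits>

template <int V>
using Num = std::integral_constant<int, V>; // stand-in for ck's Number<>

// product of lengths; a constant expression when used as one below
template <typename... Ls>
constexpr int product(Ls... ls)
{
    return (1 * ... * int{ls});
}

int main()
{
    constexpr int buffer_size = product(Num<2>{}, Num<4>{}); // all compile-time
    static_assert(buffer_size == 8, "");

    std::array<float, buffer_size> buffer{}; // sized at compile time
    buffer[0] = 1.0f;

    std::printf("buffer holds %d floats, buffer[0] = %f\n", buffer_size, buffer[0]);
}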
...
@@ -97,20 +97,30 @@ __host__ __device__ constexpr auto container_reorder_given_old2new(Sequence<Is..
     return container_reorder_give_new2old(old_seq, new2old);
 }

-template <typename Container, typename Reduce, typename Init>
-__host__ __device__ constexpr auto container_reduce(const Container& x, Reduce reduce, Init init)
+template <typename Container,
+          typename Reduce,
+          typename Init,
+          index_t IBegin = 0,
+          index_t IEnd   = Container::Size(),
+          index_t IStep  = 1>
+__host__ __device__ constexpr auto container_reduce(const Container& x,
+                                                    Reduce reduce,
+                                                    Init init,
+                                                    Number<IBegin> = Number<0>{},
+                                                    Number<IEnd>   = Number<Container::Size()>{},
+                                                    Number<IStep>  = Number<1>{})
 {
-    constexpr index_t NSize = Container::Size();
+    static_assert((IEnd - IBegin) % IStep == 0, "wrong!");

     // f is recursive function, fs is a dummy of f
     // i is index, y_old is current scan, r_old is current reduction
     auto f = [&](auto fs, auto i, auto r_old) {
         auto r_new = reduce(x[i], r_old);

-        if constexpr(i.value > 0)
+        if constexpr(i.value < IEnd - IStep)
         {
             // recursively call f/fs
-            return fs(fs, i - Number<1>{}, r_new);
+            return fs(fs, i + Number<IStep>{}, r_new);
         }
         else
         {
@@ -119,7 +129,7 @@ __host__ __device__ constexpr auto container_reduce(const Container& x, Reduce r
     };

     // start recursion
-    return f(f, Number<NSize - 1>{}, init);
+    return f(f, Number<IBegin>{}, init);
 }

 template <typename TData, index_t NSize, typename Reduce>
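
The generalized container_reduce now reduces x[IBegin], x[IBegin+IStep], ... up to but not including IEnd, seeded with init, using the self-passing-lambda recursion ("fs is a dummy of f") so the index stays a compile-time value. A standard-C++17 sketch of the same mechanism (names and Num<> are illustrative stand-ins, not ck's code):

#include <cstdio>
#include <type_traits>

template <int I>
using Num = std::integral_constant<int, I>;

template <typename Array, typename Reduce, typename Init, int IBegin, int IEnd, int IStep>
constexpr auto container_reduce_sketch(
    const Array& x, Reduce r, Init init, Num<IBegin>, Num<IEnd>, Num<IStep>)
{
    static_assert((IEnd - IBegin) % IStep == 0, "wrong!");

    // f receives itself as fs so it can recurse; i is a compile-time Num<>,
    // so if constexpr can terminate the recursion
    auto f = [&](auto fs, auto i, auto acc) {
        auto acc_new = r(x[i.value], acc);

        if constexpr(i.value < IEnd - IStep)
            return fs(fs, Num<i.value + IStep>{}, acc_new);
        else
            return acc_new;
    };

    return f(f, Num<IBegin>{}, init);
}

int main()
{
    constexpr int x[] = {2, 3, 4, 5};
    auto mul          = [](int a, int b) { return a * b; };

    // full reduce: 2*3*4*5 = 120; partial reduce over [1, 3): 3*4 = 12
    std::printf("%d %d\n",
                container_reduce_sketch(x, mul, 1, Num<0>{}, Num<4>{}, Num<1>{}),
                container_reduce_sketch(x, mul, 1, Num<1>{}, Num<3>{}, Num<1>{}));
}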
...
@@ -114,8 +114,8 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
 }

 // greatest common divisor, aka highest common factor
-template <typename T>
-__host__ __device__ constexpr T gcd(T x, T y)
+template <typename X, typename Y>
+__host__ __device__ constexpr auto gcd(X x, Y y)
 {
     if(x == y || x == 0)
     {
@@ -135,13 +135,6 @@ __host__ __device__ constexpr T gcd(T x, T y)
     }
 }

-template <index_t X, index_t Y>
-__host__ __device__ constexpr auto gcd(Number<X>, Number<Y>)
-{
-    constexpr auto result = gcd(X, Y);
-    return Number<result>{};
-}
-
 template <typename X, typename... Ys>
 __host__ __device__ constexpr auto gcd(X x, Ys... ys)
 {
@@ -149,8 +142,8 @@ __host__ __device__ constexpr auto gcd(X x, Ys... ys)
 }

 // least common multiple
-template <typename T>
-__host__ __device__ constexpr T lcm(T x, T y)
+template <typename X, typename Y>
+__host__ __device__ constexpr auto lcm(X x, Y y)
 {
     return (x * y) / gcd(x, y);
 }
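
gcd and lcm become heterogeneous templates returning auto because their two arguments may now have different types, e.g. a compile-time Number<> and a run-time index_t, which also makes the separate Number<> overload unnecessary. A standard-C++ illustration of mixed-type arguments; this sketch simplifies with std::common_type (ck instead lets Number<>'s own operators keep all-constant results constant):

#include <cstdio>
#include <type_traits>

template <int V>
using Num = std::integral_constant<int, V>; // stand-in for ck's Number<>

// heterogeneous Euclidean gcd; the result follows the common arithmetic type
template <typename X, typename Y>
constexpr std::common_type_t<X, Y> gcd(X x, Y y)
{
    using R = std::common_type_t<X, Y>;
    return y == 0 ? R(x) : gcd(R(y), R(x % y));
}

template <typename X, typename Y>
constexpr auto lcm(X x, Y y)
{
    return (x * y) / gcd(x, y);
}

int main()
{
    int  w     = 6;        // run-time length
    auto align = Num<4>{}; // compile-time alignment

    // mixed compile-time / run-time arguments now go through one template
    std::printf("gcd = %d, lcm = %d\n", gcd(w, align), lcm(w, align)); // 2, 12
}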
...
@@ -43,11 +43,11 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     // assume packed tensor
     const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(InDesc::GetLengths()));
+        make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(InDesc::GetLengths()));
     const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(WeiDesc::GetLengths()));
+        make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(WeiDesc::GetLengths()));
     const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(OutDesc::GetLengths()));
+        make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(OutDesc::GetLengths()));

     const auto conv_strides   = to_multi_index(ConvStrides{});
     const auto conv_dilations = to_multi_index(ConvDilations{});
...