Commit 39821a90 authored by Chao Liu

refactor, clean up

parent 01e94729
@@ -232,13 +232,13 @@ struct DummyDynamicTransform_1
     auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);

-    const auto in_gemmk_gemmn_coord_step =
-        make_dynamic_tensor_coordinate_step(in_gemmk_gemmn_global_desc, make_multi_index(1, 0));
+    const auto in_gemmk_gemmn_coord_iterator = make_dynamic_tensor_coordinate_iterator(
+        in_gemmk_gemmn_global_desc, make_multi_index(1, 0));

     for(index_t iter = 0; iter < niter; ++iter)
     {
         move_dynamic_tensor_coordinate(
-            in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
+            in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_iterator);

         // write
         float value = 1;
@@ -352,12 +352,12 @@ struct DummyDynamicTransform_1
     auto in_coord = make_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, idx);

-    const auto in_coord_step = make_dynamic_tensor_coordinate_step(
-        in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));
+    const auto in_coord_iterator = make_dynamic_tensor_coordinate_iterator(
+        in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));

     for(index_t iter = 0; iter < niter; ++iter)
     {
-        move_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, in_coord, in_coord_step);
+        move_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, in_coord, in_coord_iterator);

         // write
         float value = 1;
@@ -430,21 +430,24 @@ struct DummyDynamicTransform_fwd_v4r4
     auto in_gemmk_gemmn_gemmkpack_coord =
         make_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc, idx);

-    const auto in_gemmk_gemmn_gemmkpack_coord_step_0_0_1 = make_dynamic_tensor_coordinate_step(
-        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 1));
+    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1 =
+        make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                make_multi_index(0, 0, 1));

-    const auto in_gemmk_gemmn_gemmkpack_coord_step_0_1_0 = make_dynamic_tensor_coordinate_step(
-        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 1, 0));
+    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0 =
+        make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                make_multi_index(0, 1, 0));

-    const auto in_gemmk_gemmn_gemmkpack_coord_step_1_0_0 = make_dynamic_tensor_coordinate_step(
-        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(1, 0, 0));
+    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0 =
+        make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                make_multi_index(1, 0, 0));

     // move (0, 0, 1)
     for(index_t iter = 0; iter < niter; ++iter)
     {
         move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                        in_gemmk_gemmn_gemmkpack_coord,
-                                       in_gemmk_gemmn_gemmkpack_coord_step_0_0_1);
+                                       in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1);

         // write
         float value = 1;
@@ -476,7 +479,7 @@ struct DummyDynamicTransform_fwd_v4r4
     {
         move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                        in_gemmk_gemmn_gemmkpack_coord,
-                                       in_gemmk_gemmn_gemmkpack_coord_step_0_1_0);
+                                       in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0);

         // write
         float value = 1;
@@ -508,7 +511,7 @@ struct DummyDynamicTransform_fwd_v4r4
     {
         move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                        in_gemmk_gemmn_gemmkpack_coord,
-                                       in_gemmk_gemmn_gemmkpack_coord_step_1_0_0);
+                                       in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0);

         // write
         float value = 1;
...
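The benchmark kernels above all follow the same pattern: build a coordinate once, build an iterator for a fixed index step once, then move the coordinate repeatedly in the hot loop. A minimal sketch of that pattern, assuming the ck dynamic-tensor headers and a 2-D descriptor `desc` (the names here are illustrative, not from this commit):

    // make the coordinate and the precomputed iterator once, outside the loop
    auto coord = make_dynamic_tensor_coordinate(desc, make_multi_index(0, 0));

    const auto coord_iterator =
        make_dynamic_tensor_coordinate_iterator(desc, make_multi_index(1, 0));

    for(index_t i = 0; i < niter; ++i)
    {
        // cheap per-step update: only the transforms flagged by the iterator are run
        move_dynamic_tensor_coordinate(desc, coord, coord_iterator);
    }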
@@ -9,12 +9,8 @@ namespace ck {
 template <index_t NDimHidden, typename VisibleDimensionIds>
 struct DynamicTensorCoordinate;

-#if 0 // hack
-template <index_t NTransform, index_t NDimVisible>
-#else
-template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
-#endif
-struct DynamicTensorCoordinateStep;
+template <index_t NTransform, index_t NDimVisible, typename UpdateLowerIndexHack>
+struct DynamicTensorCoordinateIterator;

 // Transforms: Tuple<transforms...>
 // LowerDimensionIdss : Tuple<Sequence<...>, ...>
@@ -193,18 +189,14 @@ struct DynamicTensorCoordinate
     HiddenIndex idx_hidden_;
 };

-#if 0 // hack
-template <index_t NTransform, index_t NDimVisible>
-#else
-template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
-#endif
-struct DynamicTensorCoordinateStep
+template <index_t NTransform, index_t NDimVisible, typename UpdateLowerIndexHack>
+struct DynamicTensorCoordinateIterator
 {
     // TODO make these private
     using VisibleIndex = MultiIndex<NDimVisible>;

     public:
-    __host__ __device__ explicit constexpr DynamicTensorCoordinateStep(
+    __host__ __device__ explicit constexpr DynamicTensorCoordinateIterator(
         const VisibleIndex& idx_diff_visible, const MultiIndex<NTransform>& do_transforms)
         : idx_diff_visible_{idx_diff_visible}, do_transforms_{do_transforms}
     {
@@ -221,10 +213,8 @@ struct DynamicTensorCoordinateStep
     const VisibleIndex idx_diff_visible_;
     const MultiIndex<NTransform> do_transforms_;

-#if 1 // hack
-    // HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
-    static constexpr HackCalculateLowerIndexDiff hack_calculate_lower_index_diff_;
-#endif
+    // HACK: control UpdateLowerIndex()
+    static constexpr UpdateLowerIndexHack update_lower_index_hack_;
 };

 // TODO: How to fix this? It uses a struct instead of lambda because lambda
@@ -350,9 +340,11 @@ __host__ __device__ constexpr auto make_dynamic_tensor_coordinate(const TensorDe
     return DynamicTensorCoordinate<ndim_hidden, decltype(visible_dim_ids)>{idx_hidden};
 }

-template <typename TensorDesc, typename VisibleIndex>
-__host__ __device__ constexpr auto
-make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_diff_visible)
+// UpdateLowerIndexHack: Sequence<...>
+// HACK: control UpdateLowerIndex
+template <typename TensorDesc, typename VisibleIndex, typename UpdateLowerIndexHack>
+__host__ __device__ constexpr auto make_dynamic_tensor_coordinate_iterator(
+    const TensorDesc&, const VisibleIndex& idx_diff_visible, UpdateLowerIndexHack)
 {
     static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
                   "wrong! # of dimension inconsistent");
@@ -362,6 +354,8 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
     constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
     constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();

+    static_assert(UpdateLowerIndexHack::Size() == ntransform, "wrong!");
+
     // use index_t for boolean type
     auto do_transforms    = make_zero_multi_index<ntransform>();
     auto is_non_zero_diff = make_zero_multi_index<ndim_hidden>();
@@ -397,78 +391,23 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
         set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
     });

-#if 0 // hack
-    return DynamicTensorCoordinateStep<ntransform, ndim_visible>{idx_diff_visible, do_transforms};
-#else
-    return DynamicTensorCoordinateStep<ntransform,
-                                       ndim_visible,
-                                       typename uniform_sequence_gen<ntransform, 0>::type>{
+    return DynamicTensorCoordinateIterator<ntransform, ndim_visible, UpdateLowerIndexHack>{
         idx_diff_visible, do_transforms};
-#endif
 }

-#if 0 // hack
 template <typename TensorDesc, typename VisibleIndex>
-#else
-// HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
-template <typename TensorDesc, typename VisibleIndex, typename HackCalculateLowerIndexDiff>
-#endif
-__host__ __device__ constexpr auto make_dynamic_tensor_coordinate_step_hack(
-    const TensorDesc&, const VisibleIndex& idx_diff_visible, HackCalculateLowerIndexDiff)
+__host__ __device__ constexpr auto
+make_dynamic_tensor_coordinate_iterator(const TensorDesc&, const VisibleIndex& idx_diff_visible)
 {
-    static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
-                  "wrong! # of dimension inconsistent");
-
-    constexpr index_t ntransform   = TensorDesc::GetNumOfTransform();
-    constexpr index_t ndim_hidden  = TensorDesc::GetNumOfHiddenDimension();
-    constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
-    constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();
-
-    static_assert(HackCalculateLowerIndexDiff::Size() == ntransform, "wrong!");
-
-    // use index_t for boolean type
-    auto do_transforms    = make_zero_multi_index<ntransform>();
-    auto is_non_zero_diff = make_zero_multi_index<ndim_hidden>();
-
-    // decide do_transform by checkout non-zero index diff components
-    MultiIndex<VisibleIndex::Size()> non_zero_diff_pick_visible;
-
-    static_for<0, ndim_visible, 1>{}(
-        [&](auto i) { non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0); });
-
-    set_container_subset(is_non_zero_diff, visible_dim_ids, non_zero_diff_pick_visible);
-
-    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
-        constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
-        constexpr auto dims_up  = TensorDesc::GetUpperDimensionIdss().At(itran);
-
-        const auto non_zero_diff_pick_up = get_container_subset(is_non_zero_diff, dims_up);
-
-        MultiIndex<dims_low.Size()> non_zero_diff_pick_low;
-
-        // if any of upper index diff components is non-zero, then
-        // 1) Need to do this transform
-        // 2) all components of lower index diff will assume to be non-zero and need to be
-        //    computed
-        const bool idx_diff_up_has_non_zero = container_reduce(
-            non_zero_diff_pick_up, [](auto a, auto b) constexpr { return a or b; }, false);
-
-        do_transforms(itran) = idx_diff_up_has_non_zero;
-
-        static_for<0, dims_low.Size(), 1>{}(
-            [&](auto i) { non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero; });
-
-        set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
-    });
-
-    return DynamicTensorCoordinateStep<ntransform, ndim_visible, HackCalculateLowerIndexDiff>{
-        idx_diff_visible, do_transforms};
+    constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
+
+    return make_dynamic_tensor_coordinate_iterator(
+        TensorDesc{}, idx_diff_visible, typename uniform_sequence_gen<ntransform, 0>::type{});
 }

-template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
-__host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDesc& tensor_desc,
-                                                                  TensorCoord& coord,
-                                                                  const TensorCoordStep& coord_step)
+template <typename TensorDesc, typename TensorCoord, typename TensorCoordIterator>
+__host__ __device__ constexpr void move_dynamic_tensor_coordinate(
+    const TensorDesc& tensor_desc, TensorCoord& coord, const TensorCoordIterator& coord_iterator)
 {
     constexpr index_t ndim_hidden  = TensorDesc::GetNumOfHiddenDimension();
     constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
@@ -480,8 +419,9 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
     auto idx_diff_hidden = make_zero_multi_index<ndim_hidden>();

     // initialize visible index diff
-    set_container_subset(
-        idx_diff_hidden, TensorDesc::GetVisibleDimensionIds(), coord_step.GetVisibleIndexDiff());
+    set_container_subset(idx_diff_hidden,
+                         TensorDesc::GetVisibleDimensionIds(),
+                         coord_iterator.GetVisibleIndexDiff());

     // this is what needs to be updated
     auto& idx_hidden = coord.GetHiddenIndex();
@@ -490,13 +430,13 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
     auto idx_hidden_pick_visible =
         get_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds());

-    idx_hidden_pick_visible += coord_step.GetIndexDiff();
+    idx_hidden_pick_visible += coord_iterator.GetIndexDiff();

     set_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds(), idx_hidden_pick_visible);

     // update rest of hidden index
     static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
-        if(coord_step.do_transforms_[itran])
+        if(coord_iterator.do_transforms_[itran])
         {
             const auto& tran        = tensor_desc.GetTransforms().At(itran);
             constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
@@ -509,9 +449,7 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
             MultiIndex<dims_low.Size()> idx_diff_low;

             // HACK: control UpdateLowerIndex for DynamicMerge using hack
-            // TODO remove hack
-            constexpr index_t Hack =
-                decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
+            constexpr index_t Hack = decltype(coord_iterator.update_lower_index_hack_)::At(itran);

             tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
@@ -579,7 +517,7 @@ using DynamicTensorCoordinate_t = decltype(make_dynamic_tensor_coordinate(
     TensorDesc{}, MultiIndex<remove_cv_t<remove_reference_t<TensorDesc>>::GetNumOfDimension()>{}));

 template <typename TensorDesc>
-using DynamicTensorCoordinateStep_t = decltype(make_dynamic_tensor_coordinate_step(
+using DynamicTensorCoordinateIterator_t = decltype(make_dynamic_tensor_coordinate_iterator(
     TensorDesc{}, MultiIndex<remove_cv_t<remove_reference_t<TensorDesc>>::GetNumOfDimension()>{}));

 } // namespace ck
...
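The file above now exposes two overloads of make_dynamic_tensor_coordinate_iterator: one takes an explicit UpdateLowerIndexHack Sequence with one entry per transform (enforced by the static_assert), and the other delegates with a zero-filled sequence, i.e. no hack. A minimal sketch of the two call forms, assuming a descriptor `desc` with three transforms (names are illustrative):

    // default overload: hack id 0 for every transform, via uniform_sequence_gen
    const auto it_plain = make_dynamic_tensor_coordinate_iterator(desc, make_multi_index(0, 1));

    // explicit overload: one hack id per transform, e.g. to steer DynamicMerge's UpdateLowerIndex()
    const auto it_hacked =
        make_dynamic_tensor_coordinate_iterator(desc, make_multi_index(0, 1), Sequence<0, 0, 1>{});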
@@ -87,21 +87,15 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
         return thread_cluster_id * ThreadSliceLengths{};
     }

-    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
+    template <typename SrcIteratorHacks>
+    __device__ void RunRead(const SrcDesc& src_desc,
+                            const SrcData* p_src,
+                            const SrcIteratorHacks& src_iterator_hacks)
     {
         if(BlockSize == thread_cluster_desc_.GetElementSize() or
            get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.RunRead(src_desc, p_src);
-        }
-    }
-
-    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
-    {
-        if(BlockSize == thread_cluster_desc_.GetElementSize() or
-           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
-        {
-            threadwise_transfer_.RunRead_hack(src_desc, p_src);
+            threadwise_transfer_.RunRead(src_desc, p_src, src_iterator_hacks);
         }
     }
@@ -123,12 +117,18 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
         }
     }

-    __device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc, const Index& step)
+    // SrcMoveSliceWindowIteratorHack controls index calculation when moving the slice window
+    template <typename SrcMoveSliceWindowIteratorHack>
+    __device__ void
+    MoveSrcSliceWindow(const SrcDesc& src_desc,
+                       const Index& step,
+                       const SrcMoveSliceWindowIteratorHack& src_move_slice_window_iterator_hack)
     {
         if(BlockSize == thread_cluster_desc_.GetElementSize() or
            get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.MoveSrcSliceWindow_hack(src_desc, step);
+            threadwise_transfer_.MoveSrcSliceWindow(
+                src_desc, step, src_move_slice_window_iterator_hack);
         }
     }
...
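With the _hack entry points folded away, a caller selects the behavior through the extra argument instead of through a differently named method. An illustrative call site, mirroring the gridwise GEMM below (the hack values shown are the 1x1-case ones from this commit and serve only as an example):

    constexpr auto src_iterator_hacks =
        make_tuple(make_tuple(Sequence<0, 1, 0>{}, Sequence<0, 0, 1>{}),  // forward iterators
                   make_tuple(Sequence<0, 2, 0>{}, Sequence<0, 0, 2>{})); // backward iterators

    blockwise_copy.RunRead(src_desc, p_src, src_iterator_hacks);
    blockwise_copy.RunWrite(dst_desc, p_dst);
    blockwise_copy.MoveSrcSliceWindow(src_desc, step, Sequence<0, 1, 2>{});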
@@ -259,10 +259,47 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);

+        // hack to control index calculation when iterating over a_k_m_global tensor
+        constexpr auto a_k_m_global_iterator_hacks =
+            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
+                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
+
+        constexpr auto a_k_m_global_reset_iterator_hacks =
+            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
+                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
+
+        // hack to control index calculation when iterating over b_k_n_global tensor
+#if 0
+        // for padded input
+        constexpr auto b_k_n_global_iterator_hacks =
+            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},
+                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{}),
+                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{},
+                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{}));
+
+        constexpr auto b_k_n_global_move_slice_window_iterator_hack =
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{};
+#elif 0
+        // for non-padded input
+        constexpr auto b_k_n_global_iterator_hacks = make_tuple(
+            make_tuple(Sequence<0, 0, 0, 0, 0, 1, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 1>{}),
+            make_tuple(Sequence<0, 0, 0, 0, 0, 2, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 2>{}));
+
+        constexpr auto b_k_n_global_move_slice_window_iterator_hack =
+            Sequence<0, 0, 0, 0, 0, 1, 2>{};
+#elif 1
+        // for 1x1 case
+        constexpr auto b_k_n_global_iterator_hacks =
+            make_tuple(make_tuple(Sequence<0, 1, 0>{}, Sequence<0, 0, 1>{}),
+                       make_tuple(Sequence<0, 2, 0>{}, Sequence<0, 0, 2>{}));
+
+        constexpr auto b_k_n_global_move_slice_window_iterator_hack = Sequence<0, 1, 2>{};
+#endif
+
         // LDS double buffer: preload data into LDS
         {
-            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-            b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);
+            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global, a_k_m_global_iterator_hacks);
+            b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global, b_k_n_global_iterator_hacks);

             a_blockwise_copy.RunWrite(a_k_m_block_desc, p_a_block_double);
             b_blockwise_copy.RunWrite(b_k_n_block_desc, p_b_block_double);
@@ -284,14 +321,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
             {
                 // even iteration
                 a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc,
-                                                         b_block_slice_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc,
+                                                    b_block_slice_copy_step,
+                                                    b_k_n_global_move_slice_window_iterator_hack);

                 __syncthreads();

                 // LDS double buffer: load next data from device mem
-                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);
+                a_blockwise_copy.RunRead(
+                    a_k_m_global_desc, p_a_global, a_k_m_global_iterator_hacks);
+                b_blockwise_copy.RunRead(
+                    b_k_n_global_desc, p_b_global, b_k_n_global_iterator_hacks);

                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_a_block_even, p_b_block_even, p_c_thread);
@@ -302,14 +342,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                 // odd iteration
                 a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc,
-                                                         b_block_slice_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc,
+                                                    b_block_slice_copy_step,
+                                                    b_k_n_global_move_slice_window_iterator_hack);

                 __syncthreads();

                 // LDS double buffer: load next data from device mem
-                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);
+                a_blockwise_copy.RunRead(
+                    a_k_m_global_desc, p_a_global, a_k_m_global_iterator_hacks);
+                b_blockwise_copy.RunRead(
+                    b_k_n_global_desc, p_b_global, b_k_n_global_iterator_hacks);

                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_a_block_odd, p_b_block_odd, p_c_thread);
@@ -326,13 +369,15 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
             if constexpr(HasDoubleTailKBlockLoop) // if there are 2 iterations left
             {
                 a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc,
+                                                    b_block_slice_copy_step,
+                                                    b_k_n_global_move_slice_window_iterator_hack);

                 __syncthreads();

                 // LDS double buffer: load last data from device mem
-                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);
+                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global, a_k_m_global_iterator_hacks);
+                b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global, b_k_n_global_iterator_hacks);

                 // LDS double buffer: GEMM on 2nd-last data
                 blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
@@ -383,6 +428,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
             const index_t n_thread_data_on_global =
                 n_block_data_on_global + c_thread_mtx_on_block.col;

+            // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
+            // hack for NKHW format
+            constexpr auto c_m0_m1_n0_n1_global_tensor_iterator_hacks =
+                make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                                      Sequence<0, 0, 0, 0, 0>{},
+                                      Sequence<0, 0, 1, 0, 0>{},
+                                      Sequence<0, 0, 1, 0, 0>{}),
+                           make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                                      Sequence<0, 0, 0, 0, 0>{},
+                                      Sequence<0, 0, 2, 0, 0>{},
+                                      Sequence<0, 0, 2, 0, 0>{}));
+
             ThreadwiseDynamicTensorSliceTransfer_v1r3<
                 AccFloat,
                 Float,
@@ -402,7 +459,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                     m_thread_data_on_global % M1,
                     n_thread_data_on_global / N1,
                     n_thread_data_on_global % N1))
-                .Run_hack(p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
+                .Run(p_c_thread,
+                     c_m0_m1_n0_n1_global_desc,
+                     p_c_global,
+                     c_m0_m1_n0_n1_global_tensor_iterator_hacks);
         }
     }
@@ -435,5 +495,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
             integral_constant<bool, HasDoubleTailKBlockLoop>{});
     }
 };
+
 } // namespace ck
 #endif
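The iterator-hack tuples built above follow one layout convention, inferred from how ThreadwiseDynamicTensorSliceTransfer_v1r3::Run() indexes them below: element [0][d] carries the hack Sequence for the forward iterator of visible dimension d, and element [1][d] the one for the backward iterator. A sketch of a matching caller (the variable names are hypothetical):

    constexpr auto dst_iterator_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},   // +dim0
                              Sequence<0, 0, 0, 0, 0>{},   // +dim1
                              Sequence<0, 0, 1, 0, 0>{},   // +dim2
                              Sequence<0, 0, 1, 0, 0>{}),  // +dim3
                   make_tuple(Sequence<0, 0, 0, 0, 0>{},   // -dim0
                              Sequence<0, 0, 0, 0, 0>{},   // -dim1
                              Sequence<0, 0, 2, 0, 0>{},   // -dim2
                              Sequence<0, 0, 2, 0, 0>{})); // -dim3

    threadwise_transfer.Run(p_src, dst_desc, p_dst, dst_iterator_hacks);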
@@ -31,7 +31,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
     using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));

-    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
+    using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{}));

     __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3(
         const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
@@ -49,8 +49,17 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         dst_slice_origin_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
     }

-    __device__ void Run_hack(const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
+    template <typename DstIteratorHacks>
+    __device__ void Run(const SrcData* p_src,
+                        const DstDesc& dst_desc,
+                        DstData* p_dst,
+                        const DstIteratorHacks& dst_iterator_hacks)
     {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
         // hardcoded for 4D
         // TODO implement N-D
         static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4,
@@ -60,14 +69,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             Index dst_scalar_per_access;

             static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    dst_scalar_per_access(i) = DstScalarPerVector;
-                }
-                else
-                {
-                    dst_scalar_per_access(i) = 1;
-                }
+                dst_scalar_per_access(i) = (i == DstVectorDim) ? DstScalarPerVector : 1;
             });

             return dst_scalar_per_access;
@@ -76,16 +78,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         constexpr auto dst_scalar_step_in_vector = [&]() {
             Index dst_scalar_step_in_vector;

-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    dst_scalar_step_in_vector(i) = 1;
-                }
-                else
-                {
-                    dst_scalar_step_in_vector(i) = 0;
-                }
-            });
+            static_for<0, nDim, 1>{}(
+                [&](auto i) { dst_scalar_step_in_vector(i) = (i == DstVectorDim) ? 1 : 0; });

             return dst_scalar_step_in_vector;
         }();
@@ -99,63 +93,41 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             return access_lengths;
         }();

-#if 0
-        const auto dst_forward_steps =
-            make_tuple(make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(1, 0, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 1, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 0, 1, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 0, 0, 1) * dst_scalar_per_access));
-
-        const auto dst_backward_steps =
-            make_tuple(make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, -1, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 0, -1, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 0, 0, -1) * dst_scalar_per_access));
-#else
-        // hack for NKHW output tensor
-        const auto dst_forward_steps =
-            make_tuple(make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(1, 0, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 1, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step_hack(dst_desc,
-                                                                make_multi_index(0, 0, 1, 0) *
-                                                                    dst_scalar_per_access,
-                                                                Sequence<0, 0, 1, 0, 0>{}),
-                       make_dynamic_tensor_coordinate_step_hack(dst_desc,
-                                                                make_multi_index(0, 0, 0, 1) *
-                                                                    dst_scalar_per_access,
-                                                                Sequence<0, 0, 1, 0, 0>{}));
-
-        const auto dst_backward_steps =
-            make_tuple(make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, -1, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step_hack(dst_desc,
-                                                                make_multi_index(0, 0, -1, 0) *
-                                                                    dst_scalar_per_access,
-                                                                Sequence<0, 0, 2, 0, 0>{}),
-                       make_dynamic_tensor_coordinate_step_hack(dst_desc,
-                                                                make_multi_index(0, 0, 0, -1) *
-                                                                    dst_scalar_per_access,
-                                                                Sequence<0, 0, 2, 0, 0>{}));
-#endif
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        index_t counter = 0;
+        const auto dst_forward_iterators =
+            make_tuple(make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(1, 0, 0, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I0][I0]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 1, 0, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I0][I1]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 0, 1, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I0][I2]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 0, 0, 1) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I0][I3]));
+
+        const auto dst_backward_iterators =
+            make_tuple(make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(-1, 0, 0, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I1][I0]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, -1, 0, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I1][I1]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 0, -1, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I1][I2]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 0, 0, -1) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I1][I3]));

         // loop over dim0
         static_for<0,
@@ -249,14 +221,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                         move_dynamic_tensor_coordinate(
                             dst_desc,
                             dst_slice_origin_coord_,
-                            dst_forward_steps[DimAccessOrder{}[I3]]);
+                            dst_forward_iterators[DimAccessOrder{}[I3]]);
                     }
                     else
                     {
                         move_dynamic_tensor_coordinate(
                             dst_desc,
                             dst_slice_origin_coord_,
-                            dst_backward_steps[DimAccessOrder{}[I3]]);
+                            dst_backward_iterators[DimAccessOrder{}[I3]]);
                     }
                 }
             });
@@ -267,16 +239,17 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                 {
                     if constexpr(forward_dim2)
                     {
-                        move_dynamic_tensor_coordinate(dst_desc,
-                                                       dst_slice_origin_coord_,
-                                                       dst_forward_steps[DimAccessOrder{}[I2]]);
+                        move_dynamic_tensor_coordinate(
+                            dst_desc,
+                            dst_slice_origin_coord_,
+                            dst_forward_iterators[DimAccessOrder{}[I2]]);
                     }
                     else
                     {
                         move_dynamic_tensor_coordinate(
                             dst_desc,
                             dst_slice_origin_coord_,
-                            dst_backward_steps[DimAccessOrder{}[I2]]);
+                            dst_backward_iterators[DimAccessOrder{}[I2]]);
                     }
                 }
             });
@@ -289,13 +262,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                 {
                     move_dynamic_tensor_coordinate(dst_desc,
                                                    dst_slice_origin_coord_,
-                                                   dst_forward_steps[DimAccessOrder{}[I1]]);
+                                                   dst_forward_iterators[DimAccessOrder{}[I1]]);
                 }
                 else
                 {
-                    move_dynamic_tensor_coordinate(dst_desc,
-                                                   dst_slice_origin_coord_,
-                                                   dst_backward_steps[DimAccessOrder{}[I1]]);
+                    move_dynamic_tensor_coordinate(
+                        dst_desc,
+                        dst_slice_origin_coord_,
+                        dst_backward_iterators[DimAccessOrder{}[I1]]);
                 }
             }
         });
@@ -305,54 +279,47 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                 dst_scalar_per_access[DimAccessOrder{}[I0]])
             {
                 move_dynamic_tensor_coordinate(
-                    dst_desc, dst_slice_origin_coord_, dst_forward_steps[DimAccessOrder{}[I0]]);
+                    dst_desc, dst_slice_origin_coord_, dst_forward_iterators[DimAccessOrder{}[I0]]);
             }
         });

         // move dst coordinate back to slice origin (or not)
         if constexpr(DstResetCoordinateAfterRun)
         {
-            const auto dst_back_step =
-                make_dynamic_tensor_coordinate_step(dst_desc, GetDstCoordinateBackStep());
+            const auto dst_reset_iterator =
+                make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep());

-            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, dst_back_step);
+            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, dst_reset_iterator);
         }
     }

-    __device__ static constexpr auto GetDstCoordinateBackStep()
+    __device__ static constexpr auto GetDstCoordinateResetStep()
     {
         constexpr auto dst_scalar_per_access = [&]() {
             Index dst_scalar_per_access;

             static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    dst_scalar_per_access(i) = DstScalarPerVector;
-                }
-                else
-                {
-                    dst_scalar_per_access(i) = 1;
-                }
+                dst_scalar_per_access(i) = (i == DstVectorDim) ? DstScalarPerVector : 1;
             });

             return dst_scalar_per_access;
         }();

-        MultiIndex<nDim> dst_back_step;
+        MultiIndex<nDim> dst_reset_iterator;

         // TODO: this is wrong, need to consider DimAccessOrder
-        dst_back_step(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
+        dst_reset_iterator(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];

         static_for<1, nDim, 1>{}([&](auto i) {
             constexpr auto i_m1 = i - Number<1>{};

             // TODO: this is wrong
-            dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
-                                   ? 0
-                                   : (dst_scalar_per_access[i] - SliceLengths{}[i]);
+            dst_reset_iterator(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
+                                        ? 0
+                                        : (dst_scalar_per_access[i] - SliceLengths{}[i]);
         });

-        return dst_back_step;
+        return dst_reset_iterator;
     }

     // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
@@ -360,12 +327,13 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                                        const Index& dst_slice_origin_step_idx)
     {
         // if dst coord was not reset by RunWrite(), then need to adjust the step here
-        const auto adjusted_step_idx = DstResetCoordinateAfterRun
-                                           ? dst_slice_origin_step_idx
-                                           : dst_slice_origin_step_idx + GetDstCoordinateBackStep();
+        const auto adjusted_step_idx =
+            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

         // is it OK to construct a new step every time?
-        const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);
+        const auto adjusted_step =
+            make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx);

         move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, adjusted_step);
     }
@@ -410,8 +378,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
     using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));

-    using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
-    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
+    using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{}));
+    using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{}));

     __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3(const SrcDesc& src_desc,
                                                                  const Index& src_slice_origin,
@@ -444,8 +412,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
     }

-    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
+    template <typename SrcIteratorHacks>
+    __device__ void RunRead(const SrcDesc& src_desc,
+                            const SrcData* p_src,
+                            const SrcIteratorHacks& src_iterator_hacks)
     {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
         // hardcoded for 2D
         // TODO implement N-D
         static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
@@ -455,14 +429,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             Index src_scalar_per_access;

             static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == SrcVectorDim)
-                {
-                    src_scalar_per_access(i) = SrcScalarPerVector;
-                }
-                else
-                {
-                    src_scalar_per_access(i) = 1;
-                }
+                src_scalar_per_access(i) = (i == SrcVectorDim) ? SrcScalarPerVector : 1;
             });

             return src_scalar_per_access;
@@ -471,35 +438,36 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         constexpr auto src_scalar_step_in_vector = [&]() {
             Index src_scalar_step_in_vector;

-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == SrcVectorDim)
-                {
-                    src_scalar_step_in_vector(i) = 1;
-                }
-                else
-                {
-                    src_scalar_step_in_vector(i) = 0;
-                }
-            });
+            static_for<0, nDim, 1>{}(
+                [&](auto i) { src_scalar_step_in_vector(i) = (i == SrcVectorDim) ? 1 : 0; });

             return src_scalar_step_in_vector;
         }();

-        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-        const auto src_step_0_p = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(0, 1) * src_scalar_per_access);
-        const auto src_step_0_m = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(0, -1) * src_scalar_per_access);
-
-        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(1, 0) * src_scalar_per_access);
-        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
+        constexpr auto access_lengths = [&]() {
+            Index access_lengths;
+
+            static_for<0, nDim, 1>{}(
+                [&](auto i) { access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i]; });
+
+            return access_lengths;
+        }();
+
+        const auto src_forward_iterators = make_tuple(
+            make_dynamic_tensor_coordinate_iterator(src_desc,
+                                                    make_multi_index(1, 0) * src_scalar_per_access,
+                                                    src_iterator_hacks[I0][I0]),
+            make_dynamic_tensor_coordinate_iterator(src_desc,
+                                                    make_multi_index(0, 1) * src_scalar_per_access,
+                                                    src_iterator_hacks[I0][I1]));
+
+        const auto src_backward_iterators = make_tuple(
+            make_dynamic_tensor_coordinate_iterator(src_desc,
+                                                    make_multi_index(-1, 0) * src_scalar_per_access,
+                                                    src_iterator_hacks[I1][I0]),
+            make_dynamic_tensor_coordinate_iterator(src_desc,
+                                                    make_multi_index(0, -1) * src_scalar_per_access,
+                                                    src_iterator_hacks[I1][I1]));

         static_for<0, SliceLengths{}[I0], src_scalar_per_access[I0]>{}([&](auto iter0) {
             static_for<0, SliceLengths{}[I1], src_scalar_per_access[I1]>{}([&](auto iter1) {
@@ -517,10 +485,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_assert(SrcAddressSpace == AddressSpace::Global,
                               "wrong! hardcoded to use buffer_load, src must be global mem");

-                vector_type<SrcData, SrcScalarPerVector> src_vector;
-
                 using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

+                vector_type<SrcData, SrcScalarPerVector> src_vector;
+
-#if 1
                 src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
                     p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
@@ -535,38 +504,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                     buffer_(Number<buffer_offset>{}) = src_vector[i];
                 });

-#else
-                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                    src_desc, src_slice_origin_);
-
-                src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
-                    p_src, src_slice_origin_.GetOffset(), is_valid, src_desc.GetElementSpaceSize());
-
-                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                        make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
-
-                    buffer_(Number<buffer_offset>{}) = src_vector[i];
-                });
-#endif
-
                 // move dim1 iterator
-                if constexpr(iter1.value < SliceLengths{}[I1] - src_scalar_per_access[I1])
+                if constexpr(iter1.value < access_lengths[I1] - 1)
                 {
                     if constexpr(forward_dim1)
                     {
-                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p);
+                        move_dynamic_tensor_coordinate(
+                            src_desc, src_slice_origin_, src_forward_iterators[I1]);
                     }
                     else
                     {
-                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m);
+                        move_dynamic_tensor_coordinate(
+                            src_desc, src_slice_origin_, src_backward_iterators[I1]);
                     }
                 }
             });

             // move dim0 iterator
-            if constexpr(iter0.value < SliceLengths{}[I0] - src_scalar_per_access[I0])
+            if constexpr(iter0.value < access_lengths[I0] - 1)
             {
-                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p_0);
+                move_dynamic_tensor_coordinate(
+                    src_desc, src_slice_origin_, src_forward_iterators[I0]);
             }
         });

         // move src coordinate back to its slice origin
         if constexpr(SrcResetCoordinateAfterRun)
         {
-            const auto src_back_step =
-                make_dynamic_tensor_coordinate_step(src_desc, GetSrcCoordinateBackStep());
+            const auto src_reset_iterator =
+                make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep());

-            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
+            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_reset_iterator);
         }
     }

     __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
     {
         static_assert(remove_reference_t<DstDesc>::GetNumOfDimension() == 2,
@@ -578,14 +563,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         {
             // TODO use constexpr for coordinate-step to make sure compiler behave correctly
             const auto dst_step_0_p =
-                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
+                make_dynamic_tensor_coordinate_iterator(dst_desc, make_multi_index(0, 1));
             const auto dst_step_0_m =
-                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
+                make_dynamic_tensor_coordinate_iterator(dst_desc, make_multi_index(0, -1));

             const auto dst_step_p_0 =
-                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
+                make_dynamic_tensor_coordinate_iterator(dst_desc, make_multi_index(1, 0));
             const auto dst_step_m_0 =
-                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
+                make_dynamic_tensor_coordinate_iterator(dst_desc, make_multi_index(-1, 0));

             constexpr index_t Len0 = SliceLengths{}[0];
             constexpr index_t Len1 = SliceLengths{}[1];
@@ -637,20 +622,15 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         // move dst coordinate back to its slice origin
         if constexpr(DstResetCoordinateAfterRun)
        {
-            const auto dst_back_step =
-                make_dynamic_tensor_coordinate_step(dst_desc, GetDstCoordinateBackStep());
+            const auto dst_reset_iterator =
+                make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep());

-            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
+            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_reset_iterator);
         }
     }

-    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
+    __device__ static constexpr auto GetSrcCoordinateResetStep()
     {
-        // hardcoded for 2D
-        // TODO implemente N-D
-        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
-                      "wrong! hardcoded for 2D tensor");
-
         constexpr auto src_scalar_per_access = [&]() {
             Index src_scalar_per_access;
...@@ -668,235 +648,22 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3 ...@@ -668,235 +648,22 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
return src_scalar_per_access; return src_scalar_per_access;
}(); }();
constexpr auto src_scalar_step_in_vector = [&]() { MultiIndex<nDim> src_reset_iterator;
Index src_scalar_step_in_vector;
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_step_in_vector(i) = 1;
}
else
{
src_scalar_step_in_vector(i) = 0;
}
});
return src_scalar_step_in_vector;
}();
constexpr auto access_lengths = [&]() {
Index access_lengths;
static_for<0, nDim, 1>{}( src_reset_iterator(Number<0>{}) = src_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
[&](auto i) { access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i]; });
return access_lengths;
}();
#if 0 // hack
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto src_step_0_p = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(0, 1) * src_scalar_per_access);
const auto src_step_0_m = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(0, -1) * src_scalar_per_access);
const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(1, 0) * src_scalar_per_access);
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
#elif 1
// for padded input tensor
const auto src_step_0_p =
make_dynamic_tensor_coordinate_step_hack(src_desc,
make_multi_index(0, 1) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p_0 =
make_dynamic_tensor_coordinate_step_hack(src_desc,
make_multi_index(1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 0
// for non-padded input tensor
const auto src_step_0_p =
make_dynamic_tensor_coordinate_step_hack(src_desc,
make_multi_index(0, 1) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p_0 =
make_dynamic_tensor_coordinate_step_hack(src_desc,
make_multi_index(1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 2, 0>{});
#elif 1
// for 1x1, input tensor
const auto src_step_0_p = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1) * src_scalar_per_access, Sequence<0, 0, 1>{});
const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, -1) * src_scalar_per_access, Sequence<0, 0, 2>{});
const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(1, 0) * src_scalar_per_access);
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
#endif
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
static_for<0, SliceLengths{}[I0], src_scalar_per_access[I0]>{}([&](auto iter0) {
static_for<0, SliceLengths{}[I1], src_scalar_per_access[I1]>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % (2 * src_scalar_per_access[I0]) == 0);
constexpr index_t i0 = iter0.value;
constexpr index_t i1 =
forward_dim1 ? iter1.value
: SliceLengths{}[I1] - src_scalar_per_access[I1] - iter1.value;
// do work
// hardcoding for buffer_load
// TODO refactor transfer_data() to encapsulate this
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
vector_type<SrcData, SrcScalarPerVector> src_vector;
#if 1
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#elif 1
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src, src_slice_origin_.GetOffset(), is_valid, src_desc.GetElementSpaceSize());
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#elif 0
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
int32x2_t is_valid_i32 = is_valid;
asm volatile("\n \
v_cmp_gt_u32_e64 is_valid_flag, is_valid_i32, 0 \n \
v_cndmask_b32_e64 src_data, 0, src_data, is_valid_flag \n \
"
: "=s"(is_valid_flag), "=v"(src_data),
: "v"(is_valid_i32), "2"(is_valid_flag), "3"(src_data));
buffer_(Number<buffer_offset>{}) = src_data;
});
#endif
// move dim1 iterator
if constexpr(iter1.value < access_lengths[I1] - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m);
}
}
});
// move dim0 iterator
if constexpr(iter0.value < access_lengths[I0] - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p_0);
}
});
// move src coordinate back to its slice origin
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_back_step =
make_dynamic_tensor_coordinate_step(src_desc, GetSrcCoordinateBackStep());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
}
}
__device__ static constexpr auto GetSrcCoordinateBackStep()
{
constexpr auto src_scalar_per_access = [&]() {
Index src_scalar_per_access;
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_per_access(i) = SrcScalarPerVector;
}
else
{
src_scalar_per_access(i) = 1;
}
});
return src_scalar_per_access;
}();
        MultiIndex<nDim> src_reset_iterator;

        src_reset_iterator(Number<0>{}) = src_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
        static_for<1, nDim, 1>{}([&](auto i) {
            constexpr auto i_m1 = i - Number<1>{};

            src_reset_iterator(i) = (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
                                        ? 0
                                        : (src_scalar_per_access[i] - SliceLengths{}[i]);
        });

        return src_reset_iterator;
    }
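A worked example of the reset rule above, under assumed values SliceLengths = {4, 8} and scalar_per_access = {1, 4}: dim 0 must walk back across the whole slice, while dim 1 needs no correction because dim 0 makes an even number of sweeps, so the zig-zag ends in the starting column.

// compile-time check of the reset-step rule, with assumed (hypothetical) sizes
constexpr int slice0 = 4, slice1 = 8; // stand-in for SliceLengths
constexpr int spa0 = 1, spa1 = 4;     // stand-in for scalar_per_access

constexpr int reset0 = spa0 - slice0; // -3: move back across dim 0
constexpr int reset1 =
    (slice0 % (2 * spa0) == 0) ? 0 : (spa1 - slice1); // 0: zig-zag already at origin column

static_assert(reset0 == -3 && reset1 == 0, "reset step for the assumed 4x8 slice");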
    __device__ static constexpr auto GetDstCoordinateResetStep()
    {
        constexpr auto dst_scalar_per_access = [&]() {
            Index dst_scalar_per_access;
...@@ -915,19 +682,19 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
            return dst_scalar_per_access;
        }();

        MultiIndex<nDim> dst_reset_iterator;

        dst_reset_iterator(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];

        static_for<1, nDim, 1>{}([&](auto i) {
            constexpr auto i_m1 = i - Number<1>{};

            dst_reset_iterator(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
                                        ? 0
                                        : (dst_scalar_per_access[i] - SliceLengths{}[i]);
        });

        return dst_reset_iterator;
    }
    // src_slice_origin_step_idx needs to be known at compile time, for performance reasons
...@@ -935,59 +702,50 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step =
            make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx);

        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
    }
    // src_slice_origin_step_idx needs to be known at compile time, for performance reasons
    template <typename SrcMoveSliceWindowIteratorHack>
    __device__ void
    MoveSrcSliceWindow(const SrcDesc& src_desc,
                       const Index& src_slice_origin_step_idx,
                       const SrcMoveSliceWindowIteratorHack& src_move_slice_window_iterator_hack)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_dynamic_tensor_coordinate_iterator(
            src_desc, adjusted_step_idx, src_move_slice_window_iterator_hack);

        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
    }
    // dst_slice_origin_step_idx needs to be known at compile time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by RunWrite(), then need to adjust the step here
        const auto adjusted_step_idx =
            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step =
            make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx);

        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
    }
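The adjustment shared by the three Move*SliceWindow overloads above is step composition: when Run* left the coordinate at the end of the slice, the pending reset is folded into the requested window movement so only one move is issued. A scalar host-side sketch of that idea (the values below are made up for illustration):

#include <cassert>

int main()
{
    const bool reset_after_run = false; // stand-in for SrcResetCoordinateAfterRun
    const int window_step = 8;          // requested slice-window movement
    const int reset_step  = -3;         // stand-in for GetSrcCoordinateResetStep()

    // fold the pending reset into the window step: one combined move instead of two
    const int adjusted = reset_after_run ? window_step : window_step + reset_step;
    assert(adjusted == 5);
}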
    private:
...
...@@ -55,11 +55,12 @@ void device_dummy_dynamic_transform(InDesc,
    auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));

    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1 =
        make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
                                                make_multi_index(0, 0, 1));

    print_array_v2("do_transforms 0 0 1: ",
                   in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1.do_transforms_);

    for(index_t iter = 0; iter < 10; ++iter)
    {
...@@ -71,7 +72,7 @@ void device_dummy_dynamic_transform(InDesc,
        move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                       in_gemmk_gemmn_gemmkpack_coord,
                                       in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1);
    }
}
...@@ -79,11 +80,12 @@ void device_dummy_dynamic_transform(InDesc,
    auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));

    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0 =
        make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
                                                make_multi_index(0, 1, 0));

    print_array_v2("do_transforms 0 1 0: ",
                   in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0.do_transforms_);

    for(index_t iter = 0; iter < 10; ++iter)
    {
...@@ -95,7 +97,7 @@ void device_dummy_dynamic_transform(InDesc,
        move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                       in_gemmk_gemmn_gemmkpack_coord,
                                       in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0);
    }
}
...@@ -103,11 +105,12 @@ void device_dummy_dynamic_transform(InDesc,
    auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));

    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0 =
        make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
                                                make_multi_index(1, 0, 0));

    print_array_v2("do_transforms 1 0 0: ",
                   in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0.do_transforms_);

    for(index_t iter = 0; iter < 10; ++iter)
    {
...@@ -119,7 +122,7 @@ void device_dummy_dynamic_transform(InDesc,
        move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                       in_gemmk_gemmn_gemmkpack_coord,
                                       in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0);
    }
}
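The do_transforms_ arrays printed above are the point of the iterator refactor: for a fixed step direction, the set of transforms that can be affected is computed once when the iterator is built, so every subsequent move only updates that subset. A host-side model of the saving (the flags and counts below are illustrative assumptions, not the library's API):

#include <cstdio>

int main()
{
    // assume only the transform feeding the fastest dimension reacts to a (0, 0, 1) step
    const bool do_transforms[3] = {false, false, true};

    int transforms_run = 0;
    for(int iter = 0; iter < 10; ++iter)
    {
        for(int t = 0; t < 3; ++t)
        {
            if(!do_transforms[t])
                continue; // skipped updates are the saving over a generic step
            ++transforms_run;
        }
    }
    std::printf("ran %d of %d possible transform updates\n", transforms_run, 10 * 3);
}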
...
...@@ -233,7 +233,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto conv_driver =
#if 0
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#elif 0
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
...@@ -52,7 +52,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 3x3, 71x71
    constexpr index_t N = 128;
    constexpr index_t C = 192;
...@@ -67,7 +67,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<1, 1>;
    using RightPads = Sequence<1, 1>;
#elif 1
    // 1x1, 8x8
    constexpr index_t N = 128;
    constexpr index_t C = 1536;
...@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
                                                                 LeftPads{},
                                                                 RightPads{},
                                                                 nrepeat);
#elif 1
    device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
...