Commit a97298a4 authored by aska-0096

Functional work

parent cc0ffeb7
@@ -12,6 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
@@ -83,30 +84,30 @@ using DeviceOpInstance =
         1,
         128,
         64,
-        64,
+        128,
         64,
-        4,
+        8,
         16,
         16,
-        1,
+        2,
         4,
         S<4, 32, 1>,
         S<1, 0, 2>,
         S<1, 0, 2>,
         2,
-        4,
-        4,
-        true,
+        8,
+        8,
+        false,
         S<4, 32, 1>,
         S<1, 0, 2>,
         S<1, 0, 2>,
         2,
-        4,
-        4,
-        true,
+        8,
+        8,
+        false,
         1,
         1,
-        S<1, 64, 1, 2>,
+        S<1, 32, 1, 4>,
         8>;
@@ -208,6 +209,11 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
         d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
         break;
+    case 2:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{1.f, 1.f}(b_k_n);
+        ck::utils::FillUniformDistributionIntegerValue<DDataType>{1.f, 1.f}(d_m_n);
+        break;
     default:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
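The new case 2 fills A, B and D with the constant 1 (a uniform integer distribution whose lower and upper bounds are both 1). Constant inputs make the output predictable by hand: for a plain A*B product with all-ones operands every element equals the reduction length K, so data-movement bugs stand out immediately. A host-side sketch of that check, independent of CK (the naive_gemm helper and the sizes are illustrative only):

    #include <cassert>
    #include <vector>

    // Illustrative helper: C[m][n] = sum_k A[m][k] * B[k][n], row-major, no D/epilogue.
    static void naive_gemm(const std::vector<float>& a,
                           const std::vector<float>& b,
                           std::vector<float>& c,
                           int M, int N, int K)
    {
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(int k = 0; k < K; ++k)
                    acc += a[m * K + k] * b[k * N + n];
                c[m * N + n] = acc;
            }
    }

    int main()
    {
        const int M = 4, N = 4, K = 8;
        std::vector<float> a(M * K, 1.f), b(K * N, 1.f), c(M * N, 0.f); // all-ones inputs
        naive_gemm(a, b, c, M, N, K);
        for(float v : c)
            assert(v == static_cast<float>(K)); // every output element equals K
        return 0;
    }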
@@ -39,7 +39,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
 
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
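Switching ConvSpec from Default to Filter1x1Stride1Pad0 lets the kernel assume a 1x1 filter with unit stride and no padding, the case in which a forward convolution collapses to a plain GEMM over the channel dimension. A rough sketch of that dimension mapping for an NHWC tensor (the sizes are made up, and the exact mapping CK uses internally may differ):

    #include <cstdio>

    int main()
    {
        // Example 2-D conv sizes (illustrative only).
        const int N = 4, Hi = 28, Wi = 28, C = 64; // input NHWC
        const int K = 128;                         // output channels, 1x1 filter
        // With a 1x1 filter, stride 1 and no padding, the output spatial size equals
        // the input size and each output pixel depends on exactly one input pixel,
        // so the convolution is a single GEMM:
        const int Ho = Hi, Wo = Wi;
        const int GemmM = N * Ho * Wo; // one row per output pixel
        const int GemmN = K;           // one column per output channel
        const int GemmK = C;           // reduction over input channels
        std::printf("implicit GEMM: %d x %d x %d\n", GemmM, GemmN, GemmK);
        return 0;
    }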
@@ -54,13 +54,13 @@ using DeviceConvFwdInstance =
         1,           // Prefetch stage
         128,         // BlockSize
         64,          // MPerBlock
-        64,          // NPerBlock
+        128,         // NPerBlock
         64,          // KPerBlock
         8,           // K1
         16,          // MPerWMMA
         16,          // NPerWMMA
-        4,           // MRepeat
-        1,           // NRepeat
+        2,           // MRepeat
+        4,           // NRepeat
         S<4, 32, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -77,7 +77,7 @@ using DeviceConvFwdInstance =
         true,        // BBlockLdsExtraN
         1,
         1,
-        S<1, 16, 1, 8>,
+        S<1, 32, 1, 4>,
         8>;
 
 template <ck::index_t NDimSpatial>
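A quick sanity check of the retuned conv tile, as a minimal sketch assuming the decomposition MPerBlock = MRepeat * MWaves * MPerWMMA (and the analogous split for N) with 32-lane waves; neither assumption is stated in the diff:

    // Checks that the new 64 x 128 tile is consistent with 128 threads per block.
    constexpr int BlockSize = 128, WaveSize = 32; // WaveSize assumed
    constexpr int MPerBlock = 64, NPerBlock = 128;
    constexpr int MPerWmma = 16, NPerWmma = 16;
    constexpr int MRepeat = 2, NRepeat = 4;

    constexpr int MWaves = MPerBlock / (MRepeat * MPerWmma); // 64 / 32 = 2
    constexpr int NWaves = NPerBlock / (NRepeat * NPerWmma); // 128 / 64 = 2

    static_assert(MWaves * NWaves * WaveSize == BlockSize, "tile does not fill the thread block");

    int main() { return 0; }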
@@ -362,18 +362,18 @@ struct BlockwiseGemmWMMA
             }
             else
             {
-                static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    static_for<0, MRepeat, 1>{}([&](auto m0) {
-                        static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of
-                                                                            // k=0,kpack*1, ..
-                            // read B
-                            b_thread_copy_.Run(
-                                b_block_desc_k0_n0_n1_n2_k1,
-                                make_tuple(Number<k * WmmaK / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
-                                b_block_buf,
-                                b_thread_desc_,
-                                make_tuple(I0, n0, I0, I0, I0, I0),
-                                b_thread_buf);
+                static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of
+                                                                    // k=0,kpack*1, ..
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        // read B
+                        b_thread_copy_.Run(
+                            b_block_desc_k0_n0_n1_n2_k1,
+                            make_tuple(Number<k * WmmaK / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
+                            b_block_buf,
+                            b_thread_desc_,
+                            make_tuple(I0, n0, I0, I0, I0, I0),
+                            b_thread_buf);
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
                             // read A
                             a_thread_copy_.Run(
                                 a_block_desc_k0_m0_m1_m2_k1,
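The hunk above interchanges the loop nest from (n0, m0, k) to (k, n0, m0) and hoists the B-fragment copy out of the m0 loop, so a B fragment is read from LDS once per (k, n0) pair rather than once per (n0, m0, k) iteration. A stripped-down illustration of the same interchange with plain arrays (the names and sizes are made up):

    #include <array>

    constexpr int KIter = 2, NRepeat = 4, MRepeat = 2;

    // Loading b once per (k, n0) and reusing it across all m0 removes redundant reads
    // compared with reloading it inside the innermost loop.
    float run_after(const std::array<float, KIter * NRepeat>& b_block,
                    const std::array<float, KIter * MRepeat>& a_block)
    {
        float acc = 0.f;
        for(int k = 0; k < KIter; ++k)
            for(int n0 = 0; n0 < NRepeat; ++n0)
            {
                const float b_frag = b_block[k * NRepeat + n0]; // read B once per (k, n0)
                for(int m0 = 0; m0 < MRepeat; ++m0)
                {
                    const float a_frag = a_block[k * MRepeat + m0]; // read A per (k, n0, m0)
                    acc += a_frag * b_frag;                         // stand-in for the WMMA op
                }
            }
        return acc;
    }

    int main() { return run_after({}, {}) == 0.f ? 0 : 1; }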
@@ -8,6 +8,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 
 namespace ck {
@@ -48,6 +49,9 @@ struct ThreadGroupTensorSliceTransfer_v4r1
     static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{};
 
+    static constexpr auto src_scalar_per_access = generate_sequence(
+        detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+
     using Index = MultiIndex<nDim>;
 
     __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1(
@@ -85,7 +89,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1
             const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
                 make_multi_index(ThreadGroup::GetThreadId()));
 
-            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+            // This line result in non-packed reading.
+            const auto thread_data_idx_begin = thread_cluster_idx * src_scalar_per_access;
 
             threadwise_transfer_.SetSrcSliceOrigin(src_desc,
                                                    src_block_slice_origin + thread_data_idx_begin);
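The changed line moves the per-thread slice origin from thread_cluster_idx * thread_slice_lengths, where each thread starts at its own contiguous sub-slice, to thread_cluster_idx * src_scalar_per_access, where threads are interleaved at vector granularity so that adjacent threads read adjacent vectors (the "packed" reading the new comment alludes to). A 1-D toy comparison of the two origin schemes (the sizes are illustrative, not the real descriptors):

    #include <cstdio>

    int main()
    {
        const int num_threads       = 4; // toy thread cluster
        const int slice_per_thread  = 8; // elements each thread will eventually read
        const int scalar_per_access = 2; // vector width of one read

        for(int tid = 0; tid < num_threads; ++tid)
        {
            const int blocked_origin     = tid * slice_per_thread;  // old: contiguous chunks
            const int interleaved_origin = tid * scalar_per_access; // new: packed, coalesced reads
            std::printf("tid %d: blocked start %2d, interleaved start %2d\n",
                        tid, blocked_origin, interleaved_origin);
        }
        return 0;
    }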
@@ -152,7 +157,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1
         make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
     using ThreadwiseTransfer =
-        ThreadwiseTensorSliceTransfer_v3r1<decltype(thread_slice_lengths),
+        ThreadwiseTensorSliceTransfer_v3r1<ThreadClusterLengths,
+                                           decltype(thread_slice_lengths),
                                            SrcElementwiseOperation,
                                            DstElementwiseOperation,
                                            DstInMemOp,
@@ -1136,7 +1136,12 @@ struct ThreadwiseTensorSliceTransfer_v4
             auto src_data_coord = src_ref_coord_;
 
             move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
 
+#if 0
+            printf("Tid: %03d, LDS read bank: %ld, inele offset: %d, coord: (%d, %d, %d, %d, %d, %d)\n", get_thread_local_1d_id(),
+                   (src_data_coord.GetOffset()*sizeof(SrcData)/4 )%32, src_data_coord.GetOffset(),
+                   src_data_coord.GetIndex().At(Number<0>{}), src_data_coord.GetIndex().At(Number<1>{}), src_data_coord.GetIndex().At(Number<2>{}),
+                   src_data_coord.GetIndex().At(Number<3>{}), src_data_coord.GetIndex().At(Number<4>{}), src_data_coord.GetIndex().At(Number<5>{}));
+#endif
             vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
 
             using src_vector_t = typename decltype(src_tmp_vector)::type;
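The guarded printf maps an element offset to an LDS bank as (offset * sizeof(SrcData) / 4) % 32, i.e. the byte address divided by the 4-byte bank width, modulo 32 banks; lanes of a wavefront that hit the same bank in the same cycle serialize. A small host-side sketch of that bank arithmetic (the 2-byte element size and the offsets are just an example):

    #include <cstdio>

    // Bank index for a given element offset, assuming 32 LDS banks of 4 bytes each.
    static int lds_bank(long element_offset, int elem_bytes)
    {
        return static_cast<int>((element_offset * elem_bytes / 4) % 32);
    }

    int main()
    {
        const int elem_bytes = 2; // e.g. fp16
        // Offsets 0, 64, 128, ... differ by 128 bytes = 32 banks, so they all land in bank 0.
        for(long offset : {0L, 64L, 128L, 192L})
            std::printf("element offset %3ld -> bank %d\n", offset, lds_bank(offset, elem_bytes));
        return 0;
    }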
@@ -49,7 +49,8 @@ struct lambda_scalar_per_access_for_src_and_dst
 // 2. SrcBuffer and DstBuffer are DynamicBuffer
 // 3. src_slice_origin and dst_slice_origin are not known at compile-time,
 // 4. Use thread buffer
-template <typename SliceLengths,
+template <typename ThreadClusterLengths,
+          typename SliceLengths,
           typename SrcElementwiseOperation,
           typename DstElementwiseOperation,
           InMemoryDataOperationEnum DstInMemOp,
@@ -134,13 +135,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto ordered_src_access_lengths =
             container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
 
+        constexpr auto src_move_per_access = ThreadClusterLengths{}*src_scalar_per_access;
+
         // make forward steps
         const auto src_forward_steps = generate_tuple(
             [&](auto i) {
                 Index forward_step_idx;
 
                 static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
+                    // Move on block-wise instead of thread-wise
+                    forward_step_idx(j) = (i.value == j.value) ? src_move_per_access[i] : 0;
                 });
 
                 return make_tensor_coordinate_step(src_desc, forward_step_idx);
@@ -153,7 +156,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index backward_step_idx;
 
                 static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
+                    backward_step_idx(j) = (i.value == j.value) ? -src_move_per_access[i] : 0;
                 });
 
                 return make_tensor_coordinate_step(src_desc, backward_step_idx);
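With src_move_per_access = ThreadClusterLengths{} * src_scalar_per_access, one forward or backward step now advances by the footprint of the entire thread cluster along a dimension rather than by a single thread's vector. As a worked example, pairing the S<4, 32, 1> clusters from the instances above with an 8-wide vector on the last dimension (that pairing is my assumption, not stated in the diff):

    #include <cstdio>

    int main()
    {
        // Illustrative 3-D case, e.g. a (K0, M, K1)-style slice.
        const int thread_cluster_lengths[3] = {4, 32, 1}; // S<4, 32, 1>
        const int src_scalar_per_access[3]  = {1, 1, 8};  // vector dim is the last one

        int src_move_per_access[3];
        for(int d = 0; d < 3; ++d)
        {
            // One step covers the whole cluster's footprint along that dimension,
            // instead of a single thread's vector.
            src_move_per_access[d] = thread_cluster_lengths[d] * src_scalar_per_access[d];
            std::printf("dim %d: step %d\n", d, src_move_per_access[d]);
        }
        return 0; // prints 4, 32, 8
    }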
@@ -194,7 +197,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
                        src_scalar_per_access;
             }();
 
+#if 0
+            printf("Tid: %03d, global buf offset inbyte: %ld, inele offset: %d, coord: (%d, %d, %d)\n", get_thread_local_1d_id(),
+                   src_coord_.GetOffset()*sizeof(DstData), src_coord_.GetOffset(),
+                   src_coord_.GetIndex().At(Number<0>{}), src_coord_.GetIndex().At(Number<1>{}), src_coord_.GetIndex().At(Number<2>{}));
+#endif
             constexpr auto src_data_idx_seq = generate_sequence_v2(
                 [&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});
@@ -369,6 +376,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto dst_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
 
+        constexpr auto dst_move_per_access = ThreadClusterLengths{}*dst_scalar_per_access;
+
         constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
 
         constexpr auto dst_dim_access_order = DstDimAccessOrder{};
@@ -382,7 +391,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index forward_step_idx;
 
                 static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
+                    forward_step_idx(j) = (i.value == j.value) ? dst_move_per_access[i] : 0;
                 });
 
                 return make_tensor_coordinate_step(dst_desc, forward_step_idx);
@@ -395,7 +404,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index backward_step_idx;
 
                 static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
+                    backward_step_idx(j) = (i.value == j.value) ? -dst_move_per_access[i] : 0;
                 });
 
                 return make_tensor_coordinate_step(dst_desc, backward_step_idx);
@@ -439,7 +448,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             constexpr auto dst_data_idx_seq = generate_sequence_v2(
                 [&](auto i) { return Number<dst_data_idx[i]>{}; }, Number<dst_data_idx.Size()>{});
 
+#if 0
+            printf("Tid: %03d, LDS write bank: %ld, inele offset: %04d, coord (%d, %d, %d)\n", get_thread_local_1d_id(),
+                   (dst_coord_.GetOffset()*sizeof(DstData)/4 )%32, dst_coord_.GetOffset(),
+                   dst_coord_.GetIndex().At(Number<0>{}), dst_coord_.GetIndex().At(Number<1>{}), dst_coord_.GetIndex().At(Number<2>{}));
+#endif
             const bool is_dst_valid =
                 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
@@ -505,7 +518,16 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         {
             const auto dst_reset_step =
                 make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
 
+#if 0
+            const auto dst_reset_idx = GetDstCoordinateResetStep();
+            if (get_thread_local_1d_id()==0)
+            {
+                printf("dst_reset_step: %d, %d, %d\n",
+                       dst_reset_idx.At(Number<0>{}),
+                       dst_reset_idx.At(Number<1>{}),
+                       dst_reset_idx.At(Number<2>{}));
+            }
+#endif
             move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
         }
     }
@@ -517,12 +539,17 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto src_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
 
-        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+        constexpr auto src_access_unit = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_access_unit_helper = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, 1>{}, Number<nDim>{});
+
+        constexpr auto src_access_stride = ThreadClusterLengths{} * (src_access_unit - src_access_unit_helper);
 
         constexpr auto src_dim_access_order = SrcDimAccessOrder{};
 
-        constexpr auto ordered_src_access_lengths =
-            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
+        constexpr auto ordered_src_access_stride =
+            container_reorder_given_new2old(src_access_stride, src_dim_access_order);
 
         // judge move forward or move backward during the last iteration
         constexpr auto forward_sweep = [&]() {
@@ -531,10 +558,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             forward_sweep_(I0) = true;
 
             static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_src_access_lengths[I0] - 1;
+                index_t tmp = ordered_src_access_stride[I0] - 1;
 
                 static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
+                    tmp = tmp * ordered_src_access_stride[j] + ordered_src_access_stride[j] - 1;
                 });
 
                 forward_sweep_(i) = tmp % 2 == 0;
@@ -549,7 +576,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             Index ordered_idx;
 
             static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
+                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_stride[i] : 0;
             });
 
             return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
@@ -564,7 +591,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             return reset_src_data_step_;
         }();
 
+#if 0
+        if (get_thread_local_1d_id()==0)
+        {
+            printf("ordered_src_access_stride: %d, %d, %d | src_data_idx: %d, %d, %d\n",
+                   ordered_src_access_stride.At(Number<0>{}).value,
+                   ordered_src_access_stride.At(Number<1>{}).value,
+                   ordered_src_access_stride.At(Number<2>{}).value,
+                   src_data_idx.At(Number<0>{}),
+                   src_data_idx.At(Number<1>{}),
+                   src_data_idx.At(Number<2>{}));
+        }
+#endif
         return reset_src_data_step;
     }
@@ -574,13 +612,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // TODO: don't use lambda_scalar_per_access
         constexpr auto dst_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
 
-        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+        constexpr auto dst_access_unit = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_access_unit_helper = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, 1>{}, Number<nDim>{});
+
+        constexpr auto dst_access_strides = ThreadClusterLengths{} * (dst_access_unit - dst_access_unit_helper);
 
         constexpr auto dst_dim_access_order = DstDimAccessOrder{};
 
-        constexpr auto ordered_dst_access_lengths =
-            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
+        constexpr auto ordered_dst_access_strides =
+            container_reorder_given_new2old(dst_access_strides, dst_dim_access_order);
 
         // judge move forward or move backward during the last iteration
         constexpr auto forward_sweep = [&]() {
@@ -589,10 +632,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             forward_sweep_(I0) = true;
 
             static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_dst_access_lengths[I0] - 1;
+                index_t tmp = ordered_dst_access_strides[I0] - 1;
 
                 static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
+                    tmp = tmp * ordered_dst_access_strides[j] + ordered_dst_access_strides[j] - 1;
                 });
 
                 forward_sweep_(i) = tmp % 2 == 0;
@@ -607,7 +650,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             Index ordered_idx;
 
             static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
+                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_strides[i] : 0;
             });
 
             return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
@@ -637,6 +680,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // is it OK to construct a new step every time?
         const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
 
+#if 0
+        if (get_thread_local_1d_id()==0)
+        {
+            printf("InputSrcStep: %d, %d, %d | MoveSrcSliceWindowStep: %d, %d, %d\n",
+                   src_slice_origin_step_idx.At(Number<0>{}),
+                   src_slice_origin_step_idx.At(Number<1>{}),
+                   src_slice_origin_step_idx.At(Number<2>{}),
+                   adjusted_step_idx.At(Number<0>{}),
+                   adjusted_step_idx.At(Number<1>{}),
+                   adjusted_step_idx.At(Number<2>{}));
+        }
+#endif
         move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
     }