Commit 9cc63115 authored by aska-0096

sanity

parent 5cfe67b1
......@@ -91,14 +91,14 @@ using DeviceOpInstance =
16,
2,
4,
S<4, 32, 1>,
S<8, 16, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
false,
S<4, 32, 1>,
S<8, 16, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
......
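The S<4, 32, 1> -> S<8, 16, 1> entries above look like block-transfer thread-cluster lengths over a K0 x M (or K0 x N) x K1 tile: the same 128 transfer threads are redistributed from a 4 x 32 to an 8 x 16 arrangement. A minimal sketch of how such a cluster could map a linear thread id to tile coordinates, assuming a plain row-major arrangement (the ThreadClusterArrangeOrder S<1, 0, 2> above would reorder this), with all sizes taken only from the values shown:

// Hypothetical illustration (not part of the commit): map a linear thread id to
// (k0, m, k1) cluster coordinates for a thread cluster of lengths {K0, M, K1},
// assuming a row-major (K0 outer, K1 inner) arrangement.
#include <array>
#include <cstdio>

constexpr std::array<int, 3> cluster_coord(int tid, int K0, int M, int K1)
{
    return {tid / (M * K1), (tid / K1) % M, tid % K1};
}

int main()
{
    // Old arrangement: S<4, 32, 1> -> 4 x 32 x 1 = 128 threads
    // New arrangement: S<8, 16, 1> -> 8 x 16 x 1 = 128 threads
    const auto old_c = cluster_coord(37, 4, 32, 1); // thread 37 -> (1, 5, 0)
    const auto new_c = cluster_coord(37, 8, 16, 1); // thread 37 -> (2, 5, 0)
    std::printf("old: (%d,%d,%d)  new: (%d,%d,%d)\n",
                old_c[0], old_c[1], old_c[2], new_c[0], new_c[1], new_c[2]);
}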
......@@ -67,8 +67,8 @@ struct BlockwiseGemmWMMA
// When LDS is used, each row (16 consecutive lanes) reads the whole data from the source buffer.
// When LDS is not used, each row reads half of the data from the source buffer and exchanges it
// via permutation.
static constexpr index_t A_KRow = AEnableLds ? 2 : 2;
static constexpr index_t B_KRow = BEnableLds ? 2 : 2;
static constexpr index_t A_KRow = 2;
static constexpr index_t B_KRow = 2;
static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I5);
static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I5);
......
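Both branches of the removed ternaries evaluate to 2, so KRow is simply a constant: with or without LDS, the K dimension is split across the two 16-lane rows of a 32-lane wave. A minimal sketch of the resulting K decomposition, using assumed sizes that are not taken from this commit:

// Hypothetical illustration: decompose a flat k index into (k0, krow, k1)
// for KRow = 2, assuming the layout K0 x KRow x K1 with K1 innermost.
#include <cstdio>

int main()
{
    constexpr int K1   = 8;  // vector width along K (assumed)
    constexpr int KRow = 2;  // two 16-lane rows per 32-lane wave
    constexpr int K    = 64; // K per block (assumed)
    constexpr int K0   = K / (KRow * K1); // = 4

    for(int k = 0; k < K; k += 24) // sample a few indices
    {
        const int k1   = k % K1;
        const int krow = (k / K1) % KRow;
        const int k0   = k / (K1 * KRow);
        std::printf("k=%2d -> (k0=%d of %d, krow=%d, k1=%d)\n", k, k0, K0, krow, k1);
    }
}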
......@@ -8,6 +8,7 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
namespace ck {
......@@ -48,6 +49,9 @@ struct ThreadGroupTensorSliceTransfer_v4r1
static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{};
static constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
using Index = MultiIndex<nDim>;
__device__ constexpr ThreadGroupTensorSliceTransfer_v4r1(
......@@ -85,7 +89,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1
const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
make_multi_index(ThreadGroup::GetThreadId()));
const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
// This line results in non-packed reading.
const auto thread_data_idx_begin = thread_cluster_idx * src_scalar_per_access;
threadwise_transfer_.SetSrcSliceOrigin(src_desc,
src_block_slice_origin + thread_data_idx_begin);
......@@ -152,7 +157,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v3r1<decltype(thread_slice_lengths),
ThreadwiseTensorSliceTransfer_v3r1<ThreadClusterLengths,
decltype(thread_slice_lengths),
SrcElementwiseOperation,
DstElementwiseOperation,
DstInMemOp,
......
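Taken together with the new ThreadClusterLengths template argument passed to ThreadwiseTensorSliceTransfer_v3r1, the change to thread_data_idx_begin moves each thread's read origin from a per-thread slice boundary (thread_cluster_idx * thread_slice_lengths) to a per-vector boundary (thread_cluster_idx * src_scalar_per_access), so neighbouring threads in the cluster fetch adjacent vectors and the per-iteration stride becomes ThreadClusterLengths * scalar_per_access. A small sketch of the two access patterns along one dimension, with assumed sizes:

// Hypothetical illustration of blocked vs. interleaved reads along one dimension.
// Assumed sizes (not from the commit): 4 threads, a 32-element slice, 4-wide vectors.
#include <cstdio>

int main()
{
    constexpr int block_slice_length = 32;
    constexpr int cluster_length     = 4; // threads along this dimension
    constexpr int scalar_per_access  = 4; // vector width
    constexpr int thread_slice_len   = block_slice_length / cluster_length; // 8

    for(int tid = 0; tid < cluster_length; ++tid)
    {
        std::printf("tid %d blocked     :", tid);
        // old scheme: origin = tid * thread_slice_len, step = scalar_per_access
        for(int off = tid * thread_slice_len; off < (tid + 1) * thread_slice_len;
            off += scalar_per_access)
            std::printf(" [%2d..%2d)", off, off + scalar_per_access);

        std::printf("\ntid %d interleaved :", tid);
        // new scheme: origin = tid * scalar_per_access,
        //             step   = cluster_length * scalar_per_access
        for(int off = tid * scalar_per_access; off < block_slice_length;
            off += cluster_length * scalar_per_access)
            std::printf(" [%2d..%2d)", off, off + scalar_per_access);
        std::printf("\n");
    }
}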
......@@ -498,16 +498,10 @@ struct GridwiseGemmMultipleD_Wmma
if constexpr(AEnableLds)
{
// AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
// Debug this part
constexpr auto A_KRow = 2;
constexpr auto A_K0PerRow = ABlockDesc_{}.GetLength(I0) / A_KRow;
constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
// return make_naive_tensor_descriptor_packed(make_tuple(Number<A_K0PerRow>{},
// Number<MRepeat>{},
// Number<MWaves>{},
// Number<A_KRow>{},
// Number<MPerWmma>{},
// Number<A_K1>{}));
return transform_tensor_descriptor(
ABlockDesc_{},
make_tuple(
......@@ -545,7 +539,6 @@ struct GridwiseGemmMultipleD_Wmma
if constexpr(BEnableLds)
{
// BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1
#if 1
constexpr auto B_KRow = 2;
constexpr auto B_K0PerRow = BBlockDesc_{}.GetLength(I0) / B_KRow;
constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
......@@ -558,18 +551,6 @@ struct GridwiseGemmMultipleD_Wmma
make_pass_through_transform(Number<B_K1>{})),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
#endif
#if 0
constexpr auto B_KRow = 2;
constexpr auto B_K0PerRow = BBlockDesc_{}.GetLength(I0) / B_KRow;
constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
return make_naive_tensor_descriptor_packed(make_tuple(Number<B_K0PerRow>{},
Number<NRepeat>{},
Number<NWaves>{},
Number<B_KRow>{},
Number<NPerWmma>{},
Number<B_K1>{}));
#endif
}
else
{
......
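The transform above reshapes the LDS block descriptor from (AK0, M, AK1) to (AK0 / A_KRow, MRepeat, MWaves, A_KRow, MPerWmma, AK1): AK0 is unmerged into (AK0 / 2, 2) and M into (MRepeat, MWaves, MPerWmma), while AK1 passes through; the B path mirrors this with NRepeat, NWaves, NPerWmma. A minimal index-arithmetic sketch of that unmerge, with small assumed sizes:

// Hypothetical illustration (sizes are assumptions, not taken from the commit):
// unmerge (ak0, m, ak1) -> (ak0 / KRow, mrepeat, mwave, krow, mperwmma, ak1).
#include <cstdio>

int main()
{
    constexpr int KRow = 2, MRepeat = 2, MWaves = 2, MPerWmma = 16;

    const int ak0 = 5, m = 37, ak1 = 3; // an arbitrary source index

    const int k0_per_row = ak0 / KRow;
    const int krow       = ak0 % KRow;
    const int mrepeat    = m / (MWaves * MPerWmma);
    const int mwave      = (m / MPerWmma) % MWaves;
    const int mperwmma   = m % MPerWmma;

    std::printf("(%d, %d, %d) -> (%d, %d, %d, %d, %d, %d)\n",
                ak0, m, ak1, k0_per_row, mrepeat, mwave, krow, mperwmma, ak1);
    // prints (5, 37, 3) -> (2, 1, 0, 1, 5, 3)
}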
......@@ -1137,15 +1137,10 @@ struct ThreadwiseTensorSliceTransfer_v4
move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
#if 0
printf("Tid: %03d, Inele_Offset: %d, Coord: (%d, %d, %d, %d, %d, %d)\n",
printf("Tid: %03d, LDS read bank: %ld, Inele_Offset: %d\n",
get_thread_local_1d_id(),
src_data_coord.GetOffset(),
src_data_coord.GetIndex().At(Number<0>{}),
src_data_coord.GetIndex().At(Number<1>{}),
src_data_coord.GetIndex().At(Number<2>{}),
src_data_coord.GetIndex().At(Number<3>{}),
src_data_coord.GetIndex().At(Number<4>{}),
src_data_coord.GetIndex().At(Number<5>{}));
(src_data_coord.GetOffset()*sizeof(SrcData)/4) %32,
src_data_coord.GetOffset());
#endif
vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
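The (disabled) debug printf above derives the LDS bank from the element offset: assuming the usual 32-bank x 4-byte LDS organisation, bank = (element_offset * sizeof(SrcData) / 4) % 32, so accesses from one wave that hit the same bank at different dword addresses serialise. A standalone sketch of the same calculation:

// Hypothetical illustration of the bank formula used in the disabled debug printf:
// bank = (element_offset * sizeof(T) / 4) % 32, assuming 32 banks of 4 bytes.
#include <cstddef>
#include <cstdio>

constexpr int lds_bank(std::size_t element_offset, std::size_t elem_size_bytes)
{
    return static_cast<int>((element_offset * elem_size_bytes / 4) % 32);
}

int main()
{
    // e.g. fp16 elements (2 bytes): offsets 0, 64 and 128 all hit bank 0,
    // so a wave issuing them in the same cycle would serialise on that bank.
    for(std::size_t off : {0u, 64u, 70u, 128u})
        std::printf("elem offset %3zu -> bank %2d\n", off, lds_bank(off, 2));
}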
......@@ -1188,12 +1183,6 @@ struct ThreadwiseTensorSliceTransfer_v4
dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
});
#if 0
printf("Tid: %03d, Inele_Offset: %d\n",
get_thread_local_1d_id(),
dst_desc.CalculateOffset(
dst_origin_idx + data_to_origin_disp_idx));
#endif
});
}
......@@ -1410,7 +1399,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
constexpr index_t dst_offset = dst_desc.CalculateOffset(
dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
SrcData v_this_row, v_theother_row;
SrcData v_this_row;
// int-typed temporary value required by the intrinsic
int temp = 0;
......@@ -1425,6 +1414,8 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
v_this_row = type_convert_sp<SrcData>(temp);
}
dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
#if 0
// apply inter-row permute.
temp = __builtin_amdgcn_permlanex16(temp,
type_convert_sp<int>(v_this_row),
......@@ -1448,6 +1439,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
type_convert_sp<DstData>(v_this_row);
dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_theother_row);
}
#endif
});
});
}
......
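The #if 0 above disables the second half of the inter-row permute path: only v_this_row is written, and the cross-row value (v_theother_row), which the original code obtained with __builtin_amdgcn_permlanex16, is no longer produced. Conceptually that permute exchanges values between the two 16-lane rows of a 32-lane wave; the following is a host-side simulation of that exchange pattern only, not the intrinsic itself:

// Hypothetical host-side simulation of a cross-row exchange within a 32-lane wave:
// lane i of row 0 swaps values with lane i of row 1, i.e. partner = lane ^ 16.
#include <array>
#include <cstdio>

int main()
{
    std::array<int, 32> v{};
    for(int lane = 0; lane < 32; ++lane)
        v[lane] = 100 + lane; // each lane holds its own value

    std::array<int, 32> other{};
    for(int lane = 0; lane < 32; ++lane)
        other[lane] = v[lane ^ 16]; // value held by the partner lane in the other row

    // lane 3 sees lane 19's value, lane 19 sees lane 3's value, etc.
    std::printf("lane  3: own %d, other row %d\n", v[3], other[3]);
    std::printf("lane 19: own %d, other row %d\n", v[19], other[19]);
}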
......@@ -49,7 +49,8 @@ struct lambda_scalar_per_access_for_src_and_dst
// 2. SrcBuffer and DstBuffer are DynamicBuffer
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
// 4. Use thread buffer
template <typename SliceLengths,
template <typename ThreadClusterLengths,
typename SliceLengths,
typename SrcElementwiseOperation,
typename DstElementwiseOperation,
InMemoryDataOperationEnum DstInMemOp,
......@@ -134,13 +135,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1
constexpr auto ordered_src_access_lengths =
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
constexpr auto src_move_per_access = ThreadClusterLengths{}*src_scalar_per_access;
// make forward steps
const auto src_forward_steps = generate_tuple(
[&](auto i) {
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
// Move by the block-wise stride instead of the thread-wise stride
forward_step_idx(j) = (i.value == j.value) ? src_move_per_access[i] : 0;
});
return make_tensor_coordinate_step(src_desc, forward_step_idx);
......@@ -153,7 +156,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
backward_step_idx(j) = (i.value == j.value) ? -src_move_per_access[i] : 0;
});
return make_tensor_coordinate_step(src_desc, backward_step_idx);
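Because each thread now starts at a per-vector offset, every step of the access loop has to jump over the vectors owned by the other threads in the cluster; that is what src_move_per_access (and dst_move_per_access below) encode: the per-dimension step grows from scalar_per_access to ThreadClusterLengths * scalar_per_access. A small sketch of that step computation, with assumed lengths:

// Hypothetical illustration: per-dimension move step for the interleaved scheme,
// src_move_per_access = ThreadClusterLengths * src_scalar_per_access (elementwise).
#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 3;
    // Assumed values (not from the commit): cluster 4 x 8 x 4 threads, vector on dim 2, width 8.
    constexpr std::array<int, nDim> thread_cluster_lengths{4, 8, 4};
    constexpr std::array<int, nDim> src_scalar_per_access{1, 1, 8};

    for(int d = 0; d < nDim; ++d)
    {
        const int move = thread_cluster_lengths[d] * src_scalar_per_access[d];
        std::printf("dim %d: thread-wise step %d -> block-wise step %d\n",
                    d, src_scalar_per_access[d], move);
    }
}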
......@@ -194,7 +197,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
src_scalar_per_access;
}();
#if 0
printf("Tid: %03d, global buf offset inbyte: %ld, inele offset: %d, coord: (%d, %d, %d)\n", get_thread_local_1d_id(),
src_coord_.GetOffset()*sizeof(DstData), src_coord_.GetOffset(),
src_coord_.GetIndex().At(Number<0>{}), src_coord_.GetIndex().At(Number<1>{}), src_coord_.GetIndex().At(Number<2>{}));
#endif
constexpr auto src_data_idx_seq = generate_sequence_v2(
[&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});
......@@ -369,6 +376,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto dst_move_per_access = ThreadClusterLengths{}*dst_scalar_per_access;
constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dst_dim_access_order = DstDimAccessOrder{};
......@@ -382,7 +391,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
Index forward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
forward_step_idx(j) = (i.value == j.value) ? dst_move_per_access[i] : 0;
});
return make_tensor_coordinate_step(dst_desc, forward_step_idx);
......@@ -395,7 +404,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
Index backward_step_idx;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
backward_step_idx(j) = (i.value == j.value) ? -dst_move_per_access[i] : 0;
});
return make_tensor_coordinate_step(dst_desc, backward_step_idx);
......@@ -439,7 +448,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
constexpr auto dst_data_idx_seq = generate_sequence_v2(
[&](auto i) { return Number<dst_data_idx[i]>{}; }, Number<dst_data_idx.Size()>{});
#if 0
printf("Tid: %03d, LDS write bank: %ld, inele offset: %04d, coord (%d, %d, %d)\n", get_thread_local_1d_id(),
(dst_coord_.GetOffset()*sizeof(DstData)/4 )%32, dst_coord_.GetOffset(),
dst_coord_.GetIndex().At(Number<0>{}), dst_coord_.GetIndex().At(Number<1>{}), dst_coord_.GetIndex().At(Number<2>{}));
#endif
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
......@@ -505,7 +518,16 @@ struct ThreadwiseTensorSliceTransfer_v3r1
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
#if 0
const auto dst_reset_idx = GetDstCoordinateResetStep();
if (get_thread_local_1d_id()==0)
{
printf("dst_reset_step: %d, %d, %d\n",
dst_reset_idx.At(Number<0>{}),
dst_reset_idx.At(Number<1>{}),
dst_reset_idx.At(Number<2>{}));
}
#endif
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
......@@ -517,12 +539,19 @@ struct ThreadwiseTensorSliceTransfer_v3r1
constexpr auto src_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto src_access_unit = SliceLengths{} / src_scalar_per_access;
constexpr auto src_access_unit_helper = generate_sequence(
detail::lambda_scalar_per_access<SrcVectorDim, 1>{}, Number<nDim>{});
constexpr auto src_access_stride = ThreadClusterLengths{} * (src_access_unit - src_access_unit_helper);
constexpr auto src_dim_access_order = SrcDimAccessOrder{};
constexpr auto ordered_src_access_lengths =
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
constexpr auto ordered_src_access_stride =
container_reorder_given_new2old(src_access_stride, src_dim_access_order);
constexpr auto ordered_src_access_unit =
container_reorder_given_new2old(src_access_unit, src_dim_access_order);
// decide whether to move forward or backward during the last iteration
constexpr auto forward_sweep = [&]() {
......@@ -531,10 +560,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_lengths[I0] - 1;
index_t tmp = ordered_src_access_unit[I0] - 1;
static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
tmp = tmp * ordered_src_access_unit[j] + ordered_src_access_unit[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
......@@ -549,7 +578,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
ordered_idx(i) = forward_sweep[i] ? ordered_src_access_stride[i] : 0;
});
return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
......@@ -564,7 +593,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
return reset_src_data_step_;
}();
#if 0
if (get_thread_local_1d_id()==0)
{
printf("ordered_src_access_stride: %d, %d, %d | src_data_idx: %d, %d, %d\n",
ordered_src_access_stride.At(Number<0>{}).value,
ordered_src_access_stride.At(Number<1>{}).value,
ordered_src_access_stride.At(Number<2>{}).value,
src_data_idx.At(Number<0>{}),
src_data_idx.At(Number<1>{}),
src_data_idx.At(Number<2>{}));
}
#endif
return reset_src_data_step;
}
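The forward_sweep parity above (and its dst counterpart below) implements the snake traversal used by RunRead and RunWrite: dimension 0 always advances forward, while an inner dimension runs forward on even passes of the outer dimensions and backward on odd passes, so consecutive accesses stay close together. A small sketch of that traversal over a 2-D access grid with assumed lengths:

// Hypothetical illustration of the snake traversal implied by forward_sweep:
// dim 0 always moves forward; dim 1 alternates direction with the parity of dim 0.
#include <cstdio>

int main()
{
    constexpr int len0 = 3, len1 = 4; // assumed access lengths per dimension

    for(int i0 = 0; i0 < len0; ++i0)
    {
        const bool forward = (i0 % 2 == 0); // parity of completed outer steps
        for(int step = 0; step < len1; ++step)
        {
            const int i1 = forward ? step : len1 - 1 - step;
            std::printf("(%d,%d) ", i0, i1);
        }
    }
    std::printf("\n"); // (0,0) (0,1) (0,2) (0,3) (1,3) (1,2) (1,1) (1,0) (2,0) ...
}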
......@@ -574,13 +614,29 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// TODO: don't use lambda_scalar_per_access
constexpr auto dst_scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
constexpr auto dst_access_unit = SliceLengths{} / dst_scalar_per_access;
constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
constexpr auto dst_access_unit_helper = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, 1>{}, Number<nDim>{});
constexpr auto dst_access_strides = ThreadClusterLengths{} * (dst_access_unit - dst_access_unit_helper);
#if 0
if (get_thread_local_1d_id()==0)
{
printf("dst_access_strides: %d, %d, %d\n",
dst_access_strides.At(Number<0>{}).value,
dst_access_strides.At(Number<1>{}).value,
dst_access_strides.At(Number<2>{}).value);
}
#endif
constexpr auto dst_dim_access_order = DstDimAccessOrder{};
constexpr auto ordered_dst_access_lengths =
container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
constexpr auto ordered_dst_access_strides =
container_reorder_given_new2old(dst_access_strides, dst_dim_access_order);
constexpr auto ordered_dst_access_unit =
container_reorder_given_new2old(dst_access_unit, dst_dim_access_order);
// decide whether to move forward or backward during the last iteration
constexpr auto forward_sweep = [&]() {
......@@ -589,10 +645,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_lengths[I0] - 1;
index_t tmp = ordered_dst_access_unit[I0] - 1;
static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
tmp = tmp * ordered_dst_access_unit[j] + ordered_dst_access_unit[j] - 1;
});
forward_sweep_(i) = tmp % 2 == 0;
......@@ -600,14 +656,22 @@ struct ThreadwiseTensorSliceTransfer_v3r1
return forward_sweep_;
}();
#if 0
if (get_thread_local_1d_id()==0)
{
printf("forward_sweep: %d, %d, %d\n",
forward_sweep[Number<0>{}],
forward_sweep[Number<1>{}],
forward_sweep[Number<2>{}]);
}
#endif
// calculate dst data index after the last iteration in RunWrite(), if it has not been reset by
// RunWrite()
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_strides[i] : 0;
});
return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
......@@ -637,6 +701,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
#if 0
if (get_thread_local_1d_id()==0)
{
printf("InputSrcStep: %d, %d, %d | MoveSrcSliceWindowStep: %d, %d, %d\n",
src_slice_origin_step_idx.At(Number<0>{}),
src_slice_origin_step_idx.At(Number<1>{}),
src_slice_origin_step_idx.At(Number<2>{}),
adjusted_step_idx.At(Number<0>{}),
adjusted_step_idx.At(Number<1>{}),
adjusted_step_idx.At(Number<2>{}));
}
#endif
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
......