Commit f87dddae authored by Jing Zhang's avatar Jing Zhang
Browse files

add BGlobalMoveSliceWindowStepHacks{}

parent 31a440b9
......@@ -394,7 +394,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
{
// even iteration
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step);
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf,
......@@ -409,7 +410,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step);
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf,
......@@ -432,7 +434,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left
{
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step);
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf,
......@@ -462,7 +465,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step);
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
e0_block_data_begin += 1;
......@@ -497,7 +501,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
{
// even iteration
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step);
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf,
......@@ -512,7 +517,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step);
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf,
......@@ -535,7 +541,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left
{
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step);
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf,
......
......@@ -666,6 +666,25 @@ struct ThreadwiseTensorSliceTransfer_v2
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
template <typename SrcMoveSliceWindowStepHack>
__device__ void
MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx,
const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(
src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
private:
SrcCoord src_coord_;
}; // namespace ck
......
......@@ -66,8 +66,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr index_t HoPerBlock = 16;
constexpr index_t WoPerBlock = 16;
constexpr index_t E1 = 2;
constexpr index_t E2 = 8;
constexpr index_t E1 = 4;
constexpr index_t E2 = 4;
constexpr index_t EPerBlock = 2;
constexpr index_t KPerThread = KPerBlock;
......@@ -75,8 +75,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 1, 1, 8>;
using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock, 16, 1>;
using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, E1, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
......@@ -100,7 +100,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock, 16, 1>;
......
......@@ -53,7 +53,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_hi_wi_c_global_desc.GetLength(I0);
const auto Hi = in_n_hi_wi_c_global_desc.GetLength(I1);
......@@ -269,12 +268,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
constexpr bool has_main_k_block_loop = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
constexpr bool has_double_tail_k_block_loop = (E1 / E1PerBlock) % 2 == 0;
const bool has_e0_block_loop = E0 > 1;
std::cerr << "has_main_k_block_loop = " << has_main_k_block_loop
<< " has_double_tail_k_block_loop = " << has_double_tail_k_block_loop
<< std::endl;
<< " has_e0_block_loop = " << has_e0_block_loop << std::endl;
const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_pass_through_transform(I0)),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment