Commit f87dddae authored by Jing Zhang's avatar Jing Zhang
Browse files

add BGlobalMoveSliceWindowStepHacks{}

parent 31a440b9
...@@ -394,7 +394,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -394,7 +394,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
{ {
// even iteration // even iteration
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step); b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf, b_global_buf,
...@@ -409,7 +410,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -409,7 +410,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step); b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf, b_global_buf,
...@@ -432,7 +434,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -432,7 +434,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left
{ {
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step); b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf, b_global_buf,
...@@ -462,7 +465,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -462,7 +465,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step); b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
e0_block_data_begin += 1; e0_block_data_begin += 1;
...@@ -497,7 +501,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -497,7 +501,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
{ {
// even iteration // even iteration
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step); b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf, b_global_buf,
...@@ -512,7 +517,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -512,7 +517,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step); b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf, b_global_buf,
...@@ -535,7 +541,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -535,7 +541,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left
{ {
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step); b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
b_global_buf, b_global_buf,
......
...@@ -666,6 +666,25 @@ struct ThreadwiseTensorSliceTransfer_v2 ...@@ -666,6 +666,25 @@ struct ThreadwiseTensorSliceTransfer_v2
move_tensor_coordinate(src_desc, src_coord_, adjusted_step); move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
} }
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
template <typename SrcMoveSliceWindowStepHack>
__device__ void
MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx,
const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(
src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
private: private:
SrcCoord src_coord_; SrcCoord src_coord_;
}; // namespace ck }; // namespace ck
......
...@@ -66,8 +66,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk( ...@@ -66,8 +66,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr index_t HoPerBlock = 16; constexpr index_t HoPerBlock = 16;
constexpr index_t WoPerBlock = 16; constexpr index_t WoPerBlock = 16;
constexpr index_t E1 = 2; constexpr index_t E1 = 4;
constexpr index_t E2 = 8; constexpr index_t E2 = 4;
constexpr index_t EPerBlock = 2; constexpr index_t EPerBlock = 2;
constexpr index_t KPerThread = KPerBlock; constexpr index_t KPerThread = KPerBlock;
...@@ -75,8 +75,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk( ...@@ -75,8 +75,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr index_t WoPerThread = 2; constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1; constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 1, 1, 8>; using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock, 16, 1>; using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, E1, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
...@@ -100,7 +100,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk( ...@@ -100,7 +100,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr index_t KPerThread = KPerBlock; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2; constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2; constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock; constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 9, 1, E2>; using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock, 16, 1>; using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock, 16, 1>;
......
...@@ -53,7 +53,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp ...@@ -53,7 +53,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_hi_wi_c_global_desc.GetLength(I0); const auto N = in_n_hi_wi_c_global_desc.GetLength(I0);
const auto Hi = in_n_hi_wi_c_global_desc.GetLength(I1); const auto Hi = in_n_hi_wi_c_global_desc.GetLength(I1);
...@@ -268,13 +267,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp ...@@ -268,13 +267,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N; const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
constexpr bool has_main_k_block_loop = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1; constexpr bool has_main_k_block_loop = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
constexpr bool has_double_tail_k_block_loop = (E1 / E1PerBlock) % 2 == 0; constexpr bool has_double_tail_k_block_loop = (E1 / E1PerBlock) % 2 == 0;
const bool has_e0_block_loop = E0 > 1;
std::cerr << "has_main_k_block_loop = " << has_main_k_block_loop std::cerr << "has_main_k_block_loop = " << has_main_k_block_loop
<< " has_double_tail_k_block_loop = " << has_double_tail_k_block_loop << " has_double_tail_k_block_loop = " << has_double_tail_k_block_loop
<< std::endl; << " has_e0_block_loop = " << has_e0_block_loop << std::endl;
const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor = const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_pass_through_transform(I0)), make_single_stage_tensor_adaptor(make_tuple(make_pass_through_transform(I0)),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment