Commit 3d66465a authored by Jing Zhang
Browse files

add non-pad

parent bf111ac6
...@@ -31,6 +31,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -31,6 +31,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
{ {
template <typename... Wei, template <typename... Wei,
typename... In, typename... In,
typename... Add,
typename... Out, typename... Out,
typename ConvStrides, typename ConvStrides,
typename ConvDilations, typename ConvDilations,
...@@ -38,6 +39,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -38,6 +39,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
typename InRightPads> typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc, __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc, const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc, const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
...@@ -45,6 +47,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -45,6 +47,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
const InRightPads& in_right_pads, const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_wei_global, const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global, const FloatAB* __restrict__ p_in_global,
const FloatC* __restrict__ p_d_global,
FloatC* __restrict__ p_out_global) const FloatC* __restrict__ p_out_global) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
...@@ -63,6 +66,9 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -63,6 +66,9 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto Hox2 = Ho * 2;
const auto Wox2 = Wo * 2;
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4); const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K = wei_k_c_y_x_global_desc.GetLength(I0); const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
...@@ -127,6 +133,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -127,6 +133,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// add tensor
const auto add_k_n_hopx2_wopx2_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2)),
make_tuple(make_pass_through_transform(K0),
make_pass_through_transform(N),
make_pass_through_transform(Hox2),
make_pass_through_transform(Wox2)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto E = C * Y * X; const auto E = C * Y * X;
if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 && if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 &&
...@@ -158,16 +174,15 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -158,16 +174,15 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format // hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{}, make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}), Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{}, make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0>{}));
#if 0
// GEMM // GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3< using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3<
BlockSize, BlockSize,
...@@ -177,6 +192,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -177,6 +192,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
InMemoryDataOperation::Set, InMemoryDataOperation::Set,
decltype(wei_e_k_global_desc), decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc), decltype(in_e_n_ho_wo_global_desc),
decltype(add_k_n_hopx2_wopx2_global_desc),
decltype(out_k_n_ho_wo_global_desc), decltype(out_k_n_ho_wo_global_desc),
KPerBlock, KPerBlock,
HoPerBlock, HoPerBlock,
...@@ -228,13 +244,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -228,13 +244,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
for(index_t j = 0; j < nrepeat; ++j) for(index_t j = 0; j < nrepeat; ++j)
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) #if 0
//if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc), decltype(wei_e_k_global_desc),
const FloatAB*, const FloatAB*,
decltype(in_e_n_ho_wo_global_desc), decltype(in_e_n_ho_wo_global_desc),
const FloatAB*, const FloatAB*,
decltype(add_k_n_hopx2_wopx2_global_desc),
const FloatC*,
decltype(out_k_n_ho_wo_global_desc), decltype(out_k_n_ho_wo_global_desc),
FloatC*, FloatC*,
integral_constant<bool, true>, integral_constant<bool, true>,
...@@ -249,22 +268,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -249,22 +268,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global, p_wei_global,
in_e_n_ho_wo_global_desc, in_e_n_ho_wo_global_desc,
p_in_global, p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc, out_k_n_ho_wo_global_desc,
p_out_global, p_out_global,
integral_constant<bool, true>{}, integral_constant<bool, true>{},
integral_constant<bool, true>{}); integral_constant<bool, true>{});
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) #elif 1
// else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel =
decltype(wei_e_k_global_desc), run_gridwise_operation<gridwise_gemm,
const FloatAB*, decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc), const FloatAB*,
const FloatAB*, decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_ho_wo_global_desc), const FloatAB*,
FloatC*, decltype(add_k_n_hopx2_wopx2_global_desc),
integral_constant<bool, true>, const FloatC*,
integral_constant<bool, false>>; decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel, launch_kernel(kernel,
dim3(GridSize), dim3(GridSize),
...@@ -275,22 +300,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -275,22 +300,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global, p_wei_global,
in_e_n_ho_wo_global_desc, in_e_n_ho_wo_global_desc,
p_in_global, p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc, out_k_n_ho_wo_global_desc,
p_out_global, p_out_global,
integral_constant<bool, true>{}, integral_constant<bool, true>{},
integral_constant<bool, false>{}); integral_constant<bool, false>{});
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) #elif 1
// else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel =
decltype(wei_e_k_global_desc), run_gridwise_operation<gridwise_gemm,
const FloatAB*, decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc), const FloatAB*,
const FloatAB*, decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_ho_wo_global_desc), const FloatAB*,
FloatC*, decltype(add_k_n_hopx2_wopx2_global_desc),
integral_constant<bool, false>, const FloatC*,
integral_constant<bool, true>>; decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel, launch_kernel(kernel,
dim3(GridSize), dim3(GridSize),
...@@ -301,22 +332,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -301,22 +332,28 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global, p_wei_global,
in_e_n_ho_wo_global_desc, in_e_n_ho_wo_global_desc,
p_in_global, p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc, out_k_n_ho_wo_global_desc,
p_out_global, p_out_global,
integral_constant<bool, false>{}, integral_constant<bool, false>{},
integral_constant<bool, true>{}); integral_constant<bool, true>{});
} }
else #elif 1
// else
{ {
const auto kernel = run_gridwise_operation<gridwise_gemm, const auto kernel =
decltype(wei_e_k_global_desc), run_gridwise_operation<gridwise_gemm,
const FloatAB*, decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc), const FloatAB*,
const FloatAB*, decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_ho_wo_global_desc), const FloatAB*,
FloatC*, decltype(add_k_n_hopx2_wopx2_global_desc),
integral_constant<bool, false>, const FloatC*,
integral_constant<bool, false>>; decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel, launch_kernel(kernel,
dim3(GridSize), dim3(GridSize),
...@@ -327,11 +364,14 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -327,11 +364,14 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
p_wei_global, p_wei_global,
in_e_n_ho_wo_global_desc, in_e_n_ho_wo_global_desc,
p_in_global, p_in_global,
add_k_n_hopx2_wopx2_global_desc,
p_d_global,
out_k_n_ho_wo_global_desc, out_k_n_ho_wo_global_desc,
p_out_global, p_out_global,
integral_constant<bool, false>{}, integral_constant<bool, false>{},
integral_constant<bool, false>{}); integral_constant<bool, false>{});
} }
#endif
} }
timer.End(); timer.End();
...@@ -346,7 +386,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -346,7 +386,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl; << std::endl;
} }
#endif
} }
}; };
} // namespace ck } // namespace ck
......
...@@ -192,11 +192,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -192,11 +192,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format // hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{}, make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}), Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{}, make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0>{}));
...@@ -262,7 +262,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -262,7 +262,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
for(index_t j = 0; j < nrepeat; ++j) for(index_t j = 0; j < nrepeat; ++j)
{ {
if(has_main_k_block_loop && has_double_tail_k_block_loop) #if 0
//if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
run_gridwise_operation<gridwise_gemm, run_gridwise_operation<gridwise_gemm,
...@@ -293,7 +294,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -293,7 +294,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, true>{}, integral_constant<bool, true>{},
integral_constant<bool, true>{}); integral_constant<bool, true>{});
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) #elif 1
// else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
run_gridwise_operation<gridwise_gemm, run_gridwise_operation<gridwise_gemm,
...@@ -324,7 +326,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -324,7 +326,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, true>{}, integral_constant<bool, true>{},
integral_constant<bool, false>{}); integral_constant<bool, false>{});
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) #elif 1
// else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
run_gridwise_operation<gridwise_gemm, run_gridwise_operation<gridwise_gemm,
...@@ -355,7 +358,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -355,7 +358,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, false>{}, integral_constant<bool, false>{},
integral_constant<bool, true>{}); integral_constant<bool, true>{});
} }
else #elif 1
// else
{ {
const auto kernel = const auto kernel =
run_gridwise_operation<gridwise_gemm, run_gridwise_operation<gridwise_gemm,
...@@ -386,6 +390,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -386,6 +390,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
integral_constant<bool, false>{}, integral_constant<bool, false>{},
integral_constant<bool, false>{}); integral_constant<bool, false>{});
} }
#endif
} }
timer.End(); timer.End();
......
...@@ -134,8 +134,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -134,8 +134,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto KPerThreadSubC = 4; constexpr auto KPerThreadSubC = 4;
constexpr auto HoPerThreadSubC = 2; constexpr auto HoPerThreadSubC = HPerThread;
constexpr auto WoPerThreadSubC = 2; constexpr auto WoPerThreadSubC = WPerThread;
static_assert(KPerThread % KPerThreadSubC == 0, ""); static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, ""); static_assert(HPerThread % HoPerThreadSubC == 0, "");
......
...@@ -175,6 +175,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3 ...@@ -175,6 +175,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock; const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock; const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
const index_t k_thread_data_on_global = k_block_data_on_global + k_thread_id * KPerThread;
const index_t ho_thread_data_on_global = const index_t ho_thread_data_on_global =
ho_block_data_on_global + ho_thread_id * HoPerThread; ho_block_data_on_global + ho_thread_id * HoPerThread;
const index_t wo_thread_data_on_global = const index_t wo_thread_data_on_global =
...@@ -262,11 +263,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v3 ...@@ -262,11 +263,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
a_blockwise_copy.RunWrite(a_e_k_desc, p_a_block); a_blockwise_copy.RunWrite(a_e_k_desc, p_a_block);
} }
__syncthreads(); block_sync_lds();
//__syncthreads();
#if 1
constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector; constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
constexpr auto HoPerThreadx2 = HoPerThread * 2; constexpr auto HoPerThreadx2 = HoPerThread * 2;
constexpr auto WoPerThreadx2 = WoPerThread * 2; constexpr auto WoPerThreadx2 = WoPerThread * 2;
...@@ -281,25 +281,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v3 ...@@ -281,25 +281,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{}; constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
const index_t hox2_block_data_on_global = ho_block_work_id * HoPerBlock * 2; const index_t hox2_thread_data_on_global = ho_thread_data_on_global * 2;
const index_t wox2_block_data_on_global = wo_block_work_id * WoPerBlock * 2; const index_t wox2_thread_data_on_global = wo_thread_data_on_global * 2;
const index_t k_thread_data_on_global_add =
const index_t hox2_thread_data_on_global = k_thread_data_on_global / CThreadTransferDstScalarPerVector;
hox2_block_data_on_global + ho_thread_id * HoPerThreadx2;
const index_t wox2_thread_data_on_global =
wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, ""); static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
static_assert(CThreadTransferDstScalarPerVector == 16, ""); static_assert(CThreadTransferDstScalarPerVector == 16, "");
const index_t k_block_data_on_global_add =
k_block_work_id * KPerBlock / CThreadTransferDstScalarPerVector;
const index_t k_thread_data_on_global_add =
k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
static_assert(vec_len == 256, ""); static_assert(vec_len == 256, "");
vector_type<int8_t, vec_len> d_vec; vector_type<int8_t, vec_len> d_vec;
#if 1
{ {
ThreadwiseDynamicTensorSliceTransfer_v2< ThreadwiseDynamicTensorSliceTransfer_v2<
...@@ -419,104 +412,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3 ...@@ -419,104 +412,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
#endif #endif
// output: register to global memory // output: register to global memory
#if 0
{
constexpr auto HoPerThreadx2 = HoPerThread * 2;
constexpr auto WoPerThreadx2 = WoPerThread * 2;
const index_t hox2_block_data_on_global = ho_block_work_id * HoPerBlock * 2;
const index_t wox2_block_data_on_global = wo_block_work_id * WoPerBlock * 2;
const index_t hox2_thread_data_on_global =
hox2_block_data_on_global + ho_thread_id * HoPerThreadx2;
const index_t wox2_thread_data_on_global =
wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
const index_t k_block_data_on_global_add =
k_block_work_id * KPerBlock / CThreadTransferDstScalarPerVector;
const index_t k_thread_data_on_global_add =
k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
constexpr auto d_k_n_hox2_wox2_thread_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<1>{}, Number<1>{}, Number<1>{}, Number<1>{}));
constexpr auto vector_len = CThreadTransferDstScalarPerVector;
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
vector_type<int8_t, vector_len> d_vec;
for(index_t k_i = 0; k_i < KPerThreadAdd; ++k_i)
{
for(index_t h_i = 0; h_i < HoPerThreadx2; ++h_i)
{
for(index_t w_i = 0; w_i < WoPerThreadx2; ++w_i)
{
ThreadwiseDynamicTensorSliceTransfer_v2<
FloatC,
decltype(d_vec),
decltype(d_k_n_hox2_wox2_global_desc),
decltype(d_k_n_hox2_wox2_thread_desc),
Sequence<1, 1, 1, 1>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
// CThreadTransferDstScalarPerVector,
1,
AddressSpace::Global,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
1,
true>(d_k_n_hox2_wox2_global_desc,
make_multi_index(k_thread_data_on_global_add + k_i,
0,
hox2_thread_data_on_global + h_i,
wox2_thread_data_on_global + w_i))
.Run2(d_k_n_hox2_wox2_global_desc,
p_d_global,
d_k_n_hox2_wox2_thread_desc,
make_tuple(I0, I0, I0, I0),
d_vec,
c_k_n_ho_wo_global_tensor_iterator_hacks);
static_for<0, vector_len, 1>{}([&](auto i) {
d_vec.template AsType<int8_t>()(i) +=
p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(
make_tuple(k_i * vector_len + i, 0, h_i / 2, w_i / 2))];
});
ThreadwiseDynamicTensorSliceTransfer_v1r3<
decltype(d_vec),
FloatC,
decltype(d_k_n_hox2_wox2_thread_desc),
decltype(d_k_n_hox2_wox2_global_desc),
Sequence<1, 1, 1, 1>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
// CThreadTransferDstScalarPerVector,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true>(d_k_n_hox2_wox2_global_desc,
make_multi_index(k_thread_data_on_global_add + k_i,
0,
hox2_thread_data_on_global + h_i,
wox2_thread_data_on_global + w_i))
.Run2(d_k_n_hox2_wox2_thread_desc,
make_tuple(I0, I0, I0, I0),
d_vec,
d_k_n_hox2_wox2_global_desc,
p_c_global,
c_k_n_ho_wo_global_tensor_iterator_hacks);
}
}
}
}
#else
{ {
static_for<0, KPerThreadAdd, 1>{}([&](auto k_i) { static_for<0, KPerThreadAdd, 1>{}([&](auto k_i) {
static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) { static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) {
...@@ -569,7 +464,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3 ...@@ -569,7 +464,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
p_c_global, p_c_global,
c_k_n_ho_wo_global_tensor_iterator_hacks); c_k_n_ho_wo_global_tensor_iterator_hacks);
} }
#endif
} }
// pass tensor descriptor by reference // pass tensor descriptor by reference
......
...@@ -109,6 +109,29 @@ struct ThreadwiseGemm_km_kn_mn_v3 ...@@ -109,6 +109,29 @@ struct ThreadwiseGemm_km_kn_mn_v3
p_c[c_offset_2], p_c[c_offset_2],
p_c[c_offset_3]); p_c[c_offset_3]);
} }
else if constexpr(H == 1 && W == 4)
{
constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 2));
constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 3));
constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 2));
constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 3));
amd_assembly_outer_product_1x4(p_a[a_offset],
p_b[b_offset_0],
p_b[b_offset_1],
p_b[b_offset_2],
p_b[b_offset_3],
p_c[c_offset_0],
p_c[c_offset_1],
p_c[c_offset_2],
p_c[c_offset_3]);
}
else else
{ {
static_for<0, H, 1>{}([&](auto h) { static_for<0, H, 1>{}([&](auto h) {
......
...@@ -142,8 +142,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -142,8 +142,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr index_t EPerBlock = C0; constexpr index_t EPerBlock = C0;
constexpr index_t KPerThread = KPerBlock; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2; constexpr index_t HoPerThread = 4;
constexpr index_t WoPerThread = 2; constexpr index_t WoPerThread = 1;
constexpr index_t EPerThread = EPerBlock; constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>; using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
...@@ -184,7 +184,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -184,7 +184,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
#endif #endif
constexpr auto conv_driver = constexpr auto conv_driver =
#if 0 #if 1
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
#else #else
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
......
...@@ -95,7 +95,7 @@ int main(int argc, char* argv[]) ...@@ -95,7 +95,7 @@ int main(int argc, char* argv[])
#elif 1 #elif 1
constexpr index_t N = 1; constexpr index_t N = 1;
constexpr index_t C = 16; constexpr index_t C = 16;
constexpr index_t HI = 540; constexpr index_t HI = 544;
constexpr index_t WI = 960; constexpr index_t WI = 960;
constexpr index_t K = 16; constexpr index_t K = 16;
constexpr index_t Y = 3; constexpr index_t Y = 3;
...@@ -118,6 +118,20 @@ int main(int argc, char* argv[]) ...@@ -118,6 +118,20 @@ int main(int argc, char* argv[])
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t HI = 135;
constexpr index_t WI = 240;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>; using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>; using RightPads = Sequence<1, 1>;
#elif 0 #elif 0
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment