Commit 7e87e0b3 authored by Jing Zhang's avatar Jing Zhang
Browse files

tuning

parent f87dddae
...@@ -114,7 +114,7 @@ template <index_t BlockSize, ...@@ -114,7 +114,7 @@ template <index_t BlockSize,
index_t KPerBlock, index_t KPerBlock,
index_t HoPerBlock, index_t HoPerBlock,
index_t WoPerBlock, index_t WoPerBlock,
index_t EPerBlock, index_t E1PerBlock,
index_t KPerThread, index_t KPerThread,
index_t HoPerThread, index_t HoPerThread,
index_t WoPerThread, index_t WoPerThread,
...@@ -185,7 +185,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -185,7 +185,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize()); p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());
static_assert(E1 % EPerBlock == 0, ""); static_assert(E1 % E1PerBlock == 0, "");
// const auto E = a_e0_e1_k_e2_global_desc.GetLength(I0); // const auto E = a_e0_e1_k_e2_global_desc.GetLength(I0);
// const auto K = a_e0_e1_k_e2_global_desc.GetLength(I1); // const auto K = a_e0_e1_k_e2_global_desc.GetLength(I1);
...@@ -229,10 +229,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -229,10 +229,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3
make_tuple(Number<I1>{}, Number<E1>{}, Number<KPerBlock>{}, Number<E2>{}), make_tuple(Number<I1>{}, Number<E1>{}, Number<KPerBlock>{}, Number<E2>{}),
max_lds_align); max_lds_align);
// B matrix in LDS memory, dst of blockwise copy // B matrix in thread, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_e1_n_ho_wo_e2_block_desc = constexpr auto b_e1_n_ho_wo_e2_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<EPerBlock>{}, make_naive_tensor_descriptor_packed(make_tuple(Number<E1PerBlock>{},
Number<1>{}, Number<1>{},
Number<HoPerBlock>{}, Number<HoPerBlock>{},
Number<WoPerBlock>{}, Number<WoPerBlock>{},
...@@ -244,7 +243,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -244,7 +243,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{})); Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
constexpr auto a_e1_k_e2_block_desc = make_naive_tensor_descriptor_aligned( constexpr auto a_e1_k_e2_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align); make_tuple(Number<E1PerBlock>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);
auto blockwise_gemm = auto blockwise_gemm =
BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize, BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize,
...@@ -295,16 +294,17 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -295,16 +294,17 @@ struct GridwiseGemmDlops_km_kn_mn_v3
1, 1,
1, 1,
AThreadTransferSrcResetCoordinateAfterRun, AThreadTransferSrcResetCoordinateAfterRun,
true>(a_e0_e1_k_e2_global_desc, false>(
make_multi_index(0, 0, k_block_data_on_global, 0), a_e0_e1_k_e2_global_desc,
a_e0_e1_k_e2_block_desc, make_multi_index(0, 0, k_block_data_on_global, 0),
make_multi_index(0, 0, 0, 0)); a_e0_e1_k_e2_block_desc,
make_multi_index(0, 0, 0, 0));
constexpr auto a_block_slice_copy_step = make_multi_index(I1, 0, 0, 0); constexpr auto a_block_slice_copy_step = make_multi_index(I1, 0, 0, 0);
constexpr auto b_e0_e1_n_ho_wo_e2_thread_desc = constexpr auto b_e0_e1_n_ho_wo_e2_thread_desc =
make_naive_tensor_descriptor_packed(make_tuple(I1, make_naive_tensor_descriptor_packed(make_tuple(I1,
Number<EPerBlock>{}, Number<E1PerBlock>{},
Number<1>{}, Number<1>{},
Number<HoPerThread>{}, Number<HoPerThread>{},
Number<WoPerThread>{}, Number<WoPerThread>{},
...@@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
FloatAB, FloatAB,
decltype(b_e0_e1_n_ho_wo_e2_global_desc), decltype(b_e0_e1_n_ho_wo_e2_global_desc),
decltype(b_e0_e1_n_ho_wo_e2_thread_desc), decltype(b_e0_e1_n_ho_wo_e2_thread_desc),
Sequence<I1, EPerBlock, 1, HoPerThread, WoPerThread, E2>, Sequence<I1, E1PerBlock, 1, HoPerThread, WoPerThread, E2>,
BBlockTransferSrcAccessOrder, BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim, BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector,
...@@ -344,7 +344,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -344,7 +344,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{} Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
.Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0}); .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
constexpr auto b_thread_slice_copy_step = make_multi_index(0, EPerBlock, 0, 0, 0, 0); constexpr auto b_thread_slice_copy_step = make_multi_index(0, E1PerBlock, 0, 0, 0, 0);
// hack to control index calculation when iterating over A and B matrix for threadwise copy // hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr auto a_e0_e1_k_e2_global_step_hacks = AGlobalStepHacks{}; constexpr auto a_e0_e1_k_e2_global_step_hacks = AGlobalStepHacks{};
...@@ -407,7 +407,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -407,7 +407,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on current data // LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step, b_thread_slice_copy_step,
...@@ -423,11 +423,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -423,11 +423,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on current data // LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
e1_block_data_begin += 2 * EPerBlock; e1_block_data_begin += 2 * E1PerBlock;
} while(e1_block_data_begin < E1 - 2 * EPerBlock); } while(e1_block_data_begin < E1 - 2 * E1PerBlock);
} }
// LDS double buffer: tail // LDS double buffer: tail
...@@ -447,7 +447,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -447,7 +447,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on 2nd-last data // LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
// LDS double buffer: GEMM on last data // LDS double buffer: GEMM on last data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
...@@ -462,7 +462,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -462,7 +462,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
a_block_slice_copy_step, a_block_slice_copy_step,
AGlobalMoveSliceWindowStepHacks{}); AGlobalMoveSliceWindowStepHacks{});
blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - E1PerBlock), 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step, b_thread_slice_copy_step,
...@@ -514,7 +514,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -514,7 +514,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on current data // LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step, b_thread_slice_copy_step,
...@@ -530,11 +530,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -530,11 +530,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on current data // LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
e1_block_data_begin += 2 * EPerBlock; e1_block_data_begin += 2 * E1PerBlock;
} while(e1_block_data_begin < E1 - 2 * EPerBlock); } while(e1_block_data_begin < E1 - 2 * E1PerBlock);
} }
// LDS double buffer: tail // LDS double buffer: tail
...@@ -554,7 +554,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -554,7 +554,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on 2nd-last data // LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
// LDS double buffer: GEMM on last data // LDS double buffer: GEMM on last data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
......
...@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
const auto Y = wei_k_c_y_x_lengths[I2]; const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3]; const auto X = wei_k_c_y_x_lengths[I3];
constexpr auto InWeiVectorSize = 4; constexpr auto InWeiVectorSize = 8;
#if 1 #if 1
const auto C0 = C / Number<InWeiVectorSize>{}; const auto C0 = C / Number<InWeiVectorSize>{};
...@@ -106,9 +106,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -106,9 +106,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
constexpr index_t HoPerBlock = 8; constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32; constexpr index_t WoPerBlock = 32;
constexpr index_t E1 = 4 * 9; constexpr index_t E1 = 2 * 9;
constexpr index_t E2 = C1; constexpr index_t E2 = C1;
constexpr index_t EPerBlock = 4; constexpr index_t EPerBlock = 2;
constexpr index_t KPerThread = KPerBlock; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2; constexpr index_t HoPerThread = 2;
......
...@@ -10,8 +10,8 @@ template <ck::index_t BlockSize, ...@@ -10,8 +10,8 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::index_t E1, ck::index_t E1_,
ck::index_t E2, ck::index_t E2_,
ck::index_t KPerBlock, ck::index_t KPerBlock,
ck::index_t HoPerBlock, ck::index_t HoPerBlock,
ck::index_t WoPerBlock, ck::index_t WoPerBlock,
...@@ -95,12 +95,15 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -95,12 +95,15 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
const auto E = C0 * Y * X; const auto E = C0 * Y * X;
constexpr auto E1 = Number<E1_>{};
constexpr auto E2 = Number<E2_>{};
static_assert(E2 == C1, ""); static_assert(E2 == C1, "");
const auto E0 = E / E1; const auto E0 = E / E1;
// weight tensor // weight tensor
const auto a_e0_k_e2_grid_desc = transform_tensor_descriptor( const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)), make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
make_tuple(make_pass_through_transform(K), make_tuple(make_pass_through_transform(K),
make_pass_through_transform(C0 * Y * X), make_pass_through_transform(C0 * Y * X),
...@@ -109,7 +112,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -109,7 +112,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{})); make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
const auto a_e0_e1_k_e2_grid_desc = const auto a_e0_e1_k_e2_grid_desc =
transform_tensor_descriptor(a_e0_k_e2_grid_desc, transform_tensor_descriptor(a_e_k_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1)), make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(K), make_pass_through_transform(K),
make_pass_through_transform(E2)), make_pass_through_transform(E2)),
...@@ -139,7 +142,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -139,7 +142,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple( make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{})); Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
const auto b_e0_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( const auto b_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
in_n_c0_y_ho_x_wo_c1_global_desc, in_n_c0_y_ho_x_wo_c1_global_desc,
make_tuple(make_merge_transform(make_tuple(C0, Y, X)), make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
make_pass_through_transform(N), make_pass_through_transform(N),
...@@ -151,7 +154,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -151,7 +154,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
b_e0_n_ho_wo_e2_grid_desc, b_e_n_ho_wo_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1)), make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(N), make_pass_through_transform(N),
make_pass_through_transform(Hop), make_pass_through_transform(Hop),
...@@ -199,13 +202,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -199,13 +202,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack = constexpr auto b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}; Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
...@@ -245,17 +248,17 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -245,17 +248,17 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
ABlockTransferThreadSliceLengths_E0_E1_K_E2, ABlockTransferThreadSliceLengths_E0_E1_K_E2,
ABlockTransferThreadClusterLengths_E0_E1_K_E2, ABlockTransferThreadClusterLengths_E0_E1_K_E2,
Sequence<2, 0, 1, 3>, Sequence<2, 0, 1, 3>,
Sequence<2, 0, 1, 3>, Sequence<0, 1, 2, 3>,
3, 3,
ABlockTransferSrcScalarPerVector_E2, ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2, ABlockTransferDstScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 4, 1, 5>, Sequence<0, 1, 2, 3, 4, 5>,
5, 5,
BThreadTransferSrcScalarPerVector_E2, BThreadTransferSrcScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy, which will be fused with false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation // MoveSrcSliceWindow() to save addr computation
Sequence<2, 3, 1, 0>, Sequence<0, 1, 2, 3>,
0, 0,
CThreadTransferDstScalarPerVector_K, CThreadTransferDstScalarPerVector_K,
decltype(a_e0_e1_k_e2_global_step_hacks), decltype(a_e0_e1_k_e2_global_step_hacks),
...@@ -286,9 +289,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -286,9 +289,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
using CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo = using CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo =
decltype(c_blockid_to_k_n_ho_wo_block_cluster_adaptor); decltype(c_blockid_to_k_n_ho_wo_block_cluster_adaptor);
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
float ave_time = 0; float ave_time = 0;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
...@@ -393,8 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -393,8 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
c_k_n_hop_wop_grid_desc, c_k_n_hop_wop_grid_desc,
c_blockid_to_k_n_ho_wo_block_cluster_adaptor); c_blockid_to_k_n_ho_wo_block_cluster_adaptor);
} }
return ave_time;
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER #elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_e0_e1_k_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K_E2)); DeviceMem a_e0_e1_k_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K_E2));
DeviceMem b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf(sizeof(BGridDesc_E0_E1_N_Ho_Wo_E2)); DeviceMem b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf(sizeof(BGridDesc_E0_E1_N_Ho_Wo_E2));
...@@ -408,8 +409,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -408,8 +409,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.ToDevice( c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.ToDevice(
&c_blockid_to_k_n_ho_wo_block_cluster_adaptor); &c_blockid_to_k_n_ho_wo_block_cluster_adaptor);
float ave_time = 0;
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
...@@ -534,9 +533,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -534,9 +533,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
cast_pointer_to_constant_address_space( cast_pointer_to_constant_address_space(
c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.GetDeviceBuffer())); c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
return ave_time;
#endif #endif
return ave_time;
} }
}; };
#endif #endif
...@@ -20,12 +20,12 @@ ...@@ -20,12 +20,12 @@
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE 1 #define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V4R4_NCHW 0 #define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 0 #define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0 #define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NHWC 1 #define USE_CONV_FWD_V5R1_NHWC 0
#define USE_CONV_FWD_V5R1_NCHWC 0 #define USE_CONV_FWD_V5R1_NCHWC 1
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
...@@ -105,16 +105,16 @@ int main(int argc, char* argv[]) ...@@ -105,16 +105,16 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = std::stoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
constexpr auto N = Number<128>{}; constexpr auto N = Number<1>{};
constexpr auto C = Number<192>{}; constexpr auto C = Number<16>{};
constexpr auto Hi = Number<71>{}; constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<71>{}; constexpr auto Wi = Number<1920>{};
constexpr auto K = Number<256>{}; constexpr auto K = Number<16>{};
constexpr auto Y = Number<3>{}; constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{}; constexpr auto X = Number<3>{};
constexpr auto conv_stride_h = I2; constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I2; constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1; constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1; constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1; constexpr auto in_left_pad_h = I1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment