Commit fb7b4609 authored by Jing Zhang
Browse files

debug

parent 90276e6b
...@@ -181,6 +181,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -181,6 +181,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize()); p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());
static_assert(E1 % EPerBlock == 0, "");
// const auto E = a_e0_e1_k_global_desc.GetLength(I0); // const auto E = a_e0_e1_k_global_desc.GetLength(I0);
// const auto K = a_e0_e1_k_global_desc.GetLength(I1); // const auto K = a_e0_e1_k_global_desc.GetLength(I1);
......
...@@ -106,17 +106,17 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -106,17 +106,17 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
constexpr index_t WoPerBlock = 8; constexpr index_t WoPerBlock = 8;
constexpr index_t E1 = 16; constexpr index_t E1 = 16;
constexpr index_t EPerBlock = 16; constexpr index_t EPerBlock = 8;
constexpr index_t KPerThread = KPerBlock; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 1; constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1; constexpr index_t WoPerThread = 1;
constexpr index_t EPerThread = EPerBlock; constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<1, 4, 1>; using ABlockTransferThreadSliceLengths_E0_E1_K = Sequence<1, 4, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<1, 4, 16>; using ABlockTransferThreadClusterLengths_E0_E1_K = Sequence<1, 4, 16>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 4; constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1; constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_E = 1; constexpr index_t BThreadTransferSrcScalarPerVector_E = 1;
...@@ -139,8 +139,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -139,8 +139,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
HoPerThread, HoPerThread,
WoPerThread, WoPerThread,
EPerThread, EPerThread,
ABlockTransferThreadSliceLengths_E_K, ABlockTransferThreadSliceLengths_E0_E1_K,
ABlockTransferThreadClusterLengths_E_K, ABlockTransferThreadClusterLengths_E0_E1_K,
ABlockTransferSrcScalarPerVector_E, ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K, ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_E, BThreadTransferSrcScalarPerVector_E,
......
...@@ -256,6 +256,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -256,6 +256,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
const bool has_double_tail_k_block_loop = (E1 / EPerBlock) % 2 == 0; const bool has_double_tail_k_block_loop = (E1 / EPerBlock) % 2 == 0;
std::cerr << "has_main_k_block_loop = " << has_main_k_block_loop
<< " has_double_tail_k_block_loop = " << has_double_tail_k_block_loop
<< std::endl;
const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor = const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(I0, I0))), make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(I0, I0))),
make_tuple(Sequence<0, 1>{}), make_tuple(Sequence<0, 1>{}),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment