Commit f9560180 authored by Jing Zhang's avatar Jing Zhang
Browse files

debugging maxpool

parent b5bc31bd
...@@ -633,13 +633,15 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -633,13 +633,15 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
// if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) #if 0
// printf("a: %d b: %d c: %d d: %d bias: %d\n", if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
//(int)a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize(), printf("a: %d b: %d c: %d d: %d bias: %d\n",
//(int)b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize(), (int)a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize(),
//(int)c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize(), (int)b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize(),
//(int)d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize(), (int)c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize(),
//(int)bias_k0_k1_grid_desc.GetElementSpaceSize()); (int)d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize(),
(int)bias_k0_k1_grid_desc.GetElementSpaceSize());
#endif
constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop(); constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop();
constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop(); constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
......
...@@ -319,7 +319,6 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -319,7 +319,6 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
printf("half8_t buffer_load\n");
// use fp32 load to mimic fp16 load // use fp32 load to mimic fp16 load
float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
......
...@@ -126,10 +126,12 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1 ...@@ -126,10 +126,12 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
Sequence<1, E1PerBlock, 1, KPerBlock, 1>; Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2; constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#endif #endif
const auto in_n_c0_hi_wi_c1_desc = const auto in_n_c0_hi_wi_c1_desc =
......
...@@ -111,7 +111,7 @@ int main(int argc, char* argv[]) ...@@ -111,7 +111,7 @@ int main(int argc, char* argv[])
constexpr auto Wi = Number<1920>{}; constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{}; constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{}; constexpr auto X = Number<3>{};
constexpr auto C0 = Number<3>{}; constexpr auto C0 = Number<1>{};
constexpr auto C1 = Number<4>{}; constexpr auto C1 = Number<4>{};
constexpr auto K0 = Number<2>{}; constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{}; constexpr auto K1 = Number<8>{};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment