Commit b5bc31bd authored by Jing Zhang's avatar Jing Zhang
Browse files

clean

parent 1fb77ae6
...@@ -633,6 +633,14 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add ...@@ -633,6 +633,14 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
// if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
// printf("a: %d b: %d c: %d d: %d bias: %d\n",
//(int)a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize(),
//(int)b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize(),
//(int)c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize(),
//(int)d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize(),
//(int)bias_k0_k1_grid_desc.GetElementSpaceSize());
constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop(); constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop();
constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop(); constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
......
...@@ -319,6 +319,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -319,6 +319,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
printf("half8_t buffer_load\n");
// use fp32 load to mimic fp16 load // use fp32 load to mimic fp16 load
float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
......
...@@ -56,10 +56,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1 ...@@ -56,10 +56,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace()); in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace()); DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace()); out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data()); bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
...@@ -119,9 +118,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1 ...@@ -119,9 +118,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2; constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1; constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#endif #endif
......
...@@ -106,48 +106,41 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1 ...@@ -106,48 +106,41 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1
constexpr index_t CThreadTransferDstScalarPerVector_K = K1; constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif 1 #elif 1
constexpr auto BlockSize = 64; constexpr index_t BlockSize = 64;
constexpr auto KPerBlock = K; constexpr index_t KPerBlock = K;
constexpr auto HoPerBlock = 8; constexpr index_t HoPerBlock = 8;
constexpr auto WoPerBlock = 32; constexpr index_t WoPerBlock = 32;
constexpr auto E1 = C0 * 9; constexpr index_t E1 = C0 * Y * X;
constexpr auto E2 = C1 / InWeiVectorSize; constexpr index_t E2 = C1 / InWeiVectorSize;
constexpr auto K2 = 2; constexpr index_t K2 = 2;
constexpr auto E1PerBlock = C0; constexpr index_t E1PerBlock = C0;
constexpr auto KPerThread = K; constexpr index_t KPerThread = K;
constexpr auto HoPerThread = 2; constexpr index_t HoPerThread = 2;
constexpr auto WoPerThread = 2; constexpr index_t WoPerThread = 2;
constexpr auto EPerThread = 1; constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>; using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, Y * X, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
Sequence<1, E1PerBlock, 1, KPerBlock, 1>; Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr auto ABlockTransferSrcScalarPerVector_E2 = E2; constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr auto ABlockTransferDstScalarPerVector_E2 = E2; constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr auto BThreadTransferSrcScalarPerVector_E2 = E2; constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
constexpr auto CThreadTransferDstScalarPerVector_K = K1;
#endif #endif
const auto in_n_c0_hi_wi_c1_desc = const auto in_n_c0_hi_wi_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, C1)); make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
const auto wei_k_c0_y_x_c1_desc = const auto wei_k_c0_y_x_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, C1)); make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
const auto max_n_k0_hx_wx_k1_desc = const auto max_n_k0_hx_wx_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)); make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1));
const auto out_n_k0_ho_wo_k1_desc = const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)); make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
static_assert(in_n_c0_hi_wi_c1_desc.IsKnownAtCompileTime(), "");
static_assert(wei_k_c0_y_x_c1_desc.IsKnownAtCompileTime(), "");
static_assert(max_n_k0_hx_wx_k1_desc.IsKnownAtCompileTime(), "");
static_assert(out_n_k0_ho_wo_k1_desc.IsKnownAtCompileTime(), "");
constexpr auto conv_driver = constexpr auto conv_driver =
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool< DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool<
BlockSize, BlockSize,
......
...@@ -32,7 +32,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -32,7 +32,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
{ {
template <typename... Wei, template <typename... Wei,
typename... In, typename... In,
typename... Add, typename... MaxPool,
typename... Out, typename... Out,
typename ConvStrides, typename ConvStrides,
typename ConvDilations, typename ConvDilations,
...@@ -41,7 +41,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -41,7 +41,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
__host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc, __host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc, const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc, const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ck::TensorDescriptor<Add...>& max_n_k0_hx_wx_k1_global_desc, const ck::TensorDescriptor<MaxPool...>& max_n_k0_hx_wx_k1_global_desc,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
...@@ -96,6 +96,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -96,6 +96,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
const auto OutRightPadH = Hop - Ho; const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo; const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = OutRightPadH / 2;
const auto OutRightPadWx = OutRightPadW / 2;
const auto InLeftPadH = in_left_pads[I0]; const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1]; const auto InLeftPadW = in_left_pads[I1];
...@@ -187,8 +190,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -187,8 +190,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)), make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)), make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N), make_pass_through_transform(N),
make_pad_transform(Hx, I0, Number<OutRightPadH / 2>{}), make_pad_transform(Hx, I0, Number<OutRightPadHx>{}),
make_pad_transform(Wx, I0, Number<OutRightPadW / 2>{})), make_pad_transform(Wx, I0, Number<OutRightPadWx>{})),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
...@@ -290,6 +293,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -290,6 +293,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
// clang-format on // clang-format on
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_ho_wo_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");
// GEMM // GEMM
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3_add< using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3_add<
BlockSize, BlockSize,
...@@ -369,12 +376,17 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -369,12 +376,17 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
float ave_time = 0; float ave_time = 0;
static_assert(a_e0_e1_k0_k1_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE #if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_e0_block_loop) if(has_main_e0_block_loop)
{ {
const auto kernel = const auto kernel = kernel_gemm_dlops_v2_maxpool<
kernel_gemm_dlops_v2_add<GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>, remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
...@@ -402,8 +414,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -402,8 +414,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
} }
else else
{ {
const auto kernel = const auto kernel = kernel_gemm_dlops_v2_maxpool<
kernel_gemm_dlops_v2_add<GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>, remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
...@@ -454,8 +466,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -454,8 +466,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
if(has_main_e0_block_loop) if(has_main_e0_block_loop)
{ {
const auto kernel = const auto kernel = kernel_gemm_dlops_v2_maxpool<
kernel_gemm_dlops_v2_add<GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>, remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
...@@ -490,8 +502,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -490,8 +502,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
else else
{ {
const auto kernel = const auto kernel = kernel_gemm_dlops_v2_maxpool<
kernel_gemm_dlops_v2_add<GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>, remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
......
...@@ -105,14 +105,14 @@ int main(int argc, char* argv[]) ...@@ -105,14 +105,14 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<4>{}; constexpr auto C1 = Number<4>{};
constexpr auto K0 = Number<2>{}; constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{}; constexpr auto K1 = Number<8>{};
#elif 0 #elif 1
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{}; constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{}; constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{}; constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{}; constexpr auto X = Number<3>{};
constexpr auto C0 = Number<1>{}; constexpr auto C0 = Number<1>{};
constexpr auto C1 = Number<8>{}; constexpr auto C1 = Number<4>{};
constexpr auto K0 = Number<2>{}; constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{}; constexpr auto K1 = Number<8>{};
#elif 0 #elif 0
...@@ -125,7 +125,7 @@ int main(int argc, char* argv[]) ...@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<8>{}; constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{}; constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{}; constexpr auto K1 = Number<8>{};
#elif 1 #elif 0
constexpr auto N = Number<128>{}; constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{}; constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{}; constexpr auto Wi = Number<480>{};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment