Commit 27bad50b authored by Jing Zhang's avatar Jing Zhang
Browse files

add dynamic mode of maxpool

parent 982c3b60
...@@ -73,7 +73,7 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1 ...@@ -73,7 +73,7 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data()); bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data()); max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data());
constexpr index_t InWeiVectorSize = C1; constexpr index_t InWeiVectorSize = 8;
if(C1 % InWeiVectorSize != 0) if(C1 % InWeiVectorSize != 0)
{ {
...@@ -108,32 +108,35 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1 ...@@ -108,32 +108,35 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1
#elif 1 #elif 1
constexpr index_t BlockSize = 64; constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = K; constexpr index_t KPerBlock = 8;
constexpr index_t HoPerBlock = 8; constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32; constexpr index_t WoPerBlock = 32;
constexpr index_t E1 = C0 * Y * X; constexpr index_t E1 = 2 * 9;
constexpr index_t E2 = C1 / InWeiVectorSize; constexpr index_t E2 = 1;
constexpr index_t K2 = 2; constexpr index_t K2 = 2;
constexpr index_t E1PerBlock = C0; constexpr index_t E1PerBlock = 2;
constexpr index_t KPerThread = K; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2; constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2; constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1; constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, Y * X, 1, 1, E2>; using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
Sequence<1, E1PerBlock, 1, KPerBlock, 1>; Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2; constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#endif #endif
if(KPerThread % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
}
const auto in_n_c0_hi_wi_c1_desc = const auto in_n_c0_hi_wi_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)); make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
const auto wei_k_c0_y_x_c1_desc = const auto wei_k_c0_y_x_c1_desc =
......
...@@ -85,7 +85,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -85,7 +85,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
const auto ConvDilationH = conv_dilations[I0]; const auto ConvDilationH = conv_dilations[I0];
const auto ConvDilationW = conv_dilations[I1]; const auto ConvDilationW = conv_dilations[I1];
#if 1 #if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{}; const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{}; const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
#else #else
...@@ -190,8 +190,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -190,8 +190,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)), make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)), make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N), make_pass_through_transform(N),
make_pad_transform(Hx, I0, Number<OutRightPadHx>{}), make_pad_transform(Hx, I0, OutRightPadHx),
make_pad_transform(Wx, I0, Number<OutRightPadWx>{})), make_pad_transform(Wx, I0, OutRightPadWx)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
...@@ -293,10 +293,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -293,10 +293,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
// clang-format on // clang-format on
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_ho_wo_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");
// GEMM // GEMM
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3< using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
BlockSize, BlockSize,
...@@ -371,16 +367,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -371,16 +367,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
float ave_time = 0; float ave_time = 0;
static_assert(a_e0_e1_k0_k1_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE #if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_e0_block_loop) if(has_main_e0_block_loop)
{ {
const auto kernel = kernel_gemm_dlops_v2_maxpool< const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -409,7 +400,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -409,7 +400,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
} }
else else
{ {
const auto kernel = kernel_gemm_dlops_v2_maxpool< const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -461,7 +452,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -461,7 +452,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
if(has_main_e0_block_loop) if(has_main_e0_block_loop)
{ {
const auto kernel = kernel_gemm_dlops_v2_maxpool< const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -497,7 +488,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -497,7 +488,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
else else
{ {
const auto kernel = kernel_gemm_dlops_v2_maxpool< const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0 #define USE_DYNAMIC_MODE 1
#define USE_CONV_FWD_V5R1_NCHWC 1 #define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo enum ConvForwardAlgo
...@@ -46,7 +46,7 @@ int main(int argc, char* argv[]) ...@@ -46,7 +46,7 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
constexpr index_t activ_type = 0; constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]); const bool do_verification = std::stoi(argv[2]);
...@@ -78,6 +78,9 @@ int main(int argc, char* argv[]) ...@@ -78,6 +78,9 @@ int main(int argc, char* argv[])
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const index_t Ho_2 = Ho / 2;
const index_t Wo_2 = Wo / 2;
#else #else
// static mode // static mode
if(argc < 6) if(argc < 6)
...@@ -93,7 +96,7 @@ int main(int argc, char* argv[]) ...@@ -93,7 +96,7 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]); const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]); const int nrepeat = std::stoi(argv[5]);
constexpr index_t activ_type = 1; constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
#if 0 #if 0
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment