Commit b8bb1480 authored by Jing Zhang
Browse files

3x3 test

parent 40b7d356
...@@ -271,6 +271,14 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -271,6 +271,14 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
index_t nrepeat = 100; index_t nrepeat = 100;
std::cout << "conv_v5r1__NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
<< "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
<< "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
<< conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
<< std::endl;
std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
for(index_t i = 0; i < 5; ++i) for(index_t i = 0; i < 5; ++i)
{ {
std::cout << "Start running " << nrepeat << " times..." << std::endl; std::cout << "Start running " << nrepeat << " times..." << std::endl;
......
...@@ -103,13 +103,47 @@ int main(int argc, char* argv[]) ...@@ -103,13 +103,47 @@ int main(int argc, char* argv[])
const bool do_log = atoi(argv[5]); const bool do_log = atoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = atoi(argv[6]);
#if 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 1080;
constexpr index_t Wi = 1920;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#elif 0
constexpr index_t N = 1; constexpr index_t N = 1;
constexpr index_t C = 16; constexpr index_t C = 16;
constexpr index_t Hi = 544; constexpr index_t Hi = 540;
constexpr index_t Wi = 960; constexpr index_t Wi = 960;
constexpr index_t K = 16; constexpr index_t K = 16;
constexpr index_t Y = 3; constexpr index_t Y = 3;
constexpr index_t X = 3; constexpr index_t X = 3;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 480;
constexpr index_t Wi = 270;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#elif 1
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 240;
constexpr index_t Wi = 135;
constexpr index_t K = 16;
constexpr index_t Y = 3;
constexpr index_t X = 3;
#elif 1
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 1080;
constexpr index_t Wi = 1920;
constexpr index_t K = 16;
constexpr index_t Y = 1;
constexpr index_t X = 1;
#endif
const index_t conv_stride_h = 1; const index_t conv_stride_h = 1;
const index_t conv_stride_w = 1; const index_t conv_stride_w = 1;
......
...@@ -92,7 +92,6 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -92,7 +92,6 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
const auto out_n_k0_ho_wo_k1_desc = const auto out_n_k0_ho_wo_k1_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)); make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1));
#if 1
// cdata = 64, BlockSize = 64, 16x8x32x4 // cdata = 64, BlockSize = 64, 16x8x32x4
constexpr index_t BlockSize = 64; constexpr index_t BlockSize = 64;
...@@ -114,34 +113,9 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -114,34 +113,9 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1; constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = 8;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#else
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, 16>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = K1; constexpr index_t CThreadTransferDstScalarPerVector_W = K1;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, ""); static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#endif
constexpr auto conv_driver = constexpr auto conv_driver =
#if 0 #if 0
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment