Commit 59252249 authored by Chao Liu's avatar Chao Liu
Browse files

try bwd weight

parent 4908fe3f
...@@ -156,7 +156,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ...@@ -156,7 +156,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0, static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not satisfied"); "GemmDataPerReadB alignment requirement is not satisfied");
#if 0 #if 1
// input blockwise copy // input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor // slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in // this copy operator already has blockwise offset built-in
...@@ -196,8 +196,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ...@@ -196,8 +196,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
// weight tensor // weight tensor
// tensor descriptor in device memory, src of blockwise copy // tensor descriptor in device memory, src of blockwise copy
constexpr auto wei_e_k_global_desc = constexpr auto wei_e_k_global_merged_desc = make_ConstantMergedTensorDescriptor(
wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); wei_k_c_y_x_global_desc.Unfold(I2, I3), Sequence<1, 2>{}, Sequence<0>{});
// tensor descriptor in LDS, dst of blockwise copy // tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
...@@ -205,13 +205,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ...@@ -205,13 +205,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Sequence<EPerBlock, KPerBlock>{}, Sequence<EPerBlock, KPerBlock>{},
Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{}); Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
#if 0 #if 1
// operator for blockwise copy of weight into LDS // operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor // slice a tensor, and copy it into another tensor
// this copy operator already have blockwise offset built-in // this copy operator already have blockwise offset built-in
auto blockwise_wei_copy = auto blockwise_wei_copy =
BlockwiseGenericTensorSliceCopy_v1<BlockSize, BlockwiseGenericTensorSliceCopy_v1<BlockSize,
decltype(wei_e_k_global_desc), decltype(wei_e_k_global_merged_desc),
decltype(wei_e_k_block_desc), decltype(wei_e_k_block_desc),
decltype(wei_e_k_block_desc.GetLengths()), decltype(wei_e_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_E_K, WeiBlockCopySubLengths_E_K,
...@@ -336,7 +336,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ...@@ -336,7 +336,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True); blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); blockwise_wei_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
__syncthreads(); __syncthreads();
...@@ -361,7 +361,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ...@@ -361,7 +361,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True); blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); blockwise_wei_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
__syncthreads(); __syncthreads();
......
...@@ -86,14 +86,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -86,14 +86,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t InBlockCopySrcDataPerRead_B = 1; constexpr index_t InBlockCopySrcDataPerRead_B = 1;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; using WeiBlockCopySubLengths_E_K = Sequence<1, 4>;
using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; using WeiBlockCopyClusterLengths_E_K = Sequence<8, 32>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 4;
constexpr index_t OutThreadCopyDataPerAccess_W = 1; constexpr index_t OutThreadCopyDataPerAccess_W = 1;
#elif 1 #elif 1
......
...@@ -72,15 +72,15 @@ int main(int argc, char* argv[]) ...@@ -72,15 +72,15 @@ int main(int argc, char* argv[])
using namespace ck; using namespace ck;
#if 1 #if 1
constexpr index_t N = 64; constexpr index_t N = 128;
constexpr index_t C = 1536; constexpr index_t C = 256;
constexpr index_t HI = 8; constexpr index_t HI = 35;
constexpr index_t WI = 8; constexpr index_t WI = 35;
constexpr index_t K = 256; constexpr index_t K = 384;
constexpr index_t Y = 1; constexpr index_t Y = 3;
constexpr index_t X = 1; constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0; constexpr index_t HPad = 0;
...@@ -322,12 +322,21 @@ int main(int argc, char* argv[]) ...@@ -322,12 +322,21 @@ int main(int argc, char* argv[])
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
// for backward weight
auto in_nchw_wrw_desc = in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
auto wei_kcyx_wrw_desc = out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
auto out_nkhw_wrw_desc = wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
ostream_ConstantTensorDescriptor(in_nchw_wrw_desc, std::cout << "in_nchw_wrw_desc: ");
ostream_ConstantTensorDescriptor(wei_kcyx_wrw_desc, std::cout << "wei_kcyx_wrw_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_wrw_desc, std::cout << "out_nkhw_wrw_desc: ");
using in_data_t = float; using in_data_t = float;
using out_data_t = float; using out_data_t = float;
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc)); Tensor<in_data_t> in_nchw_wrw(make_TensorDescriptor(in_nchw_wrw_desc));
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); Tensor<in_data_t> wei_kcyx_wrw(make_TensorDescriptor(wei_kcyx_wrw_desc));
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); Tensor<out_data_t> out_nkhw_wrw_host(make_TensorDescriptor(out_nkhw_wrw_desc));
Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); Tensor<out_data_t> out_nkhw_wrw_device(make_TensorDescriptor(out_nkhw_wrw_desc));
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = std::thread::hardware_concurrency();
...@@ -343,8 +352,8 @@ int main(int argc, char* argv[]) ...@@ -343,8 +352,8 @@ int main(int argc, char* argv[])
if(do_verification) if(do_verification)
{ {
#if 0 #if 0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); in_nchw_wrw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx_wrw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 0 #elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
...@@ -352,8 +361,8 @@ int main(int argc, char* argv[]) ...@@ -352,8 +361,8 @@ int main(int argc, char* argv[])
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread); in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 1 #elif 1
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); in_nchw_wrw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx_wrw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#elif 0 #elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
...@@ -379,15 +388,16 @@ int main(int argc, char* argv[]) ...@@ -379,15 +388,16 @@ int main(int argc, char* argv[])
#elif 0 #elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0 #elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(
in_nchw, in_nchw_wrw_desc,
wei_kcyx_desc, in_nchw_wrw,
wei_kcyx, wei_kcyx_wrw_desc,
out_nkhw_desc, wei_kcyx_wrw,
out_nkhw_device, out_nkhw_wrw_desc,
out_nkhw_wrw_device,
ConvDilations{}, // exchange dilation and strides
ConvStrides{}, ConvStrides{},
ConvDilations{},
nrepeat); nrepeat);
#elif 0 #elif 0
device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc, device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc,
...@@ -433,7 +443,7 @@ int main(int argc, char* argv[]) ...@@ -433,7 +443,7 @@ int main(int argc, char* argv[])
if(do_verification) if(do_verification)
{ {
#if 1 #if 0
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 && if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1) ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
{ {
...@@ -442,21 +452,21 @@ int main(int argc, char* argv[]) ...@@ -442,21 +452,21 @@ int main(int argc, char* argv[])
else else
#endif #endif
{ {
host_direct_convolution(in_nchw, host_direct_convolution(in_nchw_wrw,
wei_kcyx, wei_kcyx_wrw,
out_nkhw_host, out_nkhw_wrw_host,
ConvStrides{},
ConvDilations{}, ConvDilations{},
ConvStrides{},
lower_pads, lower_pads,
upper_pads); upper_pads);
} }
check_error(out_nkhw_host, out_nkhw_device); check_error(out_nkhw_wrw_host, out_nkhw_wrw_device);
#if 0 #if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; LogRange(std::cout << "in_nchw_wrw : ", in_nchw_wrw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; LogRange(std::cout << "wei_kcyx_wrw: ", wei_kcyx_wrw.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl; LogRange(std::cout << "out_nkhw_wrw_host : ", out_nkhw_wrw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl; LogRange(std::cout << "out_nkhw_wrw_device: ", out_nkhw_wrw_device.mData, ",") << std::endl;
#endif #endif
} }
} }
driver.cpp
\ No newline at end of file
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "device.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
struct GeneratorTensor_1
{
template <class... Is>
double operator()(Is... is)
{
return 1;
}
};
struct GeneratorTensor_2
{
int min_value = 0;
int max_value = 1;
template <class... Is>
double operator()(Is...)
{
return (std::rand() % (max_value - min_value)) + min_value;
}
};
struct GeneratorTensor_3
{
template <class... Is>
double operator()(Is... is)
{
std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
auto f_acc = [](auto a, auto b) { return 100 * a + b; };
return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
}
};
struct GeneratorTensor_Checkboard
{
template <class... Ts>
double operator()(Ts... Xs) const
{
std::array<index_t, sizeof...(Ts)> dims = {{Xs...}};
return std::accumulate(dims.begin(),
dims.end(),
true,
[](bool init, index_t x) -> int { return init != (x % 2); })
? 1
: -1;
}
};
int main(int argc, char* argv[])
{
using namespace ck;
#if 1
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 384;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 3x3, 34x34
constexpr index_t N = 64;
constexpr index_t C = 256;
constexpr index_t HI = 34;
constexpr index_t WI = 34;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
constexpr index_t N = 64;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
constexpr index_t N = 128;
constexpr index_t C = 2048;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 384;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 384;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
constexpr index_t N = 128;
constexpr index_t C = 1280;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 384;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
constexpr index_t N = 64;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 384;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 28x28 image
// cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr index_t N = 128;
constexpr index_t C = 288;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 384;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
// 1x1 filter, 17x17 input
// cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
constexpr index_t N = 128;
constexpr index_t C = 768;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
constexpr index_t N = 128;
constexpr index_t C = 528;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
constexpr index_t N = 128;
constexpr index_t C = 528;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#endif
auto lower_pads = Sequence<HPad, WPad>{};
auto upper_pads = Sequence<HPad, WPad>{};
auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, lower_pads, upper_pads);
ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
// for backward weight
auto in_nchw_wrw_desc = in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
auto wei_kcyx_wrw_desc = out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
auto out_nkhw_wrw_desc = wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
ostream_ConstantTensorDescriptor(in_nchw_wrw_desc, std::cout << "in_nchw_wrw_desc: ");
ostream_ConstantTensorDescriptor(wei_kcyx_wrw_desc, std::cout << "wei_kcyx_wrw_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_wrw_desc, std::cout << "out_nkhw_wrw_desc: ");
using in_data_t = float;
using out_data_t = float;
Tensor<in_data_t> in_nchw_wrw(make_TensorDescriptor(in_nchw_wrw_desc));
Tensor<in_data_t> wei_kcyx_wrw(make_TensorDescriptor(wei_kcyx_wrw_desc));
Tensor<out_data_t> out_nkhw_wrw_host(make_TensorDescriptor(out_nkhw_wrw_desc));
Tensor<out_data_t> out_nkhw_wrw_device(make_TensorDescriptor(out_nkhw_wrw_desc));
std::size_t num_thread = std::thread::hardware_concurrency();
if(argc != 3)
{
printf("arg1: do_verification, arg2: nrepeat\n");
exit(1);
}
bool do_verification = atoi(argv[1]);
index_t nrepeat = atoi(argv[2]);
if(do_verification)
{
#if 0
in_nchw_wrw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx_wrw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 1
in_nchw_wrw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei_kcyx_wrw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
#endif
}
#if 0
device_convolution_direct_v2_nchw_kcyx_nkhw
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(
in_nchw_wrw_desc,
in_nchw_wrw,
wei_kcyx_wrw_desc,
wei_kcyx_wrw,
out_nkhw_wrw_desc,
out_nkhw_wrw_device,
ConvDilations{}, // exchange dilation and strides
ConvStrides{},
nrepeat);
#elif 0
device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 0
device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 0
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
lower_pads,
upper_pads,
nrepeat);
#endif
if(do_verification)
{
#if 0
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
{
host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
}
else
#endif
{
host_direct_convolution(in_nchw_wrw,
wei_kcyx_wrw,
out_nkhw_wrw_host,
ConvDilations{},
ConvStrides{},
lower_pads,
upper_pads);
}
check_error(out_nkhw_wrw_host, out_nkhw_wrw_device);
#if 0
LogRange(std::cout << "in_nchw_wrw : ", in_nchw_wrw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx_wrw: ", wei_kcyx_wrw.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_wrw_host : ", out_nkhw_wrw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_wrw_device: ", out_nkhw_wrw_device.mData, ",") << std::endl;
#endif
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment