Commit 20423a35 authored by Chao Liu
Browse files

tune direct

parent 1eafc9c1
...@@ -325,57 +325,41 @@ int main() ...@@ -325,57 +325,41 @@ int main()
#if 0 #if 0
constexpr unsigned N = 1; constexpr unsigned N = 1;
constexpr unsigned C = 1; constexpr unsigned C = 1;
constexpr unsigned HI = 34; constexpr unsigned HI = 4;
constexpr unsigned WI = 34; constexpr unsigned WI = 4;
constexpr unsigned K = 1; constexpr unsigned K = 1;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 1
constexpr unsigned N = 64;
constexpr unsigned C = 256;
constexpr unsigned HI = 34;
constexpr unsigned WI = 34;
constexpr unsigned K = 64;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
constexpr unsigned N = 72;
constexpr unsigned C = 288;
constexpr unsigned HI = 38;
constexpr unsigned WI = 38;
constexpr unsigned K = 72;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0 #elif 0
constexpr unsigned N = 1; constexpr unsigned N = 1;
constexpr unsigned C = 1; constexpr unsigned C = 1;
constexpr unsigned HI = 18; constexpr unsigned HI = 34;
constexpr unsigned WI = 18; constexpr unsigned WI = 34;
constexpr unsigned K = 1; constexpr unsigned K = 1;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 0 #elif 0
constexpr unsigned N = 1; constexpr unsigned N = 64;
constexpr unsigned C = 1; constexpr unsigned C = 256;
constexpr unsigned HI = 4; constexpr unsigned HI = 34;
constexpr unsigned WI = 4; constexpr unsigned WI = 34;
constexpr unsigned K = 1; constexpr unsigned K = 64;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 0 #elif 1
constexpr unsigned N = 2; constexpr unsigned N = 64;
constexpr unsigned C = 3; constexpr unsigned C = 64;
constexpr unsigned HI = 130; constexpr unsigned HI = 56;
constexpr unsigned WI = 130; constexpr unsigned WI = 56;
constexpr unsigned K = 5; constexpr unsigned K = 64;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 0 #elif 0
constexpr unsigned N = 3; constexpr unsigned N = 64;
constexpr unsigned C = 16; constexpr unsigned C = 64;
constexpr unsigned HI = 130; constexpr unsigned HI = 66;
constexpr unsigned WI = 130; constexpr unsigned WI = 66;
constexpr unsigned K = 4; constexpr unsigned K = 64;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#endif #endif
...@@ -397,7 +381,7 @@ int main() ...@@ -397,7 +381,7 @@ int main()
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = std::thread::hardware_concurrency();
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 0 #elif 1
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = std::thread::hardware_concurrency();
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
...@@ -412,7 +396,7 @@ int main() ...@@ -412,7 +396,7 @@ int main()
#endif #endif
} }
#if 0 #if 1
host_winograd_3x3_convolution(in, wei, out_host); host_winograd_3x3_convolution(in, wei, out_host);
check_error(out_host, out_device); check_error(out_host, out_device);
#elif 0 #elif 0
......
...@@ -24,6 +24,8 @@ void device_direct_convolution_1( ...@@ -24,6 +24,8 @@ void device_direct_convolution_1(
constexpr auto in_desc = InDesc{}; constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{}; constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{}; constexpr auto out_desc = OutDesc{};
#if 0
constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2; constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2; constexpr unsigned NPerBlock = 2;
...@@ -37,6 +39,21 @@ void device_direct_convolution_1( ...@@ -37,6 +39,21 @@ void device_direct_convolution_1(
constexpr unsigned CPerThread = 2; constexpr unsigned CPerThread = 2;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 1
constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 16;
constexpr unsigned CPerBlock = 2;
constexpr unsigned YPerBlock = 2;
constexpr unsigned XPerBlock = 27;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned BlockSize = 216;
#endif
constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
(out_desc.GetLength(I1) / KPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
......
...@@ -24,6 +24,8 @@ void device_direct_convolution_2( ...@@ -24,6 +24,8 @@ void device_direct_convolution_2(
constexpr auto in_desc = InDesc{}; constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{}; constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{}; constexpr auto out_desc = OutDesc{};
#if 0
constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2; constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2; constexpr unsigned NPerBlock = 2;
...@@ -37,6 +39,35 @@ void device_direct_convolution_2( ...@@ -37,6 +39,35 @@ void device_direct_convolution_2(
constexpr unsigned CPerThread = 2; constexpr unsigned CPerThread = 2;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 1
constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4;
constexpr unsigned YPerBlock = 1;
constexpr unsigned XPerBlock = 27;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned BlockSize = 216;
#elif 1
constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4;
constexpr unsigned YPerBlock = 1;
constexpr unsigned XPerBlock = 32;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned BlockSize = 256;
#endif
constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
(out_desc.GetLength(I1) / KPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment