Commit e63a1d9e authored by Jehandad Khan
Browse files

basic copy

parent a951c345
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "device.hpp" #include "device.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp" #include "host_conv.hpp"
#include "host_redux.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" //#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
...@@ -18,6 +19,8 @@ ...@@ -18,6 +19,8 @@
//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp" //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_tensor_redux.hpp"
struct GeneratorTensor_1 struct GeneratorTensor_1
{ {
template <class... Is> template <class... Is>
...@@ -70,8 +73,13 @@ struct GeneratorTensor_Checkboard ...@@ -70,8 +73,13 @@ struct GeneratorTensor_Checkboard
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
using namespace ck; using namespace ck;
#if 1 #if 1
constexpr index_t N = 4;
constexpr index_t C = 1;
constexpr index_t H = 16;
constexpr index_t W = 16;
#elif 0
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 256; constexpr index_t C = 256;
constexpr index_t HI = 35; constexpr index_t HI = 35;
...@@ -310,33 +318,17 @@ int main(int argc, char* argv[]) ...@@ -310,33 +318,17 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0; constexpr index_t WPad = 0;
#endif #endif
auto lower_pads = Sequence<HPad, WPad>{}; auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, H, W>{});
auto upper_pads = Sequence<HPad, WPad>{}; auto out_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, H, W>{});
auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, lower_pads, upper_pads);
ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nchw_desc, std::cout << "out_nchw_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
// for backward weight
auto in_nchw_wrw_desc = in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
auto wei_kcyx_wrw_desc = out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
auto out_nkhw_wrw_desc = wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
ostream_ConstantTensorDescriptor(in_nchw_wrw_desc, std::cout << "in_nchw_wrw_desc: ");
ostream_ConstantTensorDescriptor(wei_kcyx_wrw_desc, std::cout << "wei_kcyx_wrw_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_wrw_desc, std::cout << "out_nkhw_wrw_desc: ");
using in_data_t = float; using in_data_t = float;
using out_data_t = float; using out_data_t = float;
Tensor<in_data_t> in_nchw_wrw(make_TensorDescriptor(in_nchw_wrw_desc)); Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx_wrw(make_TensorDescriptor(wei_kcyx_wrw_desc)); Tensor<out_data_t> out_nchw_host(make_TensorDescriptor(out_nchw_desc));
Tensor<out_data_t> out_nkhw_wrw_host(make_TensorDescriptor(out_nkhw_wrw_desc)); Tensor<out_data_t> out_nchw_device(make_TensorDescriptor(out_nchw_desc));
Tensor<out_data_t> out_nkhw_wrw_device(make_TensorDescriptor(out_nkhw_wrw_desc));
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = std::thread::hardware_concurrency();
...@@ -351,7 +343,10 @@ int main(int argc, char* argv[]) ...@@ -351,7 +343,10 @@ int main(int argc, char* argv[])
if(do_verification) if(do_verification)
{ {
#if 0 #if 1
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
out_nchw_host.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 0
in_nchw_wrw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); in_nchw_wrw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx_wrw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx_wrw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 0 #elif 0
...@@ -373,7 +368,9 @@ int main(int argc, char* argv[]) ...@@ -373,7 +368,9 @@ int main(int argc, char* argv[])
#endif #endif
} }
#if 0 #if 1
device_tensor_redux(in_nchw_desc, in_nchw, out_nchw_desc, out_nchw_device, nrepeat);
#elif 0
device_convolution_direct_v2_nchw_kcyx_nkhw device_convolution_direct_v2_nchw_kcyx_nkhw
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0 #elif 0
...@@ -443,14 +440,18 @@ int main(int argc, char* argv[]) ...@@ -443,14 +440,18 @@ int main(int argc, char* argv[])
if(do_verification) if(do_verification)
{ {
#if 0 #if 1
// host_redux(in_nchw, out_nchw_host);
// check_error(out_nchw_host, out_nchw_device);
std::cout << "skipping host verification" << std::endl;
#elif 0
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 && if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1) ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
{ {
host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
} }
else else
#endif #elif 1
{ {
host_direct_convolution(in_nchw_wrw, host_direct_convolution(in_nchw_wrw,
wei_kcyx_wrw, wei_kcyx_wrw,
...@@ -461,6 +462,7 @@ int main(int argc, char* argv[]) ...@@ -461,6 +462,7 @@ int main(int argc, char* argv[])
upper_pads); upper_pads);
} }
check_error(out_nkhw_wrw_host, out_nkhw_wrw_device); check_error(out_nkhw_wrw_host, out_nkhw_wrw_device);
#endif
#if 0 #if 0
LogRange(std::cout << "in_nchw_wrw : ", in_nchw_wrw.mData, ",") << std::endl; LogRange(std::cout << "in_nchw_wrw : ", in_nchw_wrw.mData, ",") << std::endl;
......
#!/bin/bash #!/bin/bash
MY_PROJECT_SOURCE=../../../ MY_PROJECT_SOURCE=..
MY_PROJECT_INSTALL=../install.dir MY_PROJECT_INSTALL=../install.dir
export CUDA_ROOT=/usr/local/cuda export CUDA_ROOT=/usr/local/cuda
...@@ -11,11 +11,11 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_ROOT/lib64 ...@@ -11,11 +11,11 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_ROOT/lib64
cmake \ cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_CXX_COMPILER=clang++-6.0 \ -D CMAKE_CXX_COMPILER=clang++-6.0 \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Debug \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D DEVICE_BACKEND=NVIDIA \ -D DEVICE_BACKEND=NVIDIA \
-D CUDA_COMMON_INCLUDE_DIR="/root/NVIDIA_CUDA-10.1_Samples/common/inc" \ -D CUDA_COMMON_INCLUDE_DIR="/root/NVIDIA_CUDA-10.1_Samples/common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \ -D CMAKE_CUDA_FLAGS="-g -G -ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
${MY_PROJECT_SOURCE} ${MY_PROJECT_SOURCE}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment