Commit 5d2cafcb authored by Chao Liu's avatar Chao Liu
Browse files

clean up

parent eace3255
#!/bin/bash
# Configure an out-of-source CMake build for the SpMV project.
# Run this from the intended build directory: it first wipes stale CMake
# state so that the flag changes below always take effect, then re-runs
# the configure step against the fixed source tree.
#
# Fail fast on any command error and on use of unset variables.
set -eu

# Remove cached configuration so cmake re-evaluates every -D option.
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles

# Source tree and install prefix (install dir is relative to this build dir).
MY_PROJECT_SOURCE=/package/code/github/test_feature/SpMV
MY_PROJECT_INSTALL=../install.dir

# NOTE(review): CMAKE_CXX_FLAGS is read from the caller's environment;
# ${VAR:-} keeps `set -u` from aborting when it is unset.
cmake \
-D CMAKE_INSTALL_PREFIX="${MY_PROJECT_INSTALL}" \
-D CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS:-} -std=c++11" \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D BOOST_ROOT="/package/install/boost_1.66.0-mpich_3.2" \
-D CMAKE_CUDA_COMPILER="/package/install/cuda_9.0/bin/nvcc" \
-D CUDA_COMMON_INCLUDE_DIR="/package/code/github/test_feature/cuda_9.0_common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin g++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35 -Xptxas -v -maxrregcount=40" \
"${MY_PROJECT_SOURCE}"

# Alternative CUDA flag sets kept for reference (debug / different regcount):
#-D CMAKE_CUDA_FLAGS="-lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35 -Xptxas -v -maxrregcount=32" \
#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35" \
...@@ -5,13 +5,7 @@ ...@@ -5,13 +5,7 @@
#include "nvToolsExt.h" #include "nvToolsExt.h"
#include "tensor.hpp" #include "tensor.hpp"
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
#include "device_tensor_descriptor.cuh"
#if 0
#include "direct_convolution.cuh" #include "direct_convolution.cuh"
#else
#include "constant_direct_convolution.cuh"
#endif
template <class T> template <class T>
struct GeneratorConstant struct GeneratorConstant
...@@ -116,7 +110,7 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out) ...@@ -116,7 +110,7 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
} }
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void const_device_convolution( void device_convolution(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out) InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
{ {
std::size_t data_sz = sizeof(T); std::size_t data_sz = sizeof(T);
...@@ -126,10 +120,6 @@ void const_device_convolution( ...@@ -126,10 +120,6 @@ void const_device_convolution(
int num_thread = std::thread::hardware_concurrency(); int num_thread = std::thread::hardware_concurrency();
#if 0
out.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
#endif
in_device_buf.ToDevice(in.mData.data()); in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data()); wei_device_buf.ToDevice(wei.mData.data());
out_device_buf.ToDevice(out.mData.data()); out_device_buf.ToDevice(out.mData.data());
...@@ -147,13 +137,13 @@ void const_device_convolution( ...@@ -147,13 +137,13 @@ void const_device_convolution(
constexpr unsigned CPerBlockLoop = 1; constexpr unsigned CPerBlockLoop = 1;
constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2; constexpr unsigned OutTileSizeW = 2;
constexpr unsigned YPerBlock = 16; constexpr unsigned YPerBlock = 4;
constexpr unsigned XPerBlock = 16; constexpr unsigned XPerBlock = 8;
constexpr unsigned NBlockCopyLen0 = 1; constexpr unsigned NBlockCopyLen0 = 1;
constexpr unsigned NBlockCopyLen1 = 1; constexpr unsigned NBlockCopyLen1 = 1;
constexpr unsigned NBlockCopyLen2 = 1; constexpr unsigned NBlockCopyLen2 = 2;
constexpr unsigned NBlockCopyLen3 = 64; constexpr unsigned NBlockCopyLen3 = 16;
constexpr unsigned nblock = (out_desc.GetLength(I0) / NPerBlock) * constexpr unsigned nblock = (out_desc.GetLength(I0) / NPerBlock) *
(out_desc.GetLength(I1) / KPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
...@@ -239,31 +229,23 @@ int main() ...@@ -239,31 +229,23 @@ int main()
Tensor<float> wei(make_TensorDescriptor(wei_desc)); Tensor<float> wei(make_TensorDescriptor(wei_desc));
Tensor<float> out_host(make_TensorDescriptor(out_desc)); Tensor<float> out_host(make_TensorDescriptor(out_desc));
Tensor<float> out_device = out_host;
int num_thread = std::thread::hardware_concurrency(); int num_thread = std::thread::hardware_concurrency();
#if 0 #if 1
in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
out_host.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
#endif #endif
#if 0 Tensor<float> out_device = out_host;
host_convolution(in, wei, out_host);
#endif
const_device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
std::cout << __func__ << ": done" << std::endl; std::cout << __func__ << ": done" << std::endl;
#if 0 #if 1
LogRange(std::cout << __func__ << "in : ", in.mData, ",") << std::endl; host_convolution(in, wei, out_host);
LogRange(std::cout << __func__ << "wei: ", wei.mData, ",") << std::endl;
LogRange(std::cout, out_host.mData, ",") << std::endl;
LogRange(std::cout, out_device.mData, ",") << std::endl;
#endif
#if 0
float error = 0; float error = 0;
float max_diff = 0; float max_diff = 0;
float host_value = 0, device_value = 0; float host_value = 0, device_value = 0;
...@@ -282,4 +264,11 @@ int main() ...@@ -282,4 +264,11 @@ int main()
std::cout << "max_diff: " << max_diff << ", " << host_value << ", " << device_value std::cout << "max_diff: " << max_diff << ", " << host_value << ", " << device_value
<< std::endl; << std::endl;
#endif #endif
#if 0
LogRange(std::cout << __func__ << "in : ", in.mData, ",") << std::endl;
LogRange(std::cout << __func__ << "wei: ", wei.mData, ",") << std::endl;
LogRange(std::cout, out_host.mData, ",") << std::endl;
LogRange(std::cout, out_device.mData, ",") << std::endl;
#endif
} }
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment