Commit 5d2cafcb authored by Chao Liu's avatar Chao Liu
Browse files

clean up

parent eace3255
#!/bin/bash
# Configure an out-of-source CMake build for the SpMV project.
# Run this from the intended build directory: it first wipes stale CMake
# state so that the flag changes below always take effect, then re-runs
# the configure step against the fixed source tree.
#
# Fail fast on any command error and on use of unset variables.
set -eu

# Remove cached configuration so cmake re-evaluates every -D option.
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles

# Source tree and install prefix (install dir is relative to this build dir).
MY_PROJECT_SOURCE=/package/code/github/test_feature/SpMV
MY_PROJECT_INSTALL=../install.dir

# NOTE(review): CMAKE_CXX_FLAGS is read from the caller's environment;
# ${VAR:-} keeps `set -u` from aborting when it is unset.
cmake \
-D CMAKE_INSTALL_PREFIX="${MY_PROJECT_INSTALL}" \
-D CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS:-} -std=c++11" \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D BOOST_ROOT="/package/install/boost_1.66.0-mpich_3.2" \
-D CMAKE_CUDA_COMPILER="/package/install/cuda_9.0/bin/nvcc" \
-D CUDA_COMMON_INCLUDE_DIR="/package/code/github/test_feature/cuda_9.0_common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin g++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35 -Xptxas -v -maxrregcount=40" \
"${MY_PROJECT_SOURCE}"

# Alternative CUDA flag sets kept for reference (debug / different regcount):
#-D CMAKE_CUDA_FLAGS="-lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35 -Xptxas -v -maxrregcount=32" \
#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35" \
...@@ -5,13 +5,7 @@ ...@@ -5,13 +5,7 @@
#include "nvToolsExt.h" #include "nvToolsExt.h"
#include "tensor.hpp" #include "tensor.hpp"
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
#include "device_tensor_descriptor.cuh"
#if 0
#include "direct_convolution.cuh" #include "direct_convolution.cuh"
#else
#include "constant_direct_convolution.cuh"
#endif
template <class T> template <class T>
struct GeneratorConstant struct GeneratorConstant
...@@ -116,7 +110,7 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out) ...@@ -116,7 +110,7 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
} }
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void const_device_convolution( void device_convolution(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out) InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
{ {
std::size_t data_sz = sizeof(T); std::size_t data_sz = sizeof(T);
...@@ -126,10 +120,6 @@ void const_device_convolution( ...@@ -126,10 +120,6 @@ void const_device_convolution(
int num_thread = std::thread::hardware_concurrency(); int num_thread = std::thread::hardware_concurrency();
#if 0
out.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
#endif
in_device_buf.ToDevice(in.mData.data()); in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data()); wei_device_buf.ToDevice(wei.mData.data());
out_device_buf.ToDevice(out.mData.data()); out_device_buf.ToDevice(out.mData.data());
...@@ -147,13 +137,13 @@ void const_device_convolution( ...@@ -147,13 +137,13 @@ void const_device_convolution(
constexpr unsigned CPerBlockLoop = 1; constexpr unsigned CPerBlockLoop = 1;
constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2; constexpr unsigned OutTileSizeW = 2;
constexpr unsigned YPerBlock = 16; constexpr unsigned YPerBlock = 4;
constexpr unsigned XPerBlock = 16; constexpr unsigned XPerBlock = 8;
constexpr unsigned NBlockCopyLen0 = 1; constexpr unsigned NBlockCopyLen0 = 1;
constexpr unsigned NBlockCopyLen1 = 1; constexpr unsigned NBlockCopyLen1 = 1;
constexpr unsigned NBlockCopyLen2 = 1; constexpr unsigned NBlockCopyLen2 = 2;
constexpr unsigned NBlockCopyLen3 = 64; constexpr unsigned NBlockCopyLen3 = 16;
constexpr unsigned nblock = (out_desc.GetLength(I0) / NPerBlock) * constexpr unsigned nblock = (out_desc.GetLength(I0) / NPerBlock) *
(out_desc.GetLength(I1) / KPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
...@@ -239,31 +229,23 @@ int main() ...@@ -239,31 +229,23 @@ int main()
Tensor<float> wei(make_TensorDescriptor(wei_desc)); Tensor<float> wei(make_TensorDescriptor(wei_desc));
Tensor<float> out_host(make_TensorDescriptor(out_desc)); Tensor<float> out_host(make_TensorDescriptor(out_desc));
Tensor<float> out_device = out_host;
int num_thread = std::thread::hardware_concurrency(); int num_thread = std::thread::hardware_concurrency();
#if 0 #if 1
in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
out_host.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
#endif #endif
#if 0 Tensor<float> out_device = out_host;
host_convolution(in, wei, out_host);
#endif
const_device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
std::cout << __func__ << ": done" << std::endl; std::cout << __func__ << ": done" << std::endl;
#if 0 #if 1
LogRange(std::cout << __func__ << "in : ", in.mData, ",") << std::endl; host_convolution(in, wei, out_host);
LogRange(std::cout << __func__ << "wei: ", wei.mData, ",") << std::endl;
LogRange(std::cout, out_host.mData, ",") << std::endl;
LogRange(std::cout, out_device.mData, ",") << std::endl;
#endif
#if 0
float error = 0; float error = 0;
float max_diff = 0; float max_diff = 0;
float host_value = 0, device_value = 0; float host_value = 0, device_value = 0;
...@@ -282,4 +264,11 @@ int main() ...@@ -282,4 +264,11 @@ int main()
std::cout << "max_diff: " << max_diff << ", " << host_value << ", " << device_value std::cout << "max_diff: " << max_diff << ", " << host_value << ", " << device_value
<< std::endl; << std::endl;
#endif #endif
#if 0
LogRange(std::cout << __func__ << "in : ", in.mData, ",") << std::endl;
LogRange(std::cout << __func__ << "wei: ", wei.mData, ",") << std::endl;
LogRange(std::cout, out_host.mData, ",") << std::endl;
LogRange(std::cout, out_device.mData, ",") << std::endl;
#endif
} }
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment