Commit c1b3fb95 authored by Jehandad Khan's avatar Jehandad Khan
Browse files

host verification in progress

parent eb8a1bf9
......@@ -412,8 +412,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
arithmetic_sequence_gen<0, 8, 1>::type{},
Number<1>{});
#elif 0
p_out_global[0] = p_out_thread[0];
#endif
}
}
......
......@@ -3,6 +3,8 @@
#include "ConstantTensorDescriptor.hpp"
// Direction of the convolution pass, used by the host verification path to
// decide which tensor to compare (output for Forward, weights for
// BackwardWeights).
// Plain enum declaration: the original `typedef enum ConvolutionDir{...};`
// supplied no typedef alias name, which compilers reject or warn about
// ("typedef requires a name").
enum ConvolutionDir
{
    Forward         = 0,
    BackwardData    = 1,
    BackwardWeights = 2
};
// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
......
......@@ -16,9 +16,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
// the input desc needs to be reordered for wrw : cnhw would be the new order
// the forward kernel always assumes reduction on the second dim and this would make it reduce on the n dimension due to the switching we did
const Tensor<T>& in_nchw,
Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
......@@ -252,4 +252,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
wei_kcyx_device_buf.FromDevice(wei_kcyx.mData.data());
}
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "conv_common.hpp"
#include "ConstantTensorDescriptor.hpp"
// this is ugly, only for 4d
......@@ -52,15 +53,26 @@ template <class TIn,
class ConvDilations,
class LowerPads,
class UpperPads>
void host_direct_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
void host_direct_convolution(Tensor<TIn>& in_nchw,
Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
ConvStrides,
ConvDilations,
LowerPads,
UpperPads)
UpperPads,
ConvolutionDir dir)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
#if 1
// wrw
in_nchw.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
wei_kcyx.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
out_nkhw.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
#endif
index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{});
......@@ -81,7 +93,7 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_nchw.mDesc.GetLengths()[3])
{
v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
v += double(in_nchw(n, c, hi, wi)) /*double(wei_kcyx(k, c, y, x))*/;
}
}
}
......
......@@ -101,6 +101,22 @@ struct TensorDescriptor
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
// Permute the dimension order of this descriptor in place.
// is[i] is the OLD dimension index that becomes NEW dimension i, so e.g.
// {1, 0, 2, 3} swaps the first two dimensions (NCHW -> CNHW).
// Only the length/stride metadata is permuted; the underlying data buffer
// this descriptor indexes into is untouched.
// Changes vs. original: take the permutation by const reference (avoids a
// vector copy per call; callers passing braced lists still bind), use a
// size_t index instead of a signed `auto cnt = 0`, and move the scratch
// vectors into the members instead of copying them.
void ReorderGivenNew2Old(const std::vector<std::size_t>& is)
{
    assert(mLens.size() == is.size());
    assert(mStrides.size() == is.size());

    std::vector<std::size_t> newLens(mLens.size());
    std::vector<std::size_t> newStrides(mStrides.size());

    for(std::size_t i = 0; i < is.size(); ++i)
    {
        newLens[i]    = mLens[is[i]];
        newStrides[i] = mStrides[is[i]];
    }

    mLens    = std::move(newLens);
    mStrides = std::move(newStrides);
}
private:
std::vector<std::size_t> mLens;
......
......@@ -67,12 +67,14 @@ struct GeneratorTensor_Checkboard
}
};
int main(int argc, char* argv[])
{
using namespace ck;
ConvolutionDir dir = Forward;
#if 1
constexpr index_t N = 128;
constexpr index_t N = 64;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
......@@ -85,6 +87,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
dir = BackwardWeights;
#elif 0
// 3x3, 34x34
constexpr index_t N = 128;
......@@ -477,8 +480,10 @@ int main(int argc, char* argv[])
using in_data_t = float;
using out_data_t = float;
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<in_data_t> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx_device(make_TensorDescriptor(wei_kcyx_desc));
Tensor<in_data_t> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx_host(make_TensorDescriptor(wei_kcyx_desc));
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
......@@ -505,8 +510,24 @@ int main(int argc, char* argv[])
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 1
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
in_nchw_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
assert(in_nchw_device.mData.size() == in_nchw_host.mData.size());
for(auto i = 0; i < in_nchw_device.mData.size(); ++i)
{
in_nchw_host.mData[i] = in_nchw_device.mData[i];
}
wei_kcyx_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
assert(wei_kcyx_device.mData.size() == wei_kcyx_host.mData.size());
for(auto i =0; i < wei_kcyx_device.mData.size(); ++i)
{
wei_kcyx_host.mData[i] = wei_kcyx_device.mData[i];
}
out_nkhw_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
assert(out_nkhw_device.mData.size() == out_nkhw_host.mData.size());
for(auto i = 0; i < out_nkhw_device.mData.size(); ++i)
{
out_nkhw_host.mData[i] = out_nkhw_device.mData[i];
}
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
......@@ -536,9 +557,9 @@ int main(int argc, char* argv[])
// this is the same as MIOpen
// I should modify this one
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
in_nchw_device,
wei_kcyx_desc,
wei_kcyx,
wei_kcyx_device,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
......@@ -588,7 +609,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
#if 1
#if 0
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
{
......@@ -597,15 +618,18 @@ int main(int argc, char* argv[])
else
#endif
{
host_direct_convolution(in_nchw,
wei_kcyx,
host_direct_convolution(in_nchw_host,
out_nkhw_host,
ConvStrides{},
wei_kcyx_host,
ConvDilations{},
ConvStrides{},
lower_pads,
upper_pads);
upper_pads, dir);
}
if(dir == Forward)
check_error(out_nkhw_host, out_nkhw_device);
else
check_error(wei_kcyx_host, wei_kcyx_device);
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
......
......@@ -7,6 +7,10 @@ TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mL
{
this->CalculateStrides();
}
// Construct a descriptor from explicit lengths and strides, taking both
// verbatim (no stride computation is performed, unlike the lengths-only
// constructor above, which calls CalculateStrides()).
// NOTE(review): lens.size() == strides.size() is assumed but not checked
// here — confirm all callers guarantee matching ranks.
TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens, std::initializer_list<std::size_t> strides)
: mLens(lens), mStrides(strides)
{
}
TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
: mLens(lens), mStrides(strides)
......
......@@ -4,17 +4,17 @@ rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
MY_PROJECT_SOURCE=..
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_CXX_COMPILER=clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_BUILD_TYPE=Debug \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D DEVICE_BACKEND=NVIDIA \
-D CUDA_COMMON_INCLUDE_DIR="/package/install/cuda/10.1/NVIDIA_CUDA-10.1_Samples/common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
-D CUDA_COMMON_INCLUDE_DIR="/home/jehandad/NVIDIA_CUDA-10.1_Samples/common/inc" \
-D CMAKE_CUDA_FLAGS="-g -G -Xcompiler -O0 -Xptxas -O0 -lineinfo -O0 -ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
${MY_PROJECT_SOURCE}
#-D BOOST_ROOT="/package/install/boost_1.67.0" \
......
#!/bin/bash
MY_PROJECT_SOURCE=../../../
MY_PROJECT_SOURCE=../
MY_PROJECT_INSTALL=../install.dir
export CUDA_ROOT=/usr/local/cuda
......@@ -15,7 +15,7 @@ cmake
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D DEVICE_BACKEND=NVIDIA \
-D CUDA_COMMON_INCLUDE_DIR="/root/NVIDIA_CUDA-10.1_Samples/common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
-D CMAKE_CUDA_FLAGS="-g -G -ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
${MY_PROJECT_SOURCE}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment