"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "52e4ded85ff2448fc82e0112242c0cbb523632ec"
Commit c1b3fb95 authored by Jehandad Khan

host verification in progress

parent eb8a1bf9
@@ -333,8 +333,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
     // LDS doubel buffer: load next data from device mem
     blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
     blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
                                                 p_wei_register_clipboard);

     // LDS double buffer: GEMM on current data
     blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
@@ -399,8 +399,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
     // origin of dst in device memory
     Float* p_out_thread_on_global =
         p_out_global +
         out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
             k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 #if 1
     threadwise_generic_tensor_slice_copy_v1(
         out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
@@ -412,8 +412,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
         arithmetic_sequence_gen<0, 8, 1>::type{},
         Number<1>{});
-#elif 0
-    p_out_global[0] = p_out_thread[0];
 #endif
 }
 }
...
@@ -3,6 +3,8 @@
 #include "ConstantTensorDescriptor.hpp"

+typedef enum ConvolutionDir{ Forward=0, BackwardData=1, BackwardWeights=2};
+
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
 constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
...
@@ -16,9 +16,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
+                                // the input desc needs to be reordered for wrw: cnhw would be the new order
+                                // the forward kernel always assumes reduction on the second dim and this would make it reduce on the n dimension due to the switching we did
-                                const Tensor<T>& in_nchw,
+                                Tensor<T>& in_nchw,
                                 WeiDesc,
-                                const Tensor<T>& wei_kcyx,
+                                Tensor<T>& wei_kcyx,
                                 OutDesc,
                                 Tensor<T>& out_nkhw,
                                 ConvStrides,
@@ -252,4 +252,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     }

     out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+    in_nchw_device_buf.FromDevice(in_nchw.mData.data());
+    wei_kcyx_device_buf.FromDevice(wei_kcyx.mData.data());
 }
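
The comments added above describe the work-in-progress backward-weights (wrw) path: the input descriptor is reordered to CNHW so that the forward kernel, which always reduces over its second dimension, ends up reducing over N. For reference, the reduction that this relabelling is meant to realize is the usual backward-weights sum; a minimal host-side sketch (illustrative only, not code from this commit; it assumes stride 1, no padding and no dilation) could look like:

#include <cstddef>
#include <vector>

// dwei[k][c][y][x] = sum over n, ho, wo of in[n][c][ho + y][wo + x] * dout[n][k][ho][wo]
void naive_backward_weights(const std::vector<float>& in,   // NCHW, size N*C*HI*WI
                            const std::vector<float>& dout, // NKHW, size N*K*HO*WO
                            std::vector<float>& dwei,       // KCYX, size K*C*Y*X
                            std::size_t N, std::size_t C, std::size_t HI, std::size_t WI,
                            std::size_t K, std::size_t Y, std::size_t X)
{
    const std::size_t HO = HI - Y + 1;
    const std::size_t WO = WI - X + 1;

    for(std::size_t k = 0; k < K; ++k)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t y = 0; y < Y; ++y)
                for(std::size_t x = 0; x < X; ++x)
                {
                    double v = 0;
                    // the reduction runs over N and the output window; swapping the N and C
                    // axes lets a forward kernel that reduces over its second ("C") dimension
                    // carry out exactly this sum
                    for(std::size_t n = 0; n < N; ++n)
                        for(std::size_t ho = 0; ho < HO; ++ho)
                            for(std::size_t wo = 0; wo < WO; ++wo)
                                v += double(in[((n * C + c) * HI + ho + y) * WI + wo + x]) *
                                     double(dout[((n * K + k) * HO + ho) * WO + wo]);
                    dwei[((k * C + c) * Y + y) * X + x] = static_cast<float>(v);
                }
}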
 #pragma once
 #include "tensor.hpp"
 #include "common_header.hpp"
+#include "conv_common.hpp"
 #include "ConstantTensorDescriptor.hpp"

 // this is ugly, only for 4d
@@ -52,15 +53,26 @@ template <class TIn,
           class ConvDilations,
           class LowerPads,
           class UpperPads>
-void host_direct_convolution(const Tensor<TIn>& in_nchw,
-                             const Tensor<TWei>& wei_kcyx,
+void host_direct_convolution(Tensor<TIn>& in_nchw,
+                             Tensor<TWei>& wei_kcyx,
                              Tensor<TOut>& out_nkhw,
                              ConvStrides,
                              ConvDilations,
                              LowerPads,
-                             UpperPads)
+                             UpperPads,
+                             ConvolutionDir dir)
 {
     using namespace ck;

+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+#if 1
+    // wrw
+    in_nchw.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
+    wei_kcyx.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
+    out_nkhw.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
+#endif
     index_t h_pad_low = LowerPads{}.Get(Number<0>{});
     index_t w_pad_low = LowerPads{}.Get(Number<1>{});
@@ -81,7 +93,7 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
                         if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
                            wi < in_nchw.mDesc.GetLengths()[3])
                         {
-                            v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
+                            v += double(in_nchw(n, c, hi, wi)) /*double(wei_kcyx(k, c, y, x))*/;
                         }
                     }
                 }
...
@@ -101,6 +101,22 @@ struct TensorDescriptor
         std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
         return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
     }

+    void ReorderGivenNew2Old(std::vector<std::size_t> is)
+    {
+        assert(mLens.size() == is.size());
+        assert(mStrides.size() == is.size());
+        std::vector<std::size_t> newLens(mLens.size());
+        std::vector<std::size_t> newStrides(mStrides.size());
+        auto cnt = 0;
+        for(auto& idx : is)
+        {
+            newLens[cnt]    = mLens[idx];
+            newStrides[cnt] = mStrides[idx];
+            ++cnt;
+        }
+        mLens    = newLens;
+        mStrides = newStrides;
+    }
     private:
     std::vector<std::size_t> mLens;
...
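
The new ReorderGivenNew2Old only permutes the descriptor's lengths and strides; the data itself is not moved, so the same buffer is simply re-labelled. A small usage sketch with illustrative values (assuming the usual packed row-major strides produced by CalculateStrides):

// NCHW descriptor for N=64, C=1536, H=8, W=8; packed strides {1536*8*8, 8*8, 8, 1}
TensorDescriptor desc({64, 1536, 8, 8});
// new dimension i takes old dimension is[i], so {1, 0, 2, 3} swaps the first two axes
desc.ReorderGivenNew2Old({1, 0, 2, 3});
// lengths are now {1536, 64, 8, 8} and strides {8*8, 1536*8*8, 8, 1}:
// an NCHW view re-labelled as CNHW, as used by the wrw path in host_direct_convolution.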
@@ -67,12 +67,14 @@ struct GeneratorTensor_Checkboard
     }
 };

 int main(int argc, char* argv[])
 {
     using namespace ck;

+    ConvolutionDir dir = Forward;
+
 #if 1
-    constexpr index_t N = 128;
+    constexpr index_t N = 64;
     constexpr index_t C = 1536;
     constexpr index_t HI = 8;
     constexpr index_t WI = 8;
@@ -85,6 +87,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;

+    dir = BackwardWeights;
 #elif 0
     // 3x3, 34x34
     constexpr index_t N = 128;
@@ -477,8 +480,10 @@ int main(int argc, char* argv[])
     using in_data_t = float;
     using out_data_t = float;

-    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<in_data_t> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
+    Tensor<in_data_t> wei_kcyx_device(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<in_data_t> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
+    Tensor<in_data_t> wei_kcyx_host(make_TensorDescriptor(wei_kcyx_desc));
     Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
     Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
@@ -505,8 +510,24 @@ int main(int argc, char* argv[])
     in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
     wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
-    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-    wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    in_nchw_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    assert(in_nchw_device.mData.size() == in_nchw_host.mData.size());
+    for(auto i = 0; i < in_nchw_device.mData.size(); ++i)
+    {
+        in_nchw_host.mData[i] = in_nchw_device.mData[i];
+    }
+    wei_kcyx_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    assert(wei_kcyx_device.mData.size() == wei_kcyx_host.mData.size());
+    for(auto i = 0; i < wei_kcyx_device.mData.size(); ++i)
+    {
+        wei_kcyx_host.mData[i] = wei_kcyx_device.mData[i];
+    }
+    out_nkhw_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    assert(out_nkhw_device.mData.size() == out_nkhw_host.mData.size());
+    for(auto i = 0; i < out_nkhw_device.mData.size(); ++i)
+    {
+        out_nkhw_host.mData[i] = out_nkhw_device.mData[i];
+    }
 #elif 0
     in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
@@ -536,9 +557,9 @@ int main(int argc, char* argv[])
     // this is the same as MIOpen
     // I should modify this one
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
-                                                         in_nchw,
+                                                         in_nchw_device,
                                                          wei_kcyx_desc,
-                                                         wei_kcyx,
+                                                         wei_kcyx_device,
                                                          out_nkhw_desc,
                                                          out_nkhw_device,
                                                          ConvStrides{},
@@ -588,7 +609,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
            ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
         {
@@ -597,15 +618,18 @@ int main(int argc, char* argv[])
         else
 #endif
         {
-            host_direct_convolution(in_nchw,
-                                    wei_kcyx,
+            host_direct_convolution(in_nchw_host,
                                     out_nkhw_host,
-                                    ConvStrides{},
+                                    wei_kcyx_host,
                                     ConvDilations{},
+                                    ConvStrides{},
                                     lower_pads,
-                                    upper_pads);
+                                    upper_pads, dir);
         }

-        check_error(out_nkhw_host, out_nkhw_device);
+        if(dir == Forward)
+            check_error(out_nkhw_host, out_nkhw_device);
+        else
+            check_error(wei_kcyx_host, wei_kcyx_device);
 #if 0
         LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
...
@@ -7,6 +7,10 @@ TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mL
 {
     this->CalculateStrides();
 }

+TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens, std::initializer_list<std::size_t> strides)
+    : mLens(lens), mStrides(strides)
+{
+}
 TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
     : mLens(lens), mStrides(strides)
...
@@ -4,17 +4,17 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
+MY_PROJECT_SOURCE=..
 MY_PROJECT_INSTALL=../install.dir

 cmake \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D CMAKE_CXX_COMPILER=clang++ \
--D CMAKE_BUILD_TYPE=Release \
+-D CMAKE_BUILD_TYPE=Debug \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D DEVICE_BACKEND=NVIDIA \
--D CUDA_COMMON_INCLUDE_DIR="/package/install/cuda/10.1/NVIDIA_CUDA-10.1_Samples/common/inc" \
+-D CUDA_COMMON_INCLUDE_DIR="/home/jehandad/NVIDIA_CUDA-10.1_Samples/common/inc" \
--D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
+-D CMAKE_CUDA_FLAGS="-g -G -Xcompiler -O0 -Xptxas -O0 -lineinfo -O0 -ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
 ${MY_PROJECT_SOURCE}

 #-D BOOST_ROOT="/package/install/boost_1.67.0" \
...
 #!/bin/bash
-MY_PROJECT_SOURCE=../../../
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir
 export CUDA_ROOT=/usr/local/cuda
@@ -15,7 +15,7 @@ cmake
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D DEVICE_BACKEND=NVIDIA \
 -D CUDA_COMMON_INCLUDE_DIR="/root/NVIDIA_CUDA-10.1_Samples/common/inc" \
--D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
+-D CMAKE_CUDA_FLAGS="-g -G -ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
 ${MY_PROJECT_SOURCE}
...