"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "52e4ded85ff2448fc82e0112242c0cbb523632ec"
Commit c1b3fb95 authored by Jehandad Khan

host verification in progress

parent eb8a1bf9
@@ -333,8 +333,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
     // LDS doubel buffer: load next data from device mem
     blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
     blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
                                                 p_wei_register_clipboard);

     // LDS double buffer: GEMM on current data
     blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
@@ -399,8 +399,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
     // origin of dst in device memory
     Float* p_out_thread_on_global =
         p_out_global +
         out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
             k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 #if 1
     threadwise_generic_tensor_slice_copy_v1(
         out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
@@ -412,8 +412,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
         arithmetic_sequence_gen<0, 8, 1>::type{},
         Number<1>{});
-#elif 0
-    p_out_global[0] = p_out_thread[0];
 #endif
 }
 }
...
@@ -3,6 +3,8 @@
 #include "ConstantTensorDescriptor.hpp"

+typedef enum ConvolutionDir{ Forward=0, BackwardData=1, BackwardWeights=2};
+
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
 constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
...
@@ -16,9 +16,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
+                                // the input desc needs to be reordered for wrw: cnhw would be the new order
+                                // the forward kernel always assumes reduction on the second dim and this would make it reduce on the n dimension due to the switching we did
-                                const Tensor<T>& in_nchw,
+                                Tensor<T>& in_nchw,
                                 WeiDesc,
-                                const Tensor<T>& wei_kcyx,
+                                Tensor<T>& wei_kcyx,
                                 OutDesc,
                                 Tensor<T>& out_nkhw,
                                 ConvStrides,
@@ -252,4 +252,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     }

     out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+    in_nchw_device_buf.FromDevice(in_nchw.mData.data());
+    wei_kcyx_device_buf.FromDevice(wei_kcyx.mData.data());
 }
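
The comments added above describe the work-in-progress backward-weights (wrw) path: the input descriptor is reordered to CNHW so that the forward kernel, which always reduces over its second dimension, ends up reducing over N. For reference, the reduction that this relabelling is meant to realize is the usual backward-weights sum; a minimal host-side sketch (illustrative only, not code from this commit; it assumes stride 1, no padding and no dilation) could look like:

#include <cstddef>
#include <vector>

// dwei[k][c][y][x] = sum over n, ho, wo of in[n][c][ho + y][wo + x] * dout[n][k][ho][wo]
void naive_backward_weights(const std::vector<float>& in,   // NCHW, size N*C*HI*WI
                            const std::vector<float>& dout, // NKHW, size N*K*HO*WO
                            std::vector<float>& dwei,       // KCYX, size K*C*Y*X
                            std::size_t N, std::size_t C, std::size_t HI, std::size_t WI,
                            std::size_t K, std::size_t Y, std::size_t X)
{
    const std::size_t HO = HI - Y + 1;
    const std::size_t WO = WI - X + 1;

    for(std::size_t k = 0; k < K; ++k)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t y = 0; y < Y; ++y)
                for(std::size_t x = 0; x < X; ++x)
                {
                    double v = 0;
                    // the reduction runs over N and the output window; swapping the N and C
                    // axes lets a forward kernel that reduces over its second ("C") dimension
                    // carry out exactly this sum
                    for(std::size_t n = 0; n < N; ++n)
                        for(std::size_t ho = 0; ho < HO; ++ho)
                            for(std::size_t wo = 0; wo < WO; ++wo)
                                v += double(in[((n * C + c) * HI + ho + y) * WI + wo + x]) *
                                     double(dout[((n * K + k) * HO + ho) * WO + wo]);
                    dwei[((k * C + c) * Y + y) * X + x] = static_cast<float>(v);
                }
}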
 #pragma once
 #include "tensor.hpp"
 #include "common_header.hpp"
+#include "conv_common.hpp"
 #include "ConstantTensorDescriptor.hpp"

 // this is ugly, only for 4d
@@ -52,15 +53,26 @@ template <class TIn,
           class ConvDilations,
           class LowerPads,
           class UpperPads>
-void host_direct_convolution(const Tensor<TIn>& in_nchw,
-                             const Tensor<TWei>& wei_kcyx,
+void host_direct_convolution(Tensor<TIn>& in_nchw,
+                             Tensor<TWei>& wei_kcyx,
                              Tensor<TOut>& out_nkhw,
                              ConvStrides,
                              ConvDilations,
                              LowerPads,
-                             UpperPads)
+                             UpperPads,
+                             ConvolutionDir dir)
 {
     using namespace ck;

+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+#if 1
+    // wrw
+    in_nchw.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
+    wei_kcyx.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
+    out_nkhw.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
+#endif
     index_t h_pad_low = LowerPads{}.Get(Number<0>{});
     index_t w_pad_low = LowerPads{}.Get(Number<1>{});
@@ -81,7 +93,7 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
                         if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
                            wi < in_nchw.mDesc.GetLengths()[3])
                         {
-                            v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
+                            v += double(in_nchw(n, c, hi, wi)) /*double(wei_kcyx(k, c, y, x))*/;
                         }
                     }
                 }
...
@@ -101,6 +101,22 @@ struct TensorDescriptor
         std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
         return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
     }

+    void ReorderGivenNew2Old(std::vector<std::size_t> is)
+    {
+        assert(mLens.size() == is.size());
+        assert(mStrides.size() == is.size());
+        std::vector<std::size_t> newLens(mLens.size());
+        std::vector<std::size_t> newStrides(mStrides.size());
+        auto cnt = 0;
+        for(auto& idx : is)
+        {
+            newLens[cnt]    = mLens[idx];
+            newStrides[cnt] = mStrides[idx];
+            ++cnt;
+        }
+        mLens    = newLens;
+        mStrides = newStrides;
+    }
     private:
     std::vector<std::size_t> mLens;
...
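
The new ReorderGivenNew2Old only permutes the descriptor's lengths and strides; the data itself is not moved, so the same buffer is simply re-labelled. A small usage sketch with illustrative values (assuming the usual packed row-major strides produced by CalculateStrides):

// NCHW descriptor for N=64, C=1536, H=8, W=8; packed strides {1536*8*8, 8*8, 8, 1}
TensorDescriptor desc({64, 1536, 8, 8});
// new dimension i takes old dimension is[i], so {1, 0, 2, 3} swaps the first two axes
desc.ReorderGivenNew2Old({1, 0, 2, 3});
// lengths are now {1536, 64, 8, 8} and strides {8*8, 1536*8*8, 8, 1}:
// an NCHW view re-labelled as CNHW, as used by the wrw path in host_direct_convolution.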
@@ -67,12 +67,14 @@ struct GeneratorTensor_Checkboard
     }
 };

 int main(int argc, char* argv[])
 {
     using namespace ck;

+    ConvolutionDir dir = Forward;
+
 #if 1
-    constexpr index_t N = 128;
+    constexpr index_t N = 64;
     constexpr index_t C = 1536;
     constexpr index_t HI = 8;
     constexpr index_t WI = 8;
@@ -85,6 +87,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;

+    dir = BackwardWeights;
 #elif 0
     // 3x3, 34x34
     constexpr index_t N = 128;
@@ -477,8 +480,10 @@ int main(int argc, char* argv[])
     using in_data_t = float;
     using out_data_t = float;

-    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<in_data_t> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
+    Tensor<in_data_t> wei_kcyx_device(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<in_data_t> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
+    Tensor<in_data_t> wei_kcyx_host(make_TensorDescriptor(wei_kcyx_desc));
     Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
     Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
@@ -505,8 +510,24 @@ int main(int argc, char* argv[])
     in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
     wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
-    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-    wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    in_nchw_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    assert(in_nchw_device.mData.size() == in_nchw_host.mData.size());
+    for(auto i = 0; i < in_nchw_device.mData.size(); ++i)
+    {
+        in_nchw_host.mData[i] = in_nchw_device.mData[i];
+    }
+    wei_kcyx_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    assert(wei_kcyx_device.mData.size() == wei_kcyx_host.mData.size());
+    for(auto i = 0; i < wei_kcyx_device.mData.size(); ++i)
+    {
+        wei_kcyx_host.mData[i] = wei_kcyx_device.mData[i];
+    }
+    out_nkhw_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    assert(out_nkhw_device.mData.size() == out_nkhw_host.mData.size());
+    for(auto i = 0; i < out_nkhw_device.mData.size(); ++i)
+    {
+        out_nkhw_host.mData[i] = out_nkhw_device.mData[i];
+    }
 #elif 0
     in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
@@ -536,9 +557,9 @@ int main(int argc, char* argv[])
     // this is the same as MIOpen
     // I should modify this one
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
-                                                         in_nchw,
+                                                         in_nchw_device,
                                                          wei_kcyx_desc,
-                                                         wei_kcyx,
+                                                         wei_kcyx_device,
                                                          out_nkhw_desc,
                                                          out_nkhw_device,
                                                          ConvStrides{},
@@ -588,7 +609,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
            ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
         {
@@ -597,15 +618,18 @@ int main(int argc, char* argv[])
         else
 #endif
         {
-            host_direct_convolution(in_nchw,
-                                    wei_kcyx,
+            host_direct_convolution(in_nchw_host,
                                     out_nkhw_host,
-                                    ConvStrides{},
+                                    wei_kcyx_host,
                                     ConvDilations{},
+                                    ConvStrides{},
                                     lower_pads,
-                                    upper_pads);
+                                    upper_pads, dir);
         }

-        check_error(out_nkhw_host, out_nkhw_device);
+        if(dir == Forward)
+            check_error(out_nkhw_host, out_nkhw_device);
+        else
+            check_error(wei_kcyx_host, wei_kcyx_device);
 #if 0
         LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
...
@@ -7,6 +7,10 @@ TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mL
 {
     this->CalculateStrides();
 }

+TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens, std::initializer_list<std::size_t> strides)
+    : mLens(lens), mStrides(strides)
+{
+}
 TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
     : mLens(lens), mStrides(strides)
...
@@ -4,17 +4,17 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
+MY_PROJECT_SOURCE=..
 MY_PROJECT_INSTALL=../install.dir

 cmake \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D CMAKE_CXX_COMPILER=clang++ \
--D CMAKE_BUILD_TYPE=Release \
+-D CMAKE_BUILD_TYPE=Debug \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D DEVICE_BACKEND=NVIDIA \
--D CUDA_COMMON_INCLUDE_DIR="/package/install/cuda/10.1/NVIDIA_CUDA-10.1_Samples/common/inc" \
+-D CUDA_COMMON_INCLUDE_DIR="/home/jehandad/NVIDIA_CUDA-10.1_Samples/common/inc" \
--D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
+-D CMAKE_CUDA_FLAGS="-g -G -Xcompiler -O0 -Xptxas -O0 -lineinfo -O0 -ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
 ${MY_PROJECT_SOURCE}

 #-D BOOST_ROOT="/package/install/boost_1.67.0" \
...
 #!/bin/bash
-MY_PROJECT_SOURCE=../../../
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir
 export CUDA_ROOT=/usr/local/cuda
@@ -15,7 +15,7 @@ cmake
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D DEVICE_BACKEND=NVIDIA \
 -D CUDA_COMMON_INCLUDE_DIR="/root/NVIDIA_CUDA-10.1_Samples/common/inc" \
--D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
+-D CMAKE_CUDA_FLAGS="-g -G -ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
 ${MY_PROJECT_SOURCE}
...