Commit c1b3fb95 authored by Jehandad Khan's avatar Jehandad Khan
Browse files

host verification in progress

parent eb8a1bf9
......@@ -412,8 +412,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
arithmetic_sequence_gen<0, 8, 1>::type{},
Number<1>{});
#elif 0
p_out_global[0] = p_out_thread[0];
#endif
}
}
......
......@@ -3,6 +3,8 @@
#include "ConstantTensorDescriptor.hpp"
// Direction of the convolution pass, used by the host verification path to
// decide which tensor to compare (output for Forward, weights for
// BackwardWeights).
// Plain enum declaration: the original `typedef enum ConvolutionDir{...};`
// supplied no typedef alias name, which compilers reject or warn about
// ("typedef requires a name").
enum ConvolutionDir
{
    Forward         = 0,
    BackwardData    = 1,
    BackwardWeights = 2
};
// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
......
......@@ -16,9 +16,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
// the input desc needs to be reordered for wrw : cnhw would be the new order
// the forward kernel always assumes reduction on the second dim and this would make it reduce on the n dimension due to the switching we did
const Tensor<T>& in_nchw,
Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
......@@ -252,4 +252,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
in_nchw_device_buf.FromDevice(in_nchw.mData.data());
wei_kcyx_device_buf.FromDevice(wei_kcyx.mData.data());
}
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "conv_common.hpp"
#include "ConstantTensorDescriptor.hpp"
// this is ugly, only for 4d
......@@ -52,15 +53,26 @@ template <class TIn,
class ConvDilations,
class LowerPads,
class UpperPads>
void host_direct_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
void host_direct_convolution(Tensor<TIn>& in_nchw,
Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
ConvStrides,
ConvDilations,
LowerPads,
UpperPads)
UpperPads,
ConvolutionDir dir)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
#if 1
// wrw
in_nchw.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
wei_kcyx.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
out_nkhw.mDesc.ReorderGivenNew2Old({1, 0, 2, 3});
#endif
index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{});
......@@ -81,7 +93,7 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_nchw.mDesc.GetLengths()[3])
{
v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
v += double(in_nchw(n, c, hi, wi)) /*double(wei_kcyx(k, c, y, x))*/;
}
}
}
......
......@@ -101,6 +101,22 @@ struct TensorDescriptor
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
// Permute the dimension order of this descriptor in place.
// is[i] is the OLD dimension index that becomes NEW dimension i, so e.g.
// {1, 0, 2, 3} swaps the first two dimensions (NCHW -> CNHW).
// Only the length/stride metadata is permuted; the underlying data buffer
// this descriptor indexes into is untouched.
// Changes vs. original: take the permutation by const reference (avoids a
// vector copy per call; callers passing braced lists still bind), use a
// size_t index instead of a signed `auto cnt = 0`, and move the scratch
// vectors into the members instead of copying them.
void ReorderGivenNew2Old(const std::vector<std::size_t>& is)
{
    assert(mLens.size() == is.size());
    assert(mStrides.size() == is.size());

    std::vector<std::size_t> newLens(mLens.size());
    std::vector<std::size_t> newStrides(mStrides.size());

    for(std::size_t i = 0; i < is.size(); ++i)
    {
        newLens[i]    = mLens[is[i]];
        newStrides[i] = mStrides[is[i]];
    }

    mLens    = std::move(newLens);
    mStrides = std::move(newStrides);
}
private:
std::vector<std::size_t> mLens;
......
......@@ -67,12 +67,14 @@ struct GeneratorTensor_Checkboard
}
};
int main(int argc, char* argv[])
{
using namespace ck;
ConvolutionDir dir = Forward;
#if 1
constexpr index_t N = 128;
constexpr index_t N = 64;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
......@@ -85,6 +87,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
dir = BackwardWeights;
#elif 0
// 3x3, 34x34
constexpr index_t N = 128;
......@@ -477,8 +480,10 @@ int main(int argc, char* argv[])
using in_data_t = float;
using out_data_t = float;
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<in_data_t> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx_device(make_TensorDescriptor(wei_kcyx_desc));
Tensor<in_data_t> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx_host(make_TensorDescriptor(wei_kcyx_desc));
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
......@@ -505,8 +510,24 @@ int main(int argc, char* argv[])
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 1
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
in_nchw_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
assert(in_nchw_device.mData.size() == in_nchw_host.mData.size());
for(auto i = 0; i < in_nchw_device.mData.size(); ++i)
{
in_nchw_host.mData[i] = in_nchw_device.mData[i];
}
wei_kcyx_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
assert(wei_kcyx_device.mData.size() == wei_kcyx_host.mData.size());
for(auto i =0; i < wei_kcyx_device.mData.size(); ++i)
{
wei_kcyx_host.mData[i] = wei_kcyx_device.mData[i];
}
out_nkhw_device.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
assert(out_nkhw_device.mData.size() == out_nkhw_host.mData.size());
for(auto i = 0; i < out_nkhw_device.mData.size(); ++i)
{
out_nkhw_host.mData[i] = out_nkhw_device.mData[i];
}
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
......@@ -536,9 +557,9 @@ int main(int argc, char* argv[])
// this is the same as MIOpen
// I should modify this one
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
in_nchw_device,
wei_kcyx_desc,
wei_kcyx,
wei_kcyx_device,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
......@@ -588,7 +609,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
#if 1
#if 0
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
{
......@@ -597,15 +618,18 @@ int main(int argc, char* argv[])
else
#endif
{
host_direct_convolution(in_nchw,
wei_kcyx,
host_direct_convolution(in_nchw_host,
out_nkhw_host,
ConvStrides{},
wei_kcyx_host,
ConvDilations{},
ConvStrides{},
lower_pads,
upper_pads);
upper_pads, dir);
}
if(dir == Forward)
check_error(out_nkhw_host, out_nkhw_device);
else
check_error(wei_kcyx_host, wei_kcyx_device);
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
......
......@@ -7,6 +7,10 @@ TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mL
{
this->CalculateStrides();
}
// Construct a descriptor from explicit lengths and strides, taking both
// verbatim (no stride computation is performed, unlike the lengths-only
// constructor above, which calls CalculateStrides()).
// NOTE(review): lens.size() == strides.size() is assumed but not checked
// here — confirm all callers guarantee matching ranks.
TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens, std::initializer_list<std::size_t> strides)
: mLens(lens), mStrides(strides)
{
}
TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
: mLens(lens), mStrides(strides)
......
......@@ -4,17 +4,17 @@ rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
MY_PROJECT_SOURCE=..
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_CXX_COMPILER=clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_BUILD_TYPE=Debug \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D DEVICE_BACKEND=NVIDIA \
-D CUDA_COMMON_INCLUDE_DIR="/package/install/cuda/10.1/NVIDIA_CUDA-10.1_Samples/common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
-D CUDA_COMMON_INCLUDE_DIR="/home/jehandad/NVIDIA_CUDA-10.1_Samples/common/inc" \
-D CMAKE_CUDA_FLAGS="-g -G -Xcompiler -O0 -Xptxas -O0 -lineinfo -O0 -ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
${MY_PROJECT_SOURCE}
#-D BOOST_ROOT="/package/install/boost_1.67.0" \
......
#!/bin/bash
MY_PROJECT_SOURCE=../../../
MY_PROJECT_SOURCE=../
MY_PROJECT_INSTALL=../install.dir
export CUDA_ROOT=/usr/local/cuda
......@@ -15,7 +15,7 @@ cmake
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D DEVICE_BACKEND=NVIDIA \
-D CUDA_COMMON_INCLUDE_DIR="/root/NVIDIA_CUDA-10.1_Samples/common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
-D CMAKE_CUDA_FLAGS="-g -G -ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_52,code=sm_52" \
${MY_PROJECT_SOURCE}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment