Commit d51b8158 authored by Chao Liu's avatar Chao Liu
Browse files

cpu direct conv

parent 06c9f9fe
...@@ -41,5 +41,7 @@ link_libraries(${PYTHON_LIBRARIES}) ...@@ -41,5 +41,7 @@ link_libraries(${PYTHON_LIBRARIES})
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
# #
include_directories(BEFORE src/include)
add_subdirectory(src) add_subdirectory(src)
add_subdirectory(driver) add_subdirectory(driver)
#include <iostream>
#include "tensor.hpp"

// Naive direct convolution over NCHW input and KCYX weights: no padding,
// stride 1, so the output spatial size is (Hi - Y + 1) x (Wi - X + 1).
// Every output element is independent, so the per-element functor is
// distributed across num_thread worker threads.
template <typename T>
void direct_convolution(const Tensor<T>& in,
                        const Tensor<T>& wei,
                        Tensor<T>& out,
                        std::size_t num_thread)
{
    // Accumulate in double to limit rounding error, narrow on the store.
    auto per_output_point = [&](auto n, auto k, auto ho, auto wo) {
        double acc = 0;
        for(std::size_t c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
        {
            for(std::size_t y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                std::size_t hi = ho + y;
                for(std::size_t x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    std::size_t wi = wo + x;
                    acc += in(n, c, hi, wi) * wei(k, c, y, x);
                }
            }
        }
        out(n, k, ho, wo) = acc;
    };

    // Parallelize over the full 4-d output index space (N, K, Ho, Wo).
    make_ParallelTensorFunctor(per_output_point,
                               out.mDesc.GetLengths()[0],
                               out.mDesc.GetLengths()[1],
                               out.mDesc.GetLengths()[2],
                               out.mDesc.GetLengths()[3])(num_thread);
}
// Constant-valued generator: yields 1 for any index tuple of any arity.
// Used to fill tensors with a known value so convolution output is easy
// to verify by inspection.
template <class T>
struct Generator
{
    template <class... Is>
    T operator()(Is...)
    {
        return 1;
    }
};
int main()
{
    // Fixed problem size: N=3, C=16, 128x128 input; K=4 filters of 3x3.
    // With no padding and stride 1 the output is 126x126.
    Tensor<float> in({3, 16, 128, 128});
    Tensor<float> wei({4, 16, 3, 3});
    Tensor<float> out({3, 4, 126, 126});

    int num_thread = std::thread::hardware_concurrency();

    std::cout << __func__ << ": num_thread " << num_thread << std::endl;

    // Fill input and weights with a known pattern before convolving.
    in.GenerateTensorValue(Generator<float>{}, num_thread);
    wei.GenerateTensorValue(Generator<float>{}, num_thread);

    direct_convolution(in, wei, out, num_thread);

    std::cout << __func__ << ": done" << std::endl;

    // Dump all three tensors for manual inspection.
    LogRange(std::cout, in.mData, ",") << std::endl;
    LogRange(std::cout, wei.mData, ",") << std::endl;
    LogRange(std::cout, out.mData, ",") << std::endl;
}
include_directories(BEFORE include)
set(SOURCE set(SOURCE
tensor.cpp; tensor.cpp;
) )
...@@ -11,7 +9,7 @@ set_target_properties(convolution PROPERTIES PREFIX "") ...@@ -11,7 +9,7 @@ set_target_properties(convolution PROPERTIES PREFIX "")
target_link_libraries(convolution boost_python3) target_link_libraries(convolution boost_python3)
# cuda # cuda
target_link_libraries(convolution nvToolsExt) target_link_libraries(convolution nvToolsExt cudart)
target_compile_features(convolution PUBLIC) target_compile_features(convolution PUBLIC)
set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
......
#include <thread> #include <thread>
#include <vector> #include <vector>
#include <numeric> #include <numeric>
#include <algorithm>
#include <utility> #include <utility>
#include <cassert>
#include <iostream>
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include "helper_cuda.h" #include "helper_cuda.h"
// Stream the elements of any range to `os`, separated by `delim` (no
// leading or trailing delimiter). Returns `os` so calls can be chained,
// e.g. `LogRange(std::cout, v, ",") << std::endl;`.
template <class Range>
std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
{
    const char* sep = ""; // nothing before the first element
    for(auto&& x : r)
    {
        os << sep << x;
        sep = delim.c_str(); // `delim` outlives the loop: taken by value
    }
    return os;
}
typedef enum typedef enum
{ {
Half = 0, Half = 0,
...@@ -19,6 +37,34 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float> ...@@ -19,6 +37,34 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{ {
}; };
// Implementation detail of call_f_unpack_args: invoke func with the
// tuple slots selected by the index_sequence expanded as arguments.
template <class F, class T, std::size_t... Is>
auto call_f_unpack_args_impl(F func, T packed, std::index_sequence<Is...>)
{
    return func(std::get<Is>(packed)...);
}
// Invoke `f` with the elements of tuple-like `args` expanded as
// individual positional arguments (a pre-C++17 std::apply).
template <class F, class T>
auto call_f_unpack_args(F f, T args)
{
    return call_f_unpack_args_impl(
        f, args, std::make_index_sequence<std::tuple_size<T>::value>{});
}
// Implementation detail of construct_f_unpack_args: construct an F from
// the tuple slots selected by the index_sequence.
template <class F, class T, std::size_t... Is>
auto construct_f_unpack_args_impl(T packed, std::index_sequence<Is...>)
{
    return F(std::get<Is>(packed)...);
}
// Construct an F from the elements of tuple-like `args` expanded as
// constructor arguments. The F parameter is used only for deduction.
template <class F, class T>
auto construct_f_unpack_args(F, T args)
{
    return construct_f_unpack_args_impl<F>(
        args, std::make_index_sequence<std::tuple_size<T>::value>{});
}
struct TensorDescriptor struct TensorDescriptor
{ {
TensorDescriptor() = delete; TensorDescriptor() = delete;
...@@ -50,12 +96,12 @@ struct TensorDescriptor ...@@ -50,12 +96,12 @@ struct TensorDescriptor
const std::vector<std::size_t>& GetLengths() const; const std::vector<std::size_t>& GetLengths() const;
const std::vector<std::size_t>& GetStrides() const; const std::vector<std::size_t>& GetStrides() const;
template <class... Xs> template <class... Is>
std::size_t Get1dIndex(Xs... xs) const std::size_t Get1dIndex(Is... is) const
{ {
assert(sizeof...(Xs) == this->GetDimension()); assert(sizeof...(Is) == this->GetDimension());
std::initializer_list<std::size_t> is{xs...}; std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0}); return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
} }
private: private:
...@@ -65,90 +111,6 @@ struct TensorDescriptor ...@@ -65,90 +111,6 @@ struct TensorDescriptor
DataType_t mDataType; DataType_t mDataType;
}; };
// Dense host-side tensor: a TensorDescriptor (lengths/strides) plus flat
// element storage sized by the descriptor's element space.
template <class T>
struct Tensor
{
    template <class X>
    Tensor(std::initializer_list<X> lens)
        : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X>
    Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X, class Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    // Fill every element with g(i0, ..., i{D-1}). Dispatches on runtime
    // dimension because the generator arity must match the rank.
    // Throws std::runtime_error for ranks other than 1..4.
    template <class G>
    void GenerateTensorValue(G g)
    {
        switch(mDesc.GetDimension())
        {
        case 1:
        {
            // BUG FIX: was `mData(...)` — mData is a std::vector and is
            // indexed with operator[], not called like a functor.
            ParallelTensorFunctor([&](auto i) { mData[mDesc.Get1dIndex(i)] = g(i); },
                                  mDesc.GetLengths()[0])();
            break;
        }
        case 2:
        {
            ParallelTensorFunctor(
                [&](auto i0, auto i1) { mData[mDesc.Get1dIndex(i0, i1)] = g(i0, i1); },
                mDesc.GetLengths()[0],
                mDesc.GetLengths()[1])();
            break;
        }
        case 3:
        {
            ParallelTensorFunctor(
                [&](auto i0, auto i1, auto i2) {
                    mData[mDesc.Get1dIndex(i0, i1, i2)] = g(i0, i1, i2);
                },
                mDesc.GetLengths()[0],
                mDesc.GetLengths()[1],
                mDesc.GetLengths()[2])();
            break;
        }
        case 4:
        {
            ParallelTensorFunctor(
                [&](auto i0, auto i1, auto i2, auto i3) {
                    mData[mDesc.Get1dIndex(i0, i1, i2, i3)] = g(i0, i1, i2, i3);
                },
                mDesc.GetLengths()[0],
                mDesc.GetLengths()[1],
                // BUG FIX: was GetLengths()[3] and GetLengths()[4] — a 4-d
                // tensor has lengths [0..3], so [4] read past the end and
                // the last two extents were wrong.
                mDesc.GetLengths()[2],
                mDesc.GetLengths()[3])();
            break;
        }
        default: throw std::runtime_error("unsupported dimension");
        }
    }

    // Bounds-checked flat element access.
    T& operator[](std::size_t i) { return mData.at(i); }
    const T& operator[](std::size_t i) const { return mData.at(i); }

    typename std::vector<T>::iterator begin() { return mData.begin(); }
    typename std::vector<T>::iterator end() { return mData.end(); }
    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    TensorDescriptor mDesc;
    std::vector<T> mData;
};
struct GpuMem struct GpuMem
{ {
GpuMem() = delete; GpuMem() = delete;
...@@ -194,12 +156,6 @@ struct joinable_thread : std::thread ...@@ -194,12 +156,6 @@ struct joinable_thread : std::thread
template <class F, class... Xs> template <class F, class... Xs>
struct ParallelTensorFunctor struct ParallelTensorFunctor
{ {
enum ParallelMethod_t
{
Serial = 0,
Parallel = 1,
};
F mF; F mF;
static constexpr std::size_t NDIM = sizeof...(Xs); static constexpr std::size_t NDIM = sizeof...(Xs);
std::array<std::size_t, NDIM> mLens; std::array<std::size_t, NDIM> mLens;
...@@ -229,16 +185,7 @@ struct ParallelTensorFunctor ...@@ -229,16 +185,7 @@ struct ParallelTensorFunctor
return indices; return indices;
} }
void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Serial>) void operator()(std::size_t num_thread) const
{
for(std::size_t i = 0; i < mN1d; ++i)
{
call_f_unpack_args(mF, GetNdIndices(i));
}
}
void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Parallel>,
std::size_t num_thread)
{ {
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread; std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
...@@ -247,7 +194,7 @@ struct ParallelTensorFunctor ...@@ -247,7 +194,7 @@ struct ParallelTensorFunctor
for(std::size_t it = 0; it < num_thread; ++it) for(std::size_t it = 0; it < num_thread; ++it)
{ {
std::size_t iw_begin = it * work_per_thread; std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min(((it + 1) * work_per_thread, mN1d)); std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
auto f = [=] { auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw) for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
...@@ -260,30 +207,92 @@ struct ParallelTensorFunctor ...@@ -260,30 +207,92 @@ struct ParallelTensorFunctor
} }
}; };
template <class F, class T> template <class F, class... Xs>
auto call_f_unpack_args(F f, T args) auto make_ParallelTensorFunctor(F f, Xs... xs)
{ {
static constexpr std::size_t N = std::tuple_size<T>::value; return ParallelTensorFunctor<F, Xs...>(f, xs...);
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
} }
template <class F, class T, class... Is> template <class T>
auto call_f_unpack_args_impl(F f, T args, std::integer_sequence<Is...>) struct Tensor
{ {
return f(std::get<Is>(args)...); template <class X>
} Tensor(std::initializer_list<X> lens)
: mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
{
}
template <class F, class T, class... Is> template <class X>
auto construct_f_unpack_args_impl(T args, std::integer_sequence<Is...>) Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
{ {
return F(std::get<Is>(args)...); }
}
template <class F, class T> template <class X, class Y>
auto construct_f_unpack_args(F, T args) Tensor(std::vector<X> lens, std::vector<Y> strides)
{ : mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
static constexpr std::size_t N = std::tuple_size<T>::value; {
}
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{}); template <class G>
} void GenerateTensorValue(G g, std::size_t num_thread = 1)
{
switch(mDesc.GetDimension())
{
case 1:
{
auto f = [&](auto i) { (*this)(i) = g(i); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
break;
}
case 2:
{
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
break;
}
case 3:
{
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
make_ParallelTensorFunctor(
f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
break;
}
case 4:
{
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
};
make_ParallelTensorFunctor(f,
mDesc.GetLengths()[0],
mDesc.GetLengths()[1],
mDesc.GetLengths()[2],
mDesc.GetLengths()[3])(num_thread);
break;
}
default: throw std::runtime_error("unspported dimension");
}
}
template <class... Is>
T& operator()(Is... is)
{
return mData[mDesc.Get1dIndex(is...)];
}
template <class... Is>
const T& operator()(Is... is) const
{
return mData[mDesc.Get1dIndex(is...)];
}
typename std::vector<T>::iterator begin() { return mData.begin(); }
typename std::vector<T>::iterator end() { return mData.end(); }
typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
typename std::vector<T>::const_iterator end() const { return mData.end(); }
TensorDescriptor mDesc;
std::vector<T> mData;
};
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment