"src/include/threadwise_4d_tensor_op.cuh" did not exist on "c5877261905a36238d8162c817478cff9bc77bc4"
Commit d51b8158 authored by Chao Liu

cpu direct conv

parent 06c9f9fe
@@ -41,5 +41,7 @@ link_libraries(${PYTHON_LIBRARIES})
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
#
include_directories(BEFORE src/include)
add_subdirectory(src)
add_subdirectory(driver)
#include <iostream>
#include "tensor.hpp"
template <typename T>
void direct_convolution(const Tensor<T>& in,
                        const Tensor<T>& wei,
                        Tensor<T>& out,
                        std::size_t num_thread)
{
    auto f = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;

        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                int hi = ho + y;

                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    int wi = wo + x;
                    v += in(n, c, hi, wi) * wei(k, c, y, x);
                }
            }
        }

        out(n, k, ho, wo) = v;
    };

    auto f_par = make_ParallelTensorFunctor(f,
                                            out.mDesc.GetLengths()[0],
                                            out.mDesc.GetLengths()[1],
                                            out.mDesc.GetLengths()[2],
                                            out.mDesc.GetLengths()[3]);

    f_par(num_thread);
}
template <class T>
struct Generator
{
    template <class... Is>
    T operator()(Is... is)
    {
        return 1;
    }
};
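// Generator returns 1 for every index, so the input and weight tensors below are filled with ones.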
int main()
{
Tensor<float> in({3, 16, 128, 128});
Tensor<float> wei({4, 16, 3, 3});
Tensor<float> out({3, 4, 126, 126});
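// Output spatial size is 126 = 128 - 3 + 1 (3x3 filter, unit stride, no padding).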
int num_thread = std::thread::hardware_concurrency();
std::cout << __func__ << ": num_thread " << num_thread << std::endl;
in.GenerateTensorValue(Generator<float>{}, num_thread);
wei.GenerateTensorValue(Generator<float>{}, num_thread);
direct_convolution(in, wei, out, num_thread);
std::cout << __func__ << ": done" << std::endl;
LogRange(std::cout, in.mData, ",") << std::endl;
LogRange(std::cout, wei.mData, ",") << std::endl;
LogRange(std::cout, out.mData, ",") << std::endl;
}
include_directories(BEFORE include)
set(SOURCE
tensor.cpp;
)
@@ -11,7 +9,7 @@ set_target_properties(convolution PROPERTIES PREFIX "")
target_link_libraries(convolution boost_python3)
# cuda
target_link_libraries(convolution nvToolsExt cudart)
target_compile_features(convolution PUBLIC)
set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
#include <thread>
#include <vector>
#include <array>
#include <string>
#include <numeric>
#include <algorithm>
#include <utility>
#include <cassert>
#include <iostream>
#include "cuda_runtime.h"
#include "helper_cuda.h"
template <class Range>
std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
{
bool first = true;
for(auto&& x : r)
{
if(first)
first = false;
else
os << delim;
os << x;
}
return os;
}
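// Example: LogRange(std::cout, std::vector<int>{1, 2, 3}, ",") prints "1,2,3".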
typedef enum
{
Half = 0,
@@ -19,6 +37,34 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
template <class F, class T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
return f(std::get<Is>(args)...);
}
template <class F, class T>
auto call_f_unpack_args(F f, T args)
{
constexpr std::size_t N = std::tuple_size<T>::value;
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}
template <class F, class T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
return F(std::get<Is>(args)...);
}
template <class F, class T>
auto construct_f_unpack_args(F, T args)
{
constexpr std::size_t N = std::tuple_size<T>::value;
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}
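// Example: call_f_unpack_args(f, std::make_tuple(0, 1, 2)) expands to f(0, 1, 2);
// construct_f_unpack_args(F{}, args) similarly constructs an F from the unpacked tuple elements.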
struct TensorDescriptor
{
TensorDescriptor() = delete;
@@ -50,12 +96,12 @@ struct TensorDescriptor
const std::vector<std::size_t>& GetLengths() const;
const std::vector<std::size_t>& GetStrides() const;
template <class... Is>
std::size_t Get1dIndex(Is... is) const
{
    assert(sizeof...(Is) == this->GetDimension());
    std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
    return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
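// Get1dIndex maps an N-d index to a linear storage offset: sum_i is[i] * mStrides[i].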
private:
@@ -65,90 +111,6 @@ struct TensorDescriptor
DataType_t mDataType;
};
template <class T>
struct Tensor
{
template <class X>
Tensor(std::initializer_list<X> lens)
: mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
{
}
template <class X>
Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
{
}
template <class X, class Y>
Tensor(std::vector<X> lens, std::vector<Y> strides)
: mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
{
}
template <class G>
void GenerateTensorValue(G g)
{
// ParallelTensorFunctor([&](Xs... xs) { mData(mDesc.Get1dIndex(xs...)) = g(xs...); },
// mDesc.mLens)();
switch(mDesc.GetDimension())
{
case 1:
{
ParallelTensorFunctor([&](auto i) { mData(mDesc.Get1dIndex(i)) = g(i); },
mDesc.GetLengths()[0])();
break;
}
case 2:
{
ParallelTensorFunctor(
[&](auto i0, auto i1) { mData(mDesc.Get1dIndex(i0, i1)) = g(i0, i1); },
mDesc.GetLengths()[0],
mDesc.GetLengths()[1])();
break;
}
case 3:
{
ParallelTensorFunctor(
[&](auto i0, auto i1, auto i2) {
mData(mDesc.Get1dIndex(i0, i1, i2)) = g(i0, i1, i2);
},
mDesc.GetLengths()[0],
mDesc.GetLengths()[1],
mDesc.GetLengths()[2])();
break;
}
case 4:
{
ParallelTensorFunctor(
[&](auto i0, auto i1, auto i2, auto i3) {
mData(mDesc.Get1dIndex(i0, i1, i2, i3)) = g(i0, i1, i2, i3);
},
mDesc.GetLengths()[0],
mDesc.GetLengths()[1],
mDesc.GetLengths()[3],
mDesc.GetLengths()[4])();
break;
}
default: throw std::runtime_error("unspported dimension");
}
}
T& operator[](std::size_t i) { return mData.at(i); }
const T& operator[](std::size_t i) const { return mData.at(i); }
typename std::vector<T>::iterator begin() { return mData.begin(); }
typename std::vector<T>::iterator end() { return mData.end(); }
typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
typename std::vector<T>::const_iterator end() const { return mData.end(); }
TensorDescriptor mDesc;
std::vector<T> mData;
};
struct GpuMem
{
GpuMem() = delete;
@@ -194,12 +156,6 @@ struct joinable_thread : std::thread
template <class F, class... Xs>
struct ParallelTensorFunctor
{
enum ParallelMethod_t
{
Serial = 0,
Parallel = 1,
};
F mF;
static constexpr std::size_t NDIM = sizeof...(Xs);
std::array<std::size_t, NDIM> mLens;
@@ -229,16 +185,7 @@
return indices;
}
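// GetNdIndices converts a flattened index in [0, mN1d) back into per-dimension indices.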
void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Serial>)
{
for(std::size_t i = 0; i < mN1d; ++i)
{
call_f_unpack_args(mF, GetNdIndices(i));
}
}
void operator()(std::size_t num_thread) const
{
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
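// Each thread handles a contiguous chunk of ceil(mN1d / num_thread) flattened indices.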
@@ -247,7 +194,7 @@
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
@@ -260,30 +207,92 @@
}
};
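// Deduces F and the extent types; usage pattern (functor and extents are whatever the
// caller supplies): make_ParallelTensorFunctor(f, n0, n1, n2, n3)(num_thread);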
template <class F, class... Xs>
auto make_ParallelTensorFunctor(F f, Xs... xs)
{
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
}

template <class T>
struct Tensor
{
    template <class X>
    Tensor(std::initializer_list<X> lens)
        : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X>
    Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X, class Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
    {
    }
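    // Fill the tensor by evaluating g at every index; dispatches on the tensor's
    // dimension and runs the loop nest in parallel over num_thread threads.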
template <class G>
void GenerateTensorValue(G g, std::size_t num_thread = 1)
{
switch(mDesc.GetDimension())
{
case 1:
{
auto f = [&](auto i) { (*this)(i) = g(i); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
break;
}
case 2:
{
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
break;
}
case 3:
{
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
make_ParallelTensorFunctor(
f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
break;
}
case 4:
{
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
};
make_ParallelTensorFunctor(f,
mDesc.GetLengths()[0],
mDesc.GetLengths()[1],
mDesc.GetLengths()[2],
mDesc.GetLengths()[3])(num_thread);
break;
}
default: throw std::runtime_error("unsupported dimension");
}
}
template <class... Is>
T& operator()(Is... is)
{
return mData[mDesc.Get1dIndex(is...)];
}
template <class... Is>
const T& operator()(Is... is) const
{
return mData[mDesc.Get1dIndex(is...)];
}
typename std::vector<T>::iterator begin() { return mData.begin(); }
typename std::vector<T>::iterator end() { return mData.end(); }
typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
typename std::vector<T>::const_iterator end() const { return mData.end(); }
TensorDescriptor mDesc;
std::vector<T> mData;
};