// NOTE(review): this file was received with its line breaks collapsed and with the contents of
// every angle-bracket pair missing (include targets, template parameter lists and template
// arguments). Those pieces are reconstructed below from how each name is used in the code;
// please confirm them against the intended originals.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <functional>
#include <initializer_list>
#include <numeric>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

// GpuMem below calls the CUDA runtime API. It is compiled only when the runtime header can be
// found, so that the host-only utilities in this file remain usable without a CUDA toolkit.
#if defined(__CUDACC__)
#include <cuda_runtime.h>
#define TENSOR_HAS_CUDA_RUNTIME 1
#elif defined(__has_include)
#if __has_include(<cuda_runtime.h>)
#include <cuda_runtime.h>
#define TENSOR_HAS_CUDA_RUNTIME 1
#endif
#endif

/// Element-type tag carried by a TensorDescriptor.
typedef enum
{
    Half  = 0,
    Float = 1,
} DataType_t;

/// Maps a C++ element type to its DataType_t tag. Only the specializations are complete types.
template <class T>
struct DataType;

// NOTE(review): the specialized type and the tag were lost from the source; `float` -> `Float`
// is assumed because it is the only enumerator with a standard C++ counterpart. Confirm.
template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};

/// Lengths, strides and element-type tag of a tensor.
///
/// The non-template constructors, CalculateStrides(), GetDimension(), GetElementSize() and
/// GetElementSpace() are only declared here; their definitions are not part of this file.
struct TensorDescriptor
{
    TensorDescriptor() = delete;
    TensorDescriptor(DataType_t t, std::initializer_list<std::size_t> lens);
    TensorDescriptor(DataType_t t,
                     std::initializer_list<std::size_t> lens,
                     std::initializer_list<std::size_t> strides);
    TensorDescriptor(DataType_t t, std::vector<std::size_t> lens, std::vector<std::size_t> strides);

    void CalculateStrides();

    /// Builds a descriptor from any range of lengths and derives the strides through
    /// CalculateStrides().
    template <class Range>
    TensorDescriptor(DataType_t t, const Range& lens)
        : mLens(lens.begin(), lens.end()), mDataType(t)
    {
        this->CalculateStrides();
    }

    /// Builds a descriptor from explicit ranges of lengths and strides (copied as given).
    template <class Range1, class Range2>
    TensorDescriptor(DataType_t t, const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()), mDataType(t)
    {
    }

    std::size_t GetDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpace() const;

    /// Read-only view of the per-dimension lengths (added so Tensor no longer needs access to
    /// the private member).
    const std::vector<std::size_t>& GetLengths() const { return mLens; }

    /// Returns the dot product of the given indices with the stored strides.
    ///
    /// @param xs one index per dimension; the count is checked against GetDimension() by assert.
    template <class... Xs>
    std::size_t GetIndex(Xs... xs) const
    {
        assert(sizeof...(Xs) == this->GetDimension());
        // Explicit casts: brace-initializing size_t from e.g. int would be a narrowing error.
        std::initializer_list<std::size_t> is{static_cast<std::size_t>(xs)...};
        return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0});
    }

    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
    DataType_t mDataType;
};

/// Calls `f` with the elements of the tuple-like `indices` selected by `Is...` as arguments.
template <class F, class T, std::size_t... Is>
auto call_f_unpack_indices_impl(F f, T indices, std::integer_sequence<std::size_t, Is...>)
{
    return f(std::get<Is>(indices)...);
}

/// Calls `f` with every element of the tuple-like `indices` (std::array, std::tuple, ...) as a
/// separate argument and returns whatever `f` returns.
template <class F, class T>
auto call_f_unpack_indices(F f, T indices)
{
    constexpr std::size_t N = std::tuple_size<T>::value;
    using NSeq              = std::make_integer_sequence<std::size_t, N>;
    return call_f_unpack_indices_impl(f, indices, NSeq{});
}

/// std::thread that joins in its destructor instead of terminating the program.
struct joinable_thread : std::thread
{
    joinable_thread() = default;

    /// Forwards all arguments to the std::thread constructor.
    template <class... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    // The user-declared destructor suppresses the implicit move operations; they are needed to
    // keep joinable_thread objects in a std::vector.
    joinable_thread(joinable_thread&&)            = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};

/// Invokes a functor once for every multi-index of an N-dimensional box.
///
/// @tparam F  callable taking sizeof...(Xs) index arguments
/// @tparam Xs types of the per-dimension lengths (each is converted to std::size_t)
template <class F, class... Xs>
struct ParallelTensorFunctor
{
    enum ParallelMethod_t
    {
        Serial   = 0,
        Parallel = 1,
    };

    /// Tag types selecting the operator() overload.
    using SerialTag   = std::integral_constant<ParallelMethod_t, Serial>;
    using ParallelTag = std::integral_constant<ParallelMethod_t, Parallel>;

    F mF;
    static constexpr std::size_t DIM = sizeof...(Xs);
    static_assert(sizeof...(Xs) > 0, "ParallelTensorFunctor needs at least one dimension");
    std::array<std::size_t, DIM> mLens;
    std::array<std::size_t, DIM> mStrides;
    std::size_t mN1d;

    /// Stores the functor and lengths and derives packed strides (last dimension has stride 1)
    /// and the total number of multi-indices.
    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens{{static_cast<std::size_t>(xs)...}}
    {
        mStrides.back() = 1;
        // Running product of the lengths from the last dimension towards the first.
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    /// Converts a flat index in [0, mN1d) into one index per dimension using mStrides.
    std::array<std::size_t, DIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, DIM> indices;
        for(std::size_t idim = 0; idim < DIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }
        return indices;
    }

    /// Calls the functor for every multi-index on the calling thread, in flat-index order.
    void operator()(SerialTag) const
    {
        for(std::size_t i = 0; i < mN1d; ++i)
        {
            call_f_unpack_indices(mF, GetNdIndices(i));
        }
    }

    /// Splits the flat index range into contiguous chunks and runs each chunk on its own
    /// thread; returns after all threads have been joined.
    ///
    /// @param num_thread upper bound on the number of threads; 0 is treated as 1.
    void operator()(ParallelTag, std::size_t num_thread) const
    {
        if(num_thread == 0)
            num_thread = 1; // avoid dividing by zero below

        const std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        // joinable_thread joins on destruction, so leaving this scope waits for all workers.
        std::vector<joinable_thread> threads;
        threads.reserve(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            const std::size_t iw_begin = it * work_per_thread;
            if(iw_begin >= mN1d)
                break; // the remaining chunks would be empty

            const std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);

            // `this` stays valid because every worker is joined before this call returns.
            auto worker = [this, iw_begin, iw_end] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_indices(mF, GetNdIndices(iw));
                }
            };

            threads.emplace_back(worker);
        }
    }
};

/// Deduces the template arguments of ParallelTensorFunctor from a functor and its lengths.
template <class F, class... Xs>
auto generate_parallel_tensor_functor(F f, Xs... xs)
{
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
}

/// Host-side tensor: a TensorDescriptor plus a std::vector sized by
/// TensorDescriptor::GetElementSpace().
template <class T>
struct Tensor
{
    template <class X>
    Tensor(std::initializer_list<X> lens)
        : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X>
    Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X, class Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    /// Stores g(xs...) at the element addressed by the multi-index xs....
    ///
    /// NOTE(review): the iteration itself is delegated to `parallel_for`, which is not defined
    /// in this file. The call is dependent, so the name is looked up when this template is
    /// instantiated; a suitable `parallel_for(functor, lengths)` must be visible by then.
    template <class G>
    void GenerateTensorValue(G g)
    {
        parallel_for([&](auto... xs) { mData[mDesc.GetIndex(xs...)] = g(xs...); },
                     mDesc.GetLengths());
    }

    /// Bounds-checked element access by flat offset (std::vector::at).
    T& operator[](std::size_t i) { return mData.at(i); }

    const T& operator[](std::size_t i) const { return mData.at(i); }

    typename std::vector<T>::iterator begin() { return mData.begin(); }

    typename std::vector<T>::iterator end() { return mData.end(); }

    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }

    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    TensorDescriptor mDesc;
    std::vector<T> mData;
};

#if defined(TENSOR_HAS_CUDA_RUNTIME)
/// Owns a device allocation of `sz * data_sz` bytes obtained from cudaMalloc and released with
/// cudaFree in the destructor.
struct GpuMem
{
    GpuMem() = delete;

    /// @param sz      first size factor of the allocation
    /// @param data_sz second size factor of the allocation
    /// If cudaMalloc fails, mGpuBuf is left null.
    GpuMem(std::size_t sz, std::size_t data_sz) : mGpuBuf(nullptr), mSz(sz), mDataSz(data_sz)
    {
        if(cudaMalloc(&mGpuBuf, mDataSz * mSz) != cudaSuccess)
            mGpuBuf = nullptr;
    }

    // Copying would make two objects free the same device pointer.
    GpuMem(const GpuMem&)            = delete;
    GpuMem& operator=(const GpuMem&) = delete;

    GpuMem(GpuMem&& other) noexcept
        : mGpuBuf(other.mGpuBuf), mSz(other.mSz), mDataSz(other.mDataSz)
    {
        other.mGpuBuf = nullptr;
    }

    GpuMem& operator=(GpuMem&& other) noexcept
    {
        if(this != &other)
        {
            cudaFree(mGpuBuf);
            mGpuBuf       = other.mGpuBuf;
            mSz           = other.mSz;
            mDataSz       = other.mDataSz;
            other.mGpuBuf = nullptr;
        }
        return *this;
    }

    /// Copies mDataSz * mSz bytes from host pointer `p` to the device buffer.
    /// @return the cudaMemcpy status converted to int.
    int ToGpu(void* p)
    {
        return static_cast<int>(cudaMemcpy(mGpuBuf, p, mDataSz * mSz, cudaMemcpyHostToDevice));
    }

    /// Copies mDataSz * mSz bytes from the device buffer to host pointer `p`.
    /// @return the cudaMemcpy status converted to int.
    int FromGpu(void* p)
    {
        return static_cast<int>(cudaMemcpy(p, mGpuBuf, mDataSz * mSz, cudaMemcpyDeviceToHost));
    }

    ~GpuMem() { cudaFree(mGpuBuf); }

    void* mGpuBuf;
    std::size_t mSz;
    std::size_t mDataSz;
};
#endif // TENSOR_HAS_CUDA_RUNTIME

// NOTE(review): do_f1/do_f2 are called by dummy() but are neither declared nor defined in this
// file. These declarations are inferred from the call sites (four int arguments, result
// unused); confirm them against the real definitions.
void do_f1(int n, int c, int h, int w);
void do_f2(int n, int c, int h, int w);

/// Usage sketch: wraps two 4-index callbacks and runs each over its index box serially.
///
/// NOTE(review): the original passed five and three lengths to four-argument callbacks and used
/// a factory name that does not exist; four lengths each and the defined factory are assumed.
inline void dummy()
{
    auto f1 = [](int n, int c, int h, int w) { do_f1(n, c, h, w); };
    auto f2 = [](int n, int c, int h, int w) { do_f2(n, c, h, w); };

    auto par_f1 = generate_parallel_tensor_functor(f1, 3, 3, 3, 3);
    auto par_f2 = generate_parallel_tensor_functor(f2, 4, 4, 4, 4);

    // operator() returns void, so there is no result to keep.
    par_f1(decltype(par_f1)::SerialTag{});
    par_f2(decltype(par_f2)::SerialTag{});
}