#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <initializer_list>
#include <numeric>
#include <stdexcept>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

#include "cuda_runtime.h"
#include "helper_cuda.h"

typedef enum
{
    Half  = 0,
    Float = 1,
} DataType_t;

// Maps an element type to its DataType_t tag. The template arguments below are
// reconstructed; only the float specialization is present in the source, and a
// half specialization would follow the same pattern.
template <typename T>
struct DataType;

template <>
struct DataType<float> : std::integral_constant<DataType_t, Float>
{
};

struct TensorDescriptor
{
    TensorDescriptor() = delete;
    TensorDescriptor(DataType_t t, std::initializer_list<std::size_t> lens);
    TensorDescriptor(DataType_t t,
                     std::initializer_list<std::size_t> lens,
                     std::initializer_list<std::size_t> strides);
    TensorDescriptor(DataType_t t, std::vector<std::size_t> lens, std::vector<std::size_t> strides);

    void CalculateStrides();

    template <typename Range>
    TensorDescriptor(DataType_t t, const Range& lens) : mLens(lens.begin(), lens.end()), mDataType(t)
    {
        this->CalculateStrides();
    }

    template <typename Range1, typename Range2>
    TensorDescriptor(DataType_t t, const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()), mDataType(t)
    {
    }

    std::size_t GetDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpace() const;

    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

    // Dot product of an index tuple with the strides; the casts avoid narrowing
    // errors when callers pass signed indices.
    template <typename... Xs>
    std::size_t Get1dIndex(Xs... xs) const
    {
        assert(sizeof...(Xs) == this->GetDimension());
        std::initializer_list<std::size_t> is{static_cast<std::size_t>(xs)...};
        return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0});
    }

    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
    DataType_t mDataType;
};

// A std::thread that joins on destruction, so a vector of these acts as a barrier.
struct joinable_thread : std::thread
{
    template <typename... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};

template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::integer_sequence<std::size_t, Is...>)
{
    return f(std::get<Is>(args)...);
}

// Calls f with the elements of the tuple-like args expanded into an argument list.
template <typename F, typename T>
auto call_f_unpack_args(F f, T args)
{
    static constexpr std::size_t N = std::tuple_size<T>::value;
    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}

template <typename F, typename T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::integer_sequence<std::size_t, Is...>)
{
    return F(std::get<Is>(args)...);
}

// Constructs an F from the elements of the tuple-like args.
template <typename F, typename T>
auto construct_f_unpack_args(F, T args)
{
    static constexpr std::size_t N = std::tuple_size<T>::value;
    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}

template <typename F, typename... Xs>
struct ParallelTensorFunctor
{
    enum ParallelMethod_t
    {
        Serial   = 0,
        Parallel = 1,
    };

    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;
    std::array<std::size_t, NDIM> mStrides;
    std::size_t mN1d;

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        // Row-major strides: the rightmost dimension is contiguous.
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;

        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }

    // Convenience overload so call sites can invoke the functor with no
    // arguments, as Tensor::GenerateTensorValue does; defaults to the serial sweep.
    void operator()() { (*this)(std::integral_constant<ParallelMethod_t, Serial>{}); }

    void operator()(std::integral_constant<ParallelMethod_t, Serial>)
    {
        for(std::size_t i = 0; i < mN1d; ++i)
        {
            call_f_unpack_args(mF, GetNdIndices(i));
        }
    }

    void operator()(std::integral_constant<ParallelMethod_t, Parallel>, std::size_t num_thread)
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);

            auto f = [=] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_args(mF, GetNdIndices(iw));
                }
            };
            threads[it] = joinable_thread(f);
        }
        // The joinable_thread destructors join here, so all work is done on return.
    }
};
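// Example (hypothetical, not from the original source): sweeping a 2x3 index
// space with ParallelTensorFunctor. The functor linearizes the space into
// mN1d = 6 flat indices, maps each back to (i0, i1) with GetNdIndices, and
// calls f on it; the Serial tag runs on the calling thread, while the
// Parallel tag splits the flat range into contiguous per-thread chunks.
inline void example_parallel_tensor_functor()
{
    auto f = [](std::size_t i0, std::size_t i1) { std::printf("visiting (%zu, %zu)\n", i0, i1); };

    ParallelTensorFunctor functor(f, 2, 3);

    using Functor = decltype(functor);

    // Serial sweep over all 6 indices.
    functor(std::integral_constant<Functor::ParallelMethod_t, Functor::Serial>{});

    // The same sweep split across 2 worker threads.
    functor(std::integral_constant<Functor::ParallelMethod_t, Functor::Parallel>{}, 2);
}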
template <typename T>
struct Tensor
{
    template <typename X>
    Tensor(std::initializer_list<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    template <typename X>
    Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
    {
    }

    template <typename X, typename Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    template <typename G>
    void GenerateTensorValue(G g)
    {
        // A variadic version would be
        //   ParallelTensorFunctor([&](Xs... xs) { mData[mDesc.Get1dIndex(xs...)] = g(xs...); },
        //                         mDesc.mLens)();
        // but the lengths are only known at run time, so dispatch on the dimension.
        switch(mDesc.GetDimension())
        {
        case 1: {
            ParallelTensorFunctor([&](auto i) { mData[mDesc.Get1dIndex(i)] = g(i); },
                                  mDesc.GetLengths()[0])();
            break;
        }
        case 2: {
            ParallelTensorFunctor(
                [&](auto i0, auto i1) { mData[mDesc.Get1dIndex(i0, i1)] = g(i0, i1); },
                mDesc.GetLengths()[0],
                mDesc.GetLengths()[1])();
            break;
        }
        case 3: {
            ParallelTensorFunctor(
                [&](auto i0, auto i1, auto i2) {
                    mData[mDesc.Get1dIndex(i0, i1, i2)] = g(i0, i1, i2);
                },
                mDesc.GetLengths()[0],
                mDesc.GetLengths()[1],
                mDesc.GetLengths()[2])();
            break;
        }
        case 4: {
            ParallelTensorFunctor(
                [&](auto i0, auto i1, auto i2, auto i3) {
                    mData[mDesc.Get1dIndex(i0, i1, i2, i3)] = g(i0, i1, i2, i3);
                },
                mDesc.GetLengths()[0],
                mDesc.GetLengths()[1],
                mDesc.GetLengths()[2],
                mDesc.GetLengths()[3])();
            break;
        }
        default: throw std::runtime_error("unsupported dimension");
        }
    }

    T& operator[](std::size_t i) { return mData.at(i); }

    const T& operator[](std::size_t i) const { return mData.at(i); }

    typename std::vector<T>::iterator begin() { return mData.begin(); }

    typename std::vector<T>::iterator end() { return mData.end(); }

    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }

    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    TensorDescriptor mDesc;
    std::vector<T> mData;
};

struct GpuMem
{
    GpuMem() = delete;

    GpuMem(std::size_t size, std::size_t data_size) : mSize(size), mDataSize(data_size)
    {
        // checkCudaErrors (from helper_cuda.h) aborts with a message if the
        // allocation fails, instead of silently ignoring the error code.
        checkCudaErrors(cudaMalloc(static_cast<void**>(&mGpuBuf), mDataSize * mSize));
    }

    int ToGpu(void* p)
    {
        return static_cast<int>(cudaMemcpy(mGpuBuf, p, mDataSize * mSize, cudaMemcpyHostToDevice));
    }

    int FromGpu(void* p)
    {
        return static_cast<int>(cudaMemcpy(p, mGpuBuf, mDataSize * mSize, cudaMemcpyDeviceToHost));
    }

    ~GpuMem() { cudaFree(mGpuBuf); }

    void* mGpuBuf;
    std::size_t mSize;
    std::size_t mDataSize;
};
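// Example (hypothetical, not from the original source): filling a 4x3 float
// tensor on the host and round-tripping it through device memory. This assumes
// the TensorDescriptor member functions declared above are defined in an
// accompanying source file.
inline void example_tensor_roundtrip()
{
    Tensor<float> tensor({4, 3});

    // Fill with a function of the indices: tensor(i, j) = 10 * i + j.
    tensor.GenerateTensorValue([](auto i, auto j) { return 10.0f * i + j; });

    // Allocate element_count * sizeof(element) bytes on the device.
    GpuMem device_buf(tensor.mDesc.GetElementSpace(), sizeof(float));

    device_buf.ToGpu(tensor.mData.data());

    // ... launch kernels that read or write device_buf.mGpuBuf here ...

    device_buf.FromGpu(tensor.mData.data());
}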