#ifndef TENSOR_HPP
#define TENSOR_HPP

#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

/// Streams every element of `range` to `os`, writing `delim` between
/// consecutive elements (nothing before the first or after the last).
/// @return `os`, to allow chaining.
template <class Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{
    bool first = true;
    for(auto&& v : range)
    {
        if(first)
            first = false;
        else
            os << delim;
        os << v;
    }
    return os;
}

/// Tag values used by `DataType<T>::value`.
typedef enum
{
    Half  = 0,
    Float = 1,
} DataType_t;

/// Maps a C++ element type to its `DataType_t` tag; only specializations are defined.
template <class T>
struct DataType;

// NOTE(review): the specialization is assumed to map `float` to `Float` -- confirm.
template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};

/// Implementation detail of call_f_unpack_args: expands `args` via the index pack.
template <class F, class T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
    return f(std::get<Is>(args)...);
}

/// Calls `f` with the elements of the tuple-like `args` as individual arguments.
/// @return whatever `f` returns.
template <class F, class T>
auto call_f_unpack_args(F f, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};

    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}

/// Implementation detail of construct_f_unpack_args: expands `args` via the index pack.
template <class F, class T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
    return F(std::get<Is>(args)...);
}

/// Constructs an `F` from the elements of the tuple-like `args`.
/// The first parameter is used only to deduce `F`; its value is ignored.
template <class F, class T>
auto construct_f_unpack_args(F, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};

    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}

/// Holds per-dimension lengths and strides and converts a multi-index into a
/// linear offset. Members that are only declared here are defined out of line.
struct TensorDescriptor
{
    TensorDescriptor() = delete;

    template <class X>
    TensorDescriptor(std::vector<X> lens);

    template <class X, class Y>
    TensorDescriptor(std::vector<X> lens, std::vector<Y> strides);

    void CalculateStrides();

    /// Copies the lengths from `lens` and derives the strides via CalculateStrides().
    template <class Range>
    TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    /// Copies both lengths and strides verbatim; no strides are computed.
    template <class Range1, class Range2>
    TensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    std::size_t GetNumOfDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpace() const;

    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

    /// Returns the sum of `is[k] * stride[k]` over all supplied indices.
    /// The number of indices must equal GetNumOfDimension(); this is checked
    /// with `assert` only, so it is unchecked when NDEBUG is defined.
    template <class... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
};

/// A std::thread that joins in its destructor if it is still joinable.
struct joinable_thread : std::thread
{
    /// Forwards all arguments to the std::thread constructor.
    template <class... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};

/// Invokes a functor once for every multi-index of an N-dimensional index
/// space, splitting the flattened index range into contiguous chunks that are
/// processed by separate threads.
template <class F, class... Xs>
struct ParallelTensorFunctor
{
    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    // back() on a zero-sized std::array is undefined, so reject it at compile time.
    static_assert(NDIM > 0, "ParallelTensorFunctor needs at least one dimension");
    std::array<std::size_t, NDIM> mLens;
    std::array<std::size_t, NDIM> mStrides;
    std::size_t mN1d; ///< total number of multi-indices (product of all lengths)

    /// @param f  callable taking NDIM index arguments
    /// @param xs length of each dimension, first dimension first
    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens{{static_cast<std::size_t>(xs)...}}
    {
        // The last dimension has stride 1; each earlier stride is the running
        // product of the lengths that follow it.
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    /// Converts the flattened index `i` into its NDIM-dimensional multi-index.
    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices{};

        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }

    /// Runs the functor over the whole index space using `num_thread` threads
    /// and returns once all of them have finished.
    /// A `num_thread` of 0 is treated as 1 (it used to divide by zero).
    void operator()(std::size_t num_thread) const
    {
        const std::size_t thread_count    = std::max<std::size_t>(num_thread, 1);
        const std::size_t work_per_thread = (mN1d + thread_count - 1) / thread_count;

        // Destroying the vector joins every worker, including on exceptions.
        std::vector<joinable_thread> threads;
        threads.reserve(thread_count);

        for(std::size_t it = 0; it < thread_count; ++it)
        {
            const std::size_t iw_begin = it * work_per_thread;
            const std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);

            // Surplus threads get iw_begin >= iw_end and simply do nothing.
            auto worker = [this, iw_begin, iw_end] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_args(mF, GetNdIndices(iw));
                }
            };
            threads.emplace_back(worker);
        }
    }
};

/// Deduces the template arguments of ParallelTensorFunctor from `f` and the lengths.
template <class F, class... Xs>
auto make_ParallelTensorFunctor(F f, Xs... xs)
{
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
}

/// Pairs a TensorDescriptor with a std::vector<T> holding
/// `mDesc.GetElementSpace()` elements.
template <class T>
struct Tensor
{
    template <class X>
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X>
    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X, class Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    /// Assigns `g(i0, ..., ik)` to every element `(*this)(i0, ..., ik)`.
    /// @param g          generator called with one index per dimension
    /// @param num_thread number of threads handed to ParallelTensorFunctor
    /// @throws std::runtime_error when the tensor has a dimension count other than 1 to 4
    template <class G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        const auto& lens = mDesc.GetLengths();

        switch(mDesc.GetNumOfDimension())
        {
        case 1:
        {
            auto f = [this, &g](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(f, lens[0])(num_thread);
            break;
        }
        case 2:
        {
            auto f = [this, &g](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(f, lens[0], lens[1])(num_thread);
            break;
        }
        case 3:
        {
            auto f = [this, &g](auto i0, auto i1, auto i2) {
                (*this)(i0, i1, i2) = g(i0, i1, i2);
            };
            make_ParallelTensorFunctor(f, lens[0], lens[1], lens[2])(num_thread);
            break;
        }
        case 4:
        {
            auto f = [this, &g](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            make_ParallelTensorFunctor(f, lens[0], lens[1], lens[2], lens[3])(num_thread);
            break;
        }
        default: throw std::runtime_error("unspported dimension");
        }
    }

    /// Element access by multi-index (one index per dimension).
    template <class... Is>
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    template <class... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    // Iteration walks the underlying storage vector in order.
    typename std::vector<T>::iterator begin() { return mData.begin(); }

    typename std::vector<T>::iterator end() { return mData.end(); }

    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }

    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    TensorDescriptor mDesc;
    std::vector<T> mData;
};

/// Writes "dim N, lengths {...}, strides {...}" followed by std::endl to `os`.
/// Declared inline because it is defined in a header: without it, including
/// this file from several translation units produces duplicate definitions.
inline void ostream_TensorDescriptor(const TensorDescriptor& desc, std::ostream& os = std::cout)
{
    os << "dim " << desc.GetNumOfDimension() << ", ";

    os << "lengths {";
    LogRange(os, desc.GetLengths(), ", ");
    os << "}, ";

    os << "strides {";
    LogRange(os, desc.GetStrides(), ", ");
    os << "}" << std::endl;
}

/// Prints to std::cout the accumulated absolute difference between `ref` and
/// `result` over `ref`'s storage, and the largest single difference together
/// with the two values that produced it.
/// @throws std::runtime_error when `result` stores fewer elements than `ref`
///         (that case used to read past the end of `result.mData`).
template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
    const std::size_t count = ref.mData.size();
    if(result.mData.size() < count)
        throw std::runtime_error("check_error: result has fewer elements than ref");

    float error     = 0;
    float max_diff  = -1;
    float ref_value = 0, result_value = 0;

    for(std::size_t i = 0; i < count; ++i)
    {
        const double abs_diff =
            std::abs(static_cast<double>(ref.mData[i]) - static_cast<double>(result.mData[i]));
        error += abs_diff;

        const float diff = abs_diff;
        if(max_diff < diff)
        {
            max_diff     = diff;
            ref_value    = ref.mData[i];
            result_value = result.mData[i];
        }
    }

    std::cout << "error: " << error << std::endl;
    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value
              << std::endl;
}

#endif