Commit 19e73bbe authored by Yan Yan
Browse files

format code with clang-format, better c++ code

parent c336139f
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -16,14 +16,11 @@
#define POINTPILLARS_SCATTER_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv
{
namespace functor
{
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct PointPillarScatter
{
void operator()(const Device& d, tv::TensorView<T> canvas,
struct PointPillarScatter {
void operator()(const Device &d, tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors);
};
......
......@@ -16,8 +16,8 @@
#define PILLAR_SCATTER_OP_H_
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
......@@ -42,9 +42,10 @@ torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
features.options());
TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
functor::PointPillarScatter<tv::GPU, T, int> ftor;
ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas), tv::torch2tv<const T>(features.squeeze()),
ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
tv::torch2tv<const T>(features.squeeze()),
tv::torch2tv<const T>(coors.squeeze()));
#endif
return canvas;
......
......@@ -29,7 +29,8 @@ using namespace pybind11::literals;
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> voxel_point_mask, py::array_t<int> coors,
py::array_t<DType> voxel_point_mask,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
......@@ -94,14 +95,12 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
py::array_t<DType> voxel_point_mask, py::array_t<DType> voxels,
py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
int points_to_voxel_3d_np_mean(
py::array_t<DType> points, py::array_t<DType> voxel_point_mask,
py::array_t<DType> voxels, py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
......@@ -174,8 +173,8 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points,
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask, py::array_t<DType> mins,
py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
......
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -16,14 +16,14 @@
#define SPARSE_POOL_OP_H_
#include <spconv/maxpool.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
template <typename T>
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct) {
torch::Tensor indiceNum, int64_t numAct) {
auto device = features.device().type();
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
......@@ -43,8 +43,8 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
forwardFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
}
#ifdef SPCONV_CUDA
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor;
forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
......@@ -53,7 +53,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
TV_CHECK_CUDA_ERR();
}
#endif
else{
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalTime += timer.report() / 1000.0;
......@@ -63,17 +63,17 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
}
template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum) {
torch::Tensor
indiceMaxPoolBackward(torch::Tensor features, torch::Tensor outFeatures,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum) {
auto device = features.device().type();
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
auto kernelVolume = indicePairs.size(0);
auto kernelVolume = indicePairs.size(0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
......@@ -85,8 +85,8 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
tv::torch2tv<const T>(features),
tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
}
#ifdef SPCONV_CUDA
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor;
backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures),
......@@ -96,10 +96,9 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
TV_CHECK_CUDA_ERR();
}
#endif
else{
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
}
return inputGrad;
}
......
......@@ -14,7 +14,7 @@
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/kernel_utils.h>
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace spconv {
......
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -16,23 +16,21 @@
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv
{
namespace functor
{
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct SparseGatherFunctor
{
void operator()(const Device& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size);
struct SparseGatherFunctor {
void operator()(const Device &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename T, typename Index>
struct SparseScatterAddFunctor
{
void operator()(const Device& d, tv::TensorView<T> out_features,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
int size, bool stable=false);
struct SparseScatterAddFunctor {
void operator()(const Device &d, tv::TensorView<T> out_features,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size,
bool stable = false);
};
} // namespace functor
} // namespace spconv
......
......@@ -17,8 +17,8 @@
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
......@@ -101,7 +101,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
......@@ -149,7 +149,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
......@@ -269,7 +269,7 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
......@@ -299,7 +299,7 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
transpose, true);
gridOut.fill_(-1);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
......
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <sstream>
#include <stdexcept>
#ifdef TV_USE_STACKTRACE
#if defined(WIN32) || defined(_WIN32) || \
defined(__WIN32) && !defined(__CYGWIN__)
#define BOOST_STACKTRACE_USE_WINDBG
#else
// require linking with -ldl and -lbacktrace in linux
#define BOOST_STACKTRACE_USE_BACKTRACE
#endif
#include <boost/stacktrace.hpp>
#endif
namespace tv {

/// Base case of the variadic printer: streams the last (or only) value into
/// `ss` with `operator<<` and adds no trailing separator.
template <class SStream, class T> void sstream_print(SStream &ss, T val) {
  ss << val;
}

/// Recursive case: streams `val` followed by one space, then forwards the
/// remaining arguments, so all values end up space-separated in `ss`.
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
  ss << val << " ";
  sstream_print(ss, args...);
}

/// Formats all arguments space-separated into a temporary string stream and
/// writes the result to `std::cout` as one line (flushed by `std::endl`).
template <class... TArgs> void ssprint(TArgs... args) {
  std::stringstream ss;
  sstream_print(ss, args...);
  std::cout << ss.str() << std::endl;
}

// TV_BACKTRACE_PRINT(ss): when TV_USE_STACKTRACE is defined, appends a newline
// and the current boost stack trace to the stream `ss`; otherwise expands to
// nothing.
#ifdef TV_USE_STACKTRACE
#define TV_BACKTRACE_PRINT(ss)                                                 \
  ss << std::endl << boost::stacktrace::stacktrace();
#else
#define TV_BACKTRACE_PRINT(ss)
#endif

// NOTE: the macros below expand to a braced block (not do/while(0)), exactly
// as before, so call sites written with or without a trailing semicolon keep
// compiling. The macro-local stream is named `tv_macro_ss_` because names
// containing a double underscore are reserved for the implementation.

// TV_THROW_RT_ERR(args...): unconditionally throws std::runtime_error whose
// message is "<file> <line>\n" followed by the space-separated arguments
// (and a stack trace when enabled). At least one argument is required.
#define TV_THROW_RT_ERR(...)                                                   \
  {                                                                            \
    std::stringstream tv_macro_ss_;                                            \
    tv_macro_ss_ << __FILE__ << " " << __LINE__ << "\n";                       \
    tv::sstream_print(tv_macro_ss_, __VA_ARGS__);                              \
    TV_BACKTRACE_PRINT(tv_macro_ss_);                                          \
    throw std::runtime_error(tv_macro_ss_.str());                              \
  }

// TV_THROW_INVALID_ARG(args...): same as TV_THROW_RT_ERR but throws
// std::invalid_argument.
#define TV_THROW_INVALID_ARG(...)                                              \
  {                                                                            \
    std::stringstream tv_macro_ss_;                                            \
    tv_macro_ss_ << __FILE__ << " " << __LINE__ << "\n";                       \
    tv::sstream_print(tv_macro_ss_, __VA_ARGS__);                              \
    TV_BACKTRACE_PRINT(tv_macro_ss_);                                          \
    throw std::invalid_argument(tv_macro_ss_.str());                           \
  }

// TV_ASSERT_RT_ERR(expr, args...): evaluates `expr` once; when it is false,
// throws std::runtime_error with the message
// "<file> <line>\n<expr text> assert failed. <args...>".
#define TV_ASSERT_RT_ERR(expr, ...)                                            \
  {                                                                            \
    if (!(expr)) {                                                             \
      std::stringstream tv_macro_ss_;                                          \
      tv_macro_ss_ << __FILE__ << " " << __LINE__ << "\n";                     \
      tv_macro_ss_ << #expr << " assert failed. ";                             \
      tv::sstream_print(tv_macro_ss_, __VA_ARGS__);                            \
      TV_BACKTRACE_PRINT(tv_macro_ss_);                                        \
      throw std::runtime_error(tv_macro_ss_.str());                            \
    }                                                                          \
  }

// TV_ASSERT_INVALID_ARG(expr, args...): same as TV_ASSERT_RT_ERR but throws
// std::invalid_argument.
#define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
  {                                                                            \
    if (!(expr)) {                                                             \
      std::stringstream tv_macro_ss_;                                          \
      tv_macro_ss_ << __FILE__ << " " << __LINE__ << "\n";                     \
      tv_macro_ss_ << #expr << " assert failed. ";                             \
      tv::sstream_print(tv_macro_ss_, __VA_ARGS__);                            \
      TV_BACKTRACE_PRINT(tv_macro_ss_);                                        \
      throw std::invalid_argument(tv_macro_ss_.str());                         \
    }                                                                          \
  }

} // namespace tv
\ No newline at end of file
#pragma once
// from pytorch.aten
#include "tensorview.h"
#include <type_traits>
namespace tv {
namespace cuda {
// Evaluates (a + b - 1) / b in the operands' common type and returns the
// result converted to int. For positive integers this is a / b rounded up.
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) {
  const auto padded = a + b - 1;
  return padded / b;
}
// Largest thread count per block this header hands out. Using 1024 threads
// per block requires CUDA sm_2x or above.
constexpr int CUDA_NUM_THREADS = 1024;
// Threads per block for N work items: CUDA_NUM_THREADS when N exceeds it,
// otherwise N rounded up to a multiple of 32 (presumably the warp size —
// confirm).
inline int getNumThreads(const int N) {
  return N > CUDA_NUM_THREADS ? CUDA_NUM_THREADS : DivUp(N, 32) * 32;
}
// Number of blocks needed to cover N work items when each block runs
// getNumThreads(N) threads. N must be positive; otherwise TV_ASSERT_RT_ERR
// raises an error.
inline int getBlocks(const int N) {
  TV_ASSERT_RT_ERR(N > 0,
                   "CUDA kernel launch blocks must be positive, but got N=", N);
  const int threadsPerBlock = getNumThreads(N);
  return DivUp(N, threadsPerBlock);
}
} // namespace cuda
} // namespace tv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensor.h"
#include "tensorview.h"
#include <eigen3/Eigen/Dense>
namespace tv {
// Wraps a 1-D or 2-D TensorView in a row-major Eigen::Map without copying;
// the map refers to view.data().
//
// A 2-D view maps to a (dim(0) x dim(1)) matrix. A 1-D view maps to a single
// row of dim(0) elements. Previously the 1-D case read view.dim(1), which is
// out of range for a 1-D view.
//
// Template parameters Row / Col may fix the matrix size at compile time; when
// they are not Eigen::Dynamic the runtime extents must match them.
//
// Raises via TV_ASSERT_INVALID_ARG when the view is not 1-D or 2-D, or when a
// fixed Row / Col does not match the view's extents.
template <typename T, int Row = Eigen::Dynamic, int Col = Eigen::Dynamic>
Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>>
tv2eigen(TensorView<T> view) {
  TV_ASSERT_INVALID_ARG(view.ndim() <= 2 && view.ndim() > 0, "error");
  // Resolve the logical extents once, so that the fixed-size checks and the
  // map construction agree for both the 1-D and the 2-D case.
  const bool is_matrix = view.ndim() == 2;
  const int rows = is_matrix ? view.dim(0) : 1;
  const int cols = is_matrix ? view.dim(1) : view.dim(0);
  if (Row != Eigen::Dynamic) {
    TV_ASSERT_INVALID_ARG(rows == Row, "error");
  }
  if (Col != Eigen::Dynamic) {
    TV_ASSERT_INVALID_ARG(cols == Col, "error");
  }
  Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>> eigen_map(
      view.data(), rows, cols);
  return eigen_map;
}
} // namespace tv
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv
{
namespace launch
{
// Evaluates (a + b - 1) / b in the operands' common type and returns the
// result converted to int. For positive integers this is a / b rounded up.
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) {
  const auto padded = a + b - 1;
  return padded / b;
}
// Fixed number of threads per block used by getBlocks. Using 1024 threads per
// block requires CUDA sm_2x or above.
constexpr int CUDA_NUM_THREADS = 1024;
// Number of CUDA_NUM_THREADS-sized blocks needed to cover N work items.
// N must be positive; otherwise TV_ASSERT_RT_ERR raises an error.
inline int getBlocks(const int N) {
  TV_ASSERT_RT_ERR(N > 0,
                   "CUDA kernel launch blocks must be positive, but got N=", N);
  const int blockCount = DivUp(N, CUDA_NUM_THREADS);
  return blockCount;
}
} // namespace launch
} // namespace tv
\ No newline at end of file
#pragma once
// from tensorflow
namespace tv
{
namespace detail
{
namespace tv {
namespace detail {
template <typename T>
class KernelLoop
{
struct Iterator
{
__forceinline__ __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
template <typename T> class KernelLoop {
struct Iterator {
__forceinline__ __device__ Iterator(T index, T delta)
: index_(index), delta_(delta) {}
__forceinline__ __device__ T operator*() const { return index_; }
__forceinline__ __device__ Iterator &operator++()
{
__forceinline__ __device__ Iterator &operator++() {
index_ += delta_;
return *this;
}
__forceinline__ __device__ bool operator!=(const Iterator &other) const
{
__forceinline__ __device__ bool operator!=(const Iterator &other) const {
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
// Anything past an end iterator (delta_ == 0) is equal.
// In range-based for loops, this optimizes to 'return less'.
if (!other.delta_)
{
if (!other.delta_) {
return less;
}
if (!delta_)
{
if (!delta_) {
return greater;
}
return less || greater;
......@@ -43,7 +35,9 @@ public:
__forceinline__ __device__ KernelLoop(T begin, T delta, T end)
: begin_(begin), delta_(delta), end_(end) {}
__forceinline__ __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
__forceinline__ __device__ Iterator begin() const {
return Iterator{begin_, delta_};
}
__forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
private:
......@@ -53,29 +47,26 @@ private:
};
} // namespace detail
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count)
{
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
gridDim.x * blockDim.x * NumILP, count);
gridDim.x * blockDim.x * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count)
{
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
gridDim.y * blockDim.y * NumILP, count);
gridDim.y * blockDim.y * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count)
{
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
gridDim.z * blockDim.z * NumILP, count);
gridDim.z * blockDim.z * NumILP, count);
}
} // namespace tv
\ No newline at end of file
......@@ -3,7 +3,7 @@
#include <type_traits>
#include <utility>
namespace spconv {
namespace tv {
template <class... T> struct mp_list {};
template <class T, T... I>
......@@ -11,9 +11,10 @@ using mp_list_c = mp_list<std::integral_constant<T, I>...>;
namespace detail {
template <class... T, class F>
constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
template <class... Ts, class F>
constexpr F mp_for_each_impl(mp_list<Ts...>, F &&f) {
return (void)(std::initializer_list<int>{(f(Ts()), 0)...}),
std::forward<F>(f);
}
template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
......@@ -42,6 +43,6 @@ using mp_rename = typename detail::mp_rename_impl<A, B>::type;
template <class L, class F> constexpr F mp_for_each(F &&f) {
return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
}
} // namespace spconv
} // namespace tv
#endif
\ No newline at end of file
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {

// Return types shared by the sizeof-based detection traits in this namespace.
// `yes` and `no` have different sizes, so the overload selected for a probe
// call can be read back with sizeof.
struct sfinae_base {
  using yes = char;
  using no = char[2];
};

// Trait: `value` is true when T declares a nested type `const_iterator`.
template <typename T> struct has_const_iterator : private sfinae_base {
private:
  // Viable only when U::const_iterator names a type.
  template <typename U> static yes &test(typename U::const_iterator *);
  // Fallback, selected when the overload above drops out.
  template <typename U> static no &test(...);

public:
  using type = T;
  static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
};

// Trait: `beg_value` / `end_value` are true when T has a const member
// function begin() / end() that takes no arguments and returns
// T::const_iterator.
template <typename T> struct has_begin_end : private sfinae_base {
private:
  // Pointer to a const, argument-less member function returning
  // C::const_iterator.
  template <typename C>
  using const_iter_getter = typename C::const_iterator (C::*)() const;

  // Viable only when &C::begin can be static_cast to const_iter_getter<C>.
  template <typename C>
  static yes &
  f(typename std::enable_if<
      std::is_same<decltype(static_cast<const_iter_getter<C>>(&C::begin)),
                   const_iter_getter<C>>::value>::type *);
  template <typename C> static no &f(...);

  // Same probe for &C::end.
  template <typename C>
  static yes &
  g(typename std::enable_if<
      std::is_same<decltype(static_cast<const_iter_getter<C>>(&C::end)),
                   const_iter_getter<C>>::value>::type *);
  template <typename C> static no &g(...);

public:
  static const bool beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
  static const bool end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};

} // namespace detail
// Aggregate of the three delimiter strings for one character type: text
// written before the elements, between consecutive elements, and after them.
// Member order matters because instances are brace-initialised positionally.
template <typename TChar> struct delimiters_values {
  using char_type = TChar;
  TChar const *prefix;
  TChar const *delimiter;
  TChar const *postfix;
};
// Maps a (container type, character type) pair to its delimiter strings via
// the static member `values`. This primary template only declares `values`;
// the definitions visible in this file belong to the specializations.
template <typename T, typename TChar> struct delimiters {
  using type = delimiters_values<TChar>;
  static const delimiters_values<TChar> values;
};
// Functor that prints a container to a stream. Use it directly when you want
// to specify a non-default delimiters type. The printing logic can be
// customized by specializing the nested `printer` template.
template <typename T, typename TChar = char,
          typename TCharTraits = ::std::char_traits<TChar>,
          typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
  using delimiters_type = TDelimiters;
  using ostream_type = std::basic_ostream<TChar, TCharTraits>;

  // Default body printer: streams every element of `c`, writing the
  // delimiter (when it is non-null) between consecutive elements.
  template <typename U> struct printer {
    static void print_body(const U &c, ostream_type &stream) {
      // Unqualified calls so that both std::begin/end and ADL overloads work.
      using std::begin;
      using std::end;

      auto cursor = begin(c);
      const auto stop = end(c);
      if (cursor == stop) {
        return;
      }

      stream << *cursor;
      for (++cursor; cursor != stop; ++cursor) {
        if (delimiters_type::values.delimiter != nullptr) {
          stream << delimiters_type::values.delimiter;
        }
        stream << *cursor;
      }
    }
  };

  // Keeps a reference to `container`; the container must outlive the helper.
  print_container_helper(const T &container) : container_(container) {}

  // Writes prefix, body and postfix to `stream`; a null prefix or postfix is
  // skipped.
  inline void operator()(ostream_type &stream) const {
    const auto &delims = delimiters_type::values;
    if (delims.prefix != nullptr) {
      stream << delims.prefix;
    }
    printer<T>::print_body(container_, stream);
    if (delims.postfix != nullptr) {
      stream << delims.postfix;
    }
  }

private:
  const T &container_;
};
// Specialization for pairs
// Replaces the iterator-based body printer of print_container_helper when the
// printed type is std::pair<T1, T2>: streams `first`, then the delimiter (if
// it is non-null), then `second`.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
// Same stream type as the enclosing helper.
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
// A null delimiter means "print nothing between the two members".
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
// Replaces the iterator-based body printer of print_container_helper when the
// printed type is std::tuple<Args...>. Elements are visited by recursive
// overload dispatch on the tag type Int<I>, starting at index 0.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
// Same stream type as the enclosing helper.
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
// Empty tag type carrying an element index; used only to select overloads.
template <std::size_t I> struct Int {};
// Entry point: begins the recursion at element 0.
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
// Terminator: index == number of elements, nothing left to print.
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
// First element: printed without a leading delimiter. For an empty tuple the
// tag parameter becomes std::nullptr_t so this overload does not collide with
// the terminator above (both would otherwise take Int<0>).
static void
tuple_print(const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
// Remaining elements: delimiter (if non-null), then element N, then recurse
// to N + 1.
template <std::size_t N>
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
// Stream insertion for print_container_helper: invokes the helper on `stream`
// and returns `stream` so that further insertions can be chained.
template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &
operator<<(std::basic_ostream<TChar, TCharTraits> &stream,
           const print_container_helper<T, TChar, TCharTraits, TDelimiters>
               &helper) {
  helper(stream);
  return stream;
}
// Primary is_container trait: true when T has a nested const_iterator type
// and const begin()/end() members returning it. Specialize it to derive from
// std::true_type for any further type that should be printed as a container.
template <typename T>
struct is_container
    : std::integral_constant<bool, detail::has_const_iterator<T>::value &&
                                       detail::has_begin_end<T>::beg_value &&
                                       detail::has_begin_end<T>::end_value> {};

// Built-in arrays count as containers...
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};

// ...except arrays of char, which are excluded (presumably so that string
// literals keep using the stream's own operator<< — confirm).
template <std::size_t N> struct is_container<char[N]> : std::false_type {};

// Types handled explicitly rather than through the detection above.
template <typename T> struct is_container<std::valarray<T>> : std::true_type {};

template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};

template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters for any type T: elements are wrapped in square brackets
// and separated by ", ".

// Narrow-character streams.
template <typename T> struct delimiters<T, char> {
  static const delimiters_values<char> values;
};

template <typename T>
const delimiters_values<char> delimiters<T, char>::values{"[", ", ", "]"};

// Wide-character streams.
template <typename T> struct delimiters<T, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values{L"[", L", ",
                                                                L"]"};
// Delimiters for (multi)set and unordered_(multi)set: elements are wrapped in
// curly braces and separated by ", ". Each container has one specialization
// per character type (char, wchar_t).

// std::set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
    delimiters<::std::set<T, TComp, TAllocator>, char>::values{"{", ", ", "}"};

template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values{L"{", L", ",
                                                                  L"}"};

// std::multiset
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
    delimiters<::std::multiset<T, TComp, TAllocator>, char>::values{"{", ", ",
                                                                    "}"};

template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values{
        L"{", L", ", L"}"};

// std::unordered_set
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
    ::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values{
    "{", ", ", "}"};

template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
    ::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values{
    L"{", L", ", L"}"};

// std::unordered_multiset
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
                  char> {
  static const delimiters_values<char> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
    ::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values{
    "{", ", ", "}"};

template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
                  wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
    ::std::unordered_multiset<T, THash, TEqual, TAllocator>, wchar_t>::values{
    L"{", L", ", L"}"};
// Delimiters for pair and tuple: members are wrapped in parentheses and
// separated by ", ". Each type has one specialization per character type.

// std::pair
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, char> {
  static const delimiters_values<char> values;
};

template <typename T1, typename T2>
const delimiters_values<char> delimiters<::std::pair<T1, T2>, char>::values{
    "(", ", ", ")"};

template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T1, typename T2>
const delimiters_values<wchar_t>
    delimiters<::std::pair<T1, T2>, wchar_t>::values{L"(", L", ", L")"};

// std::tuple
template <typename... Args> struct delimiters<::std::tuple<Args...>, char> {
  static const delimiters_values<char> values;
};

template <typename... Args>
const delimiters_values<char> delimiters<::std::tuple<Args...>, char>::values{
    "(", ", ", ")"};

template <typename... Args> struct delimiters<::std::tuple<Args...>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename... Args>
const delimiters_values<wchar_t>
    delimiters<::std::tuple<Args...>, wchar_t>::values{L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
//
// custom_delims_base is the abstract interface behind that helper: one
// virtual `stream` overload per supported stream type, each returning the
// stream it was given.
struct custom_delims_base {
  virtual ~custom_delims_base() = default;

  virtual ::std::ostream &stream(::std::ostream &) = 0;
  virtual ::std::wostream &stream(::std::wostream &) = 0;
};
// Concrete custom_delims_base for a container of type T printed with the
// delimiter set `Delims`. Stores a reference to the container, so the
// container must outlive the wrapper.
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
  custom_delims_wrapper(const T &t_) : t(t_) {}

  // Prints the container to a narrow stream using Delims. Marked `override`
  // so a signature drift from the base interface becomes a compile error.
  std::ostream &stream(std::ostream &s) override {
    return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
               t);
  }

  // Prints the container to a wide stream using Delims.
  std::wostream &stream(std::wostream &s) override {
    return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
                                       Delims>(t);
  }

private:
  const T &t;
};
// Value wrapper that erases the container type: it can be constructed from
// any container and owns a heap-allocated custom_delims_wrapper for it, which
// prints that container with the delimiter set `Delims`.
template <typename Delims> struct custom_delims {
  // Owning pointer to the type-erased printer.
  std::unique_ptr<custom_delims_base> base;

  // Implicit on purpose, so `custom_delims<D>(x)` and conversions both work.
  template <typename Container>
  custom_delims(const Container &c)
      : base(new custom_delims_wrapper<Container, Delims>(c)) {}
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &
operator<<(std::basic_ostream<TChar, TCharTraits> &s,
const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// Non-owning view over a C-style array given as pointer plus element count,
// exposing begin()/end() so it can be iterated like a container.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T> struct array_wrapper_n {
  using const_iterator = const T *;
  using value_type = T;

  array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}

  // Pointer to the first element.
  const_iterator begin() const { return _array; }

  // Pointer one past the last element.
  const_iterator end() const { return begin() + _n; }

private:
  const T *const _array;
  size_t _n;
};
// Iterable view over a single bucket of a hash-based container that offers
// local (per-bucket) iterators. Holds a reference to the container, which
// must outlive the wrapper.
// Usage: std::cout << bucket_print(m, 4) << std::endl; (prints the bucket
// with index 4, i.e. the fifth bucket, of container m).
template <typename T> struct bucket_print_wrapper {
  using const_iterator = typename T::const_local_iterator;
  using size_type = typename T::size_type;

  bucket_print_wrapper(const T &m, size_type bucket)
      : container_(m), bucket_index_(bucket) {}

  // First element of the selected bucket.
  const_iterator begin() const { return container_.cbegin(bucket_index_); }

  // Past-the-end iterator of the selected bucket.
  const_iterator end() const { return container_.cend(bucket_index_); }

private:
  const T &container_;
  const size_type bucket_index_;
};
} // namespace pretty_print
// Global convenience accessors for the wrappers above.

// Wraps the first n elements starting at a so they can be streamed like a
// container.
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
                                                           size_t n) {
  using wrapper_t = pretty_print::array_wrapper_n<T>;
  return wrapper_t(a, n);
}
// Wraps bucket n of the hash-based container m so that it can be streamed
// like a container.
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
                                                   typename T::size_type n) {
  using wrapper_t = pretty_print::bucket_print_wrapper<T>;
  return wrapper_t(m, n);
}
// Main entry point: a stream-insertion overload placed in namespace std so
// that it is found for standard containers. (Open question from the original
// author: can we do better than injecting into std?)
namespace std {
// Prints any type detected by pretty_print::is_container using the default
// delimiters; other types are excluded from overload resolution.
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
                          basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
  using printer_t =
      ::pretty_print::print_container_helper<T, TChar, TCharTraits>;
  return stream << printer_t(container);
}
} // namespace std
#endif // H_PRETTY_PRINT
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensor.h"
#include "tensorview.h"
#include <algorithm>
#include <array>
#include <iostream>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
namespace py = pybind11;
namespace tv {
template <typename Tarr> bool is_c_stype(const Tarr &arr) {
return bool(arr.flags() & py::array::c_style);
}
// Creates a mutable TensorView over the buffer of a typed numpy array (no
// copy). The array must be C-contiguous; when Rank is non-negative the
// array's number of dimensions must equal Rank.
template <typename T, int Rank = -1>
TensorView<T, Rank> arrayt2tv(py::array_t<T> arr) {
  TV_ASSERT_INVALID_ARG(is_c_stype(arr), "array must be c-contiguous array");
  Shape dims;
  const auto ndim = arr.ndim();
  for (int axis = 0; axis < ndim; ++axis) {
    dims.push_back(arr.shape(axis));
  }
  if (Rank >= 0) {
    TV_ASSERT_INVALID_ARG(dims.ndim() == Rank, "error");
  }
  return TensorView<T, Rank>(arr.mutable_data(), dims);
}
// Creates a read-only TensorView over the buffer of a typed numpy array (no
// copy). The array must be C-contiguous; when Rank is non-negative the
// array's number of dimensions must equal Rank.
//
// The return type carries Rank so that it matches the view actually
// constructed in the body (previously the declared type dropped Rank, which
// only agreed with the body for the default Rank = -1).
template <typename T, int Rank = -1>
TensorView<const T, Rank> carrayt2tv(py::array_t<T> arr) {
  TV_ASSERT_INVALID_ARG(is_c_stype(arr), "array must be c-contiguous array");
  Shape shape;
  for (int i = 0; i < arr.ndim(); ++i) {
    shape.push_back(arr.shape(i));
  }
  if (Rank >= 0) {
    TV_ASSERT_INVALID_ARG(shape.ndim() == Rank, "error");
  }
  return TensorView<const T, Rank>(arr.data(), shape);
}
// Maps the dtype of a numpy array to the corresponding tv::DType, based on
// the dtype kind character ('b', 'i', 'u', 'f') and the item size in bytes.
// Throws via TV_THROW_RT_ERR for any unsupported kind/size combination.
template <typename Tarr> tv::DType get_array_tv_dtype(const Tarr &arr) {
  const auto kind = arr.dtype().kind();
  const auto itemsize = arr.itemsize();
  switch (kind) {
  case 'b':
    return tv::bool_;
  case 'i':
    switch (itemsize) {
    case 1:
      return tv::int8;
    case 2:
      return tv::int16;
    case 4:
      return tv::int32;
    case 8:
      return tv::int64;
    default:
      break;
    }
    // Explicit break: an unsupported signed size must not fall through into
    // the unsigned/float cases.
    break;
  case 'u':
    switch (itemsize) {
    case 1:
      return tv::uint8;
    case 2:
      return tv::uint16;
    case 4:
      return tv::uint32;
    case 8:
      return tv::uint64;
    default:
      break;
    }
    break;
  case 'f':
    switch (itemsize) {
    case 2:
      return tv::float16;
    case 4:
      return tv::float32;
    case 8:
      return tv::float64;
    default:
      break;
    }
    break;
  default:
    break;
  }
  TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
}
// Wraps the buffer of an untyped numpy array in a tv::Tensor via
// tv::from_blob (device argument -1), deriving the dtype from the array at
// run time. The array must be C-contiguous.
template <typename Tarr> Tensor array2tensor(Tarr &arr) {
  TV_ASSERT_INVALID_ARG(is_c_stype(arr), "array must be c-contiguous array");
  TensorShape dims;
  const auto ndim = arr.ndim();
  for (int axis = 0; axis < ndim; ++axis) {
    dims.push_back(arr.shape(axis));
  }
  const auto dtype = get_array_tv_dtype(arr);
  return tv::from_blob(arr.mutable_data(), dims, dtype, -1);
}
// Wraps the buffer of a typed numpy array in a tv::Tensor via tv::from_blob
// (device argument -1); the dtype is taken from T at compile time. The array
// must be C-contiguous.
template <typename T> Tensor arrayt2tensor(py::array_t<T> &arr) {
  TV_ASSERT_INVALID_ARG(is_c_stype(arr), "array must be c-contiguous array");
  TensorShape dims;
  const auto ndim = arr.ndim();
  for (int axis = 0; axis < ndim; ++axis) {
    dims.push_back(arr.shape(axis));
  }
  return tv::from_blob(arr.mutable_data(), dims, tv::type_v<T>, -1);
}
// Converts a tv dtype tag into the matching numpy dtype object.
// Throws via TV_THROW_INVALID_ARG when the tag has no numpy counterpart here.
template <typename TDType> py::dtype tv_dtype_to_py(TDType d) {
  const char *numpy_name = nullptr;
  switch (d) {
  case float32:
    numpy_name = "float32";
    break;
  case float64:
    numpy_name = "float64";
    break;
  case float16:
    numpy_name = "float16";
    break;
  case int32:
    numpy_name = "int32";
    break;
  case int16:
    numpy_name = "int16";
    break;
  case int8:
    numpy_name = "int8";
    break;
  case int64:
    numpy_name = "int64";
    break;
  case uint32:
    numpy_name = "uint32";
    break;
  case uint16:
    numpy_name = "uint16";
    break;
  case uint8:
    numpy_name = "uint8";
    break;
  case uint64:
    numpy_name = "uint64";
    break;
  case bool_:
    numpy_name = "bool_";
    break;
  default:
    break;
  }
  if (numpy_name != nullptr) {
    return py::dtype(numpy_name);
  }
  TV_THROW_INVALID_ARG("unknown dtype", d);
}
// Copies a CPU tensor into a newly created numpy array.
//
// Declared as a template only so that the definition can live in a header.
// Must not be called while the GIL is released. Throws via
// TV_ASSERT_INVALID_ARG if the tensor is not on the CPU (device() != -1).
template <typename Ttensor> py::array tensor2array(Ttensor &tensor) {
  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
  const auto &shape = tensor.shape();
  // Keep the full 64-bit extent of every dimension; a vector<int> would
  // narrow the tensor's int64 shape entries.
  std::vector<int64_t> shape_vec(shape.begin(), shape.end());
  auto dtype = tv_dtype_to_py(tensor.dtype());
  // Only read access is needed, so go through the const accessor. The
  // non-const raw_data() runs a writability check and would reject read-only
  // tensors even though nothing is written here.
  const Ttensor &readonly = tensor;
  // Constructing py::array from a pointer without a base object copies the
  // content. That is expected: ownership cannot be transferred from a C++
  // tv::Tensor to a numpy array, because the C++ object may be destroyed
  // first.
  return py::array(dtype, shape_vec, {}, readonly.raw_data());
}
} // namespace tv
// Copyright 2019 Yan Yan
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,19 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.
/*
tv::Tensor is a lightweight, header-only tensor container
without templates or annoying dependencies. No algorithms are implemented.
It should only be used when you want a simple, non-template container but
don't want to link with libtorch.
If you can use libtorch, don't use tv::Tensor.
*/
#pragma once
#include "mp_helper.h"
#include "tensorview.h"
#include <cstring>
#include <iomanip>
#include <memory>
#include <spconv/mp_helper.h>
#ifdef SPCONV_CUDA
#include <type_traits>
#ifdef TV_CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#endif
namespace tv
{
enum DType
{
namespace tv {
enum DType {
float32,
int32,
int16,
......@@ -39,51 +50,46 @@ enum DType
uint64
};
namespace detail
{
namespace detail {
template <typename T>
class TensorStorage
{
using all_tensor_types_t =
std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
uint16_t, uint32_t, uint64_t, bool>;
template <typename T> class TensorStorage {
public:
TensorStorage(size_t size, int device = -1, bool managed = false)
: mSize(size), device_(device), managed_(managed)
{
if (size == 0)
{
TensorStorage(size_t size, int device = -1, bool managed = false,
bool pinned = false)
: mSize(size), device_(device), managed_(managed), pinned_(pinned) {
if (size == 0) {
mPtr = nullptr;
}
else
{
if (device == -1)
{
#ifdef SPCONV_CUDA
checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
} else {
if (device == -1) {
if (pinned_) {
#ifdef TV_CUDA
checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
#else
mPtr = new T[size];
TV_THROW_INVALID_ARG("you need to define TV_CUDA to use pinned");
#endif
}
else
{
#ifdef SPCONV_CUDA
} else {
mPtr = new T[size];
}
} else {
#ifdef TV_CUDA
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount)
{
TV_ASSERT_INVALID_ARG("you provide device ", device,
" but you only have ", deviceCount, " device.");
if (device >= deviceCount) {
TV_THROW_INVALID_ARG("you provide device ", device,
" but you only have ", deviceCount, " device.");
}
cudaSetDevice(device);
if (managed)
{
if (managed) {
checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
}
else
{
} else {
checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
}
#else
TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
TV_THROW_INVALID_ARG("don't compiled with cuda");
#endif
}
}
......@@ -91,27 +97,23 @@ public:
TensorStorage(T *ptr, size_t size, int device)
: mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}
virtual ~TensorStorage()
{
if (empty())
{
virtual ~TensorStorage() {
if (empty()) {
return;
}
if (from_blob_)
{
if (from_blob_) {
return;
}
if (device_ == -1)
{
#ifdef SPCONV_CUDA
cudaFreeHost(mPtr);
#else
delete[] mPtr;
if (device_ == -1) {
if (pinned_) {
#ifdef TV_CUDA
cudaFreeHost(mPtr);
#endif
}
else
{
#ifdef SPCONV_CUDA
} else {
delete[] mPtr;
}
} else {
#ifdef TV_CUDA
cudaFree(mPtr);
#endif
}
......@@ -124,36 +126,33 @@ public:
bool empty() const { return mPtr == nullptr || mSize == 0; }
bool managed() const { return managed_; }
bool pinned() const { return pinned_; }
int device() const { return device_; }
void zero_()
{
if (device_ == -1)
{
void zero_() {
if (device_ == -1) {
std::memset(data(), 0, mSize);
// std::fill(data(), data() + mSize, 0);
}
else
{
#ifdef SPCONV_CUDA
} else {
#ifdef TV_CUDA
checkCudaErrors(cudaMemset(data(), 0, mSize / sizeof(T)));
#else
TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
TV_THROW_INVALID_ARG("don't compiled with cuda");
#endif
}
}
private:
T *mPtr = nullptr;
size_t mSize = 0;
int device_ = -1;
T *mPtr = nullptr;
bool from_blob_ = false;
int device_ = -1;
bool managed_ = false;
bool pinned_ = false;
};
size_t sizeof_dtype(DType dtype)
{
switch (dtype)
{
template <typename T> size_t sizeof_dtype(T dtype) {
switch (dtype) {
case float32:
return sizeof(float);
case int8:
......@@ -176,20 +175,16 @@ size_t sizeof_dtype(DType dtype)
return sizeof(uint32_t);
case uint64:
return sizeof(uint64_t);
#ifdef SPCONV_CUDA
case float16:
return sizeof(__half);
#endif
return 2;
default:
TV_THROW_RT_ERR("unsupported dtype");
}
return 0;
}
std::string typeString(DType t)
{
switch (t)
{
template <typename T> std::string typeString(T t) {
switch (t) {
case DType::bool_:
return "bool";
case DType::float32:
......@@ -212,165 +207,477 @@ std::string typeString(DType t)
return "uint32";
case DType::uint64:
return "uint64";
#ifdef SPCONV_CUDA
case DType::float16:
return "half";
#endif
default:
return "";
}
}
template <typename T>
struct TypeToDtypeTraits;
template <typename T> struct TypeToDtypeTraits;
template <> struct TypeToDtypeTraits<int32_t> {
static constexpr DType dtype = int32;
};
#ifdef TV_CUDA
template <> struct TypeToDtypeTraits<__half> {
static constexpr DType dtype = float16;
};
#endif
template <>
struct TypeToDtypeTraits<int32_t>
{
template <> struct TypeToDtypeTraits<float> {
static constexpr DType dtype = float32;
};
template <> struct TypeToDtypeTraits<double> {
static constexpr DType dtype = float64;
};
template <> struct TypeToDtypeTraits<int16_t> {
static constexpr DType dtype = int16;
};
template <> struct TypeToDtypeTraits<int8_t> {
static constexpr DType dtype = int8;
};
template <> struct TypeToDtypeTraits<int64_t> {
static constexpr DType dtype = int64;
};
template <> struct TypeToDtypeTraits<uint8_t> {
static constexpr DType dtype = uint8;
};
template <> struct TypeToDtypeTraits<uint16_t> {
static constexpr DType dtype = uint16;
};
template <> struct TypeToDtypeTraits<uint32_t> {
static constexpr DType dtype = uint32;
};
template <> struct TypeToDtypeTraits<uint64_t> {
static constexpr DType dtype = uint64;
};
template <> struct TypeToDtypeTraits<bool> {
static constexpr DType dtype = bool_;
};
template <> struct TypeToDtypeTraits<const int32_t> {
static constexpr DType dtype = int32;
};
#ifdef SPCONV_CUDA
template <>
struct TypeToDtypeTraits<__half>
{
#ifdef TV_CUDA
template <> struct TypeToDtypeTraits<const __half> {
static constexpr DType dtype = float16;
};
#endif
template <>
struct TypeToDtypeTraits<float>
{
template <> struct TypeToDtypeTraits<const float> {
static constexpr DType dtype = float32;
};
template <>
struct TypeToDtypeTraits<double>
{
template <> struct TypeToDtypeTraits<const double> {
static constexpr DType dtype = float64;
};
template <>
struct TypeToDtypeTraits<int16_t>
{
template <> struct TypeToDtypeTraits<const int16_t> {
static constexpr DType dtype = int16;
};
template <>
struct TypeToDtypeTraits<int8_t>
{
template <> struct TypeToDtypeTraits<const int8_t> {
static constexpr DType dtype = int8;
};
template <>
struct TypeToDtypeTraits<int64_t>
{
template <> struct TypeToDtypeTraits<const int64_t> {
static constexpr DType dtype = int64;
};
template <>
struct TypeToDtypeTraits<uint8_t>
{
template <> struct TypeToDtypeTraits<const uint8_t> {
static constexpr DType dtype = uint8;
};
template <>
struct TypeToDtypeTraits<uint16_t>
{
template <> struct TypeToDtypeTraits<const uint16_t> {
static constexpr DType dtype = uint16;
};
template <>
struct TypeToDtypeTraits<uint32_t>
{
template <> struct TypeToDtypeTraits<const uint32_t> {
static constexpr DType dtype = uint32;
};
template <>
struct TypeToDtypeTraits<uint64_t>
{
template <> struct TypeToDtypeTraits<const uint64_t> {
static constexpr DType dtype = uint64;
};
template <> struct TypeToDtypeTraits<const bool> {
static constexpr DType dtype = bool_;
};
} // namespace detail
template <class T>
constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
template <class T> constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
struct Tensor
{
// Run-time to compile-time type dispatch.
// Walks the candidate types Ts; for every candidate whose DType tag
// (type_v) equals t, calls f with a default-constructed value of that type so
// the callee can recover the type with decltype. If no candidate matches,
// throws via TV_THROW_RT_ERR with the requested dtype name and the list of
// available candidate names.
template <class... Ts, typename F> void dispatch(DType t, F &&f) {
static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
bool notFound = true;
mp_for_each<mp_list<Ts...>>([=, &notFound, &f](auto I) {
if (type_v<decltype(I)> == t) {
std::forward<F>(f)(decltype(I)());
notFound = false;
}
});
if (notFound) {
// Build the "available" list only on the error path.
std::stringstream ss;
mp_for_each<mp_list<Ts...>>([=, &ss](auto I) {
ss << detail::TypeToString<decltype(I)>::value << " ";
});
TV_THROW_RT_ERR("unknown type", detail::typeString(t),
", available:", ss.str());
}
}
// Run-time to compile-time value dispatch for an arbitrary scalar type T.
// Walks the compile-time candidates Is; for every candidate equal to idx,
// calls f with the candidate's list element (convertible to T, as the
// comparison T(I) shows). If no candidate matches, throws via
// TV_THROW_RT_ERR listing the available values.
template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
static_assert(sizeof...(Is) > 0,
"you need to provide at least one candidate");
bool notFound = true;
mp_for_each<mp_list_c<T, Is...>>([=, &notFound, &f](auto I) {
if (T(I) == idx) {
std::forward<F>(f)(I);
notFound = false;
}
});
if (notFound) {
// Build the "available" list only on the error path.
std::stringstream ss;
mp_for_each<mp_list_c<T, Is...>>([=, &ss](auto I) { ss << T(I) << " "; });
TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
}
}
// Run-time to compile-time dispatch over a fixed set of int candidates.
// For every candidate in Is equal to idx, calls f with the candidate's list
// element (convertible to int). If no candidate matches, throws via
// TV_THROW_RT_ERR listing the available values. Same logic as
// dispatch_scalar, spelled out for int.
template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
// used for kernel parameter selection
static_assert(sizeof...(Is) > 0,
"you need to provide at least one candidate");
bool notFound = true;
mp_for_each<mp_list_c<int, Is...>>([=, &notFound, &f](auto I) {
if (int(I) == idx) {
std::forward<F>(f)(I);
notFound = false;
}
});
if (notFound) {
// Build the "available" list only on the error path.
std::stringstream ss;
mp_for_each<mp_list_c<int, Is...>>(
[=, &ss](auto I) { ss << int(I) << " "; });
TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
}
}
/*
template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
return dispatch_scalar<int, Is...>(idx, f);
}
*/
// Functor form of dispatch(): the candidate types are taken from the
// template arguments of a type-list-like class template (e.g. a std::tuple).
template <class T> struct Dispatch;

template <template <class...> class TypeList, class... Candidates>
struct Dispatch<TypeList<Candidates...>> {
  // Forwards to dispatch<Candidates...>(t, f).
  template <typename F> void operator()(DType t, F &&f) {
    dispatch<Candidates...>(t, std::forward<F>(f));
  }
};
// Functor form of dispatch_int(): the candidate values are taken from the
// int template arguments of a class template.
template <class T> struct DispatchInt;

template <template <int...> class IntList, int... Candidates>
struct DispatchInt<IntList<Candidates...>> {
  // Forwards to dispatch_int<Candidates...>(t, f).
  template <typename F> void operator()(int t, F &&f) {
    dispatch_int<Candidates...>(t, std::forward<F>(f));
  }
};
constexpr size_t kTensorMaxDim = 10;
using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
struct Tensor {
Tensor() {}
Tensor(Shape shape, DType dtype, int device = -1, bool managed = false)
: dtype_(dtype)
{
Tensor(TensorShape shape, TensorShape stride, DType dtype, int device = -1,
bool pinned = false, bool managed = false)
: dtype_(dtype) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
shape_ = shape;
stride_ = stride;
}
Tensor(TensorShape shape, DType dtype, int device = -1, bool pinned = false,
bool managed = false)
: dtype_(dtype) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
shape_ = shape;
stride_ = shape.stride_rowmajor();
}
Tensor(void *ptr, TensorShape shape, TensorShape stride, DType dtype,
int device = -1)
: dtype_(dtype) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
shape.size() * detail::sizeof_dtype(dtype), device, managed);
reinterpret_cast<uint8_t *>(ptr),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
stride_ = stride;
}
Tensor(void *ptr, Shape shape, DType dtype, int device = -1) : dtype_(dtype)
{
Tensor(void *ptr, TensorShape shape, DType dtype, int device = -1)
: dtype_(dtype) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(ptr),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
stride_ = shape.stride_rowmajor();
}
template <typename T>
TensorView<T> tview()
{
Tensor(const void *ptr, TensorShape shape, TensorShape stride, DType dtype,
int device = -1)
: dtype_(dtype), writeable_(false) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(const_cast<void *>(ptr)),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
stride_ = stride;
}
Tensor(const void *ptr, TensorShape shape, DType dtype, int device = -1)
: dtype_(dtype), writeable_(false) {
TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(const_cast<void *>(ptr)),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
stride_ = shape.stride_rowmajor();
}
Tensor(std::initializer_list<int32_t> init)
: Tensor({int(init.size())}, tv::int32) {
std::copy(init.begin(), init.end(), data<int32_t>());
}
Tensor(std::initializer_list<int64_t> init)
: Tensor({int(init.size())}, tv::int64) {
std::copy(init.begin(), init.end(), data<int64_t>());
}
Tensor(std::initializer_list<float> init)
: Tensor({int(init.size())}, tv::float32) {
std::copy(init.begin(), init.end(), data<float>());
}
Tensor(std::initializer_list<double> init)
: Tensor({int(init.size())}, tv::float64) {
std::copy(init.begin(), init.end(), data<double>());
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<(Rank > 0), int>::type = 0>
TensorView<T, Rank, PtrTraits, Tindex> tview() {
using tv_shape_t =
typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
writable_check();
static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
return TensorView<T>(reinterpret_cast<T *>(storage_->data()), shape_);
tv_shape_t shape(Rank), stride(Rank);
for (int i = 0; i < Rank; ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<T, Rank, PtrTraits, Tindex>(
reinterpret_cast<T *>(data<T>()), shape, stride);
}
template <typename T>
TensorView<T> tview() const
{
TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<Rank == -1, int>::type = 0>
TensorView<T, Rank, PtrTraits, Tindex> tview() {
using tv_shape_t =
typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
writable_check();
static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return TensorView<const std::remove_const_t<T>>(
reinterpret_cast<const std::remove_const_t<T> *>(storage_->data()),
shape_);
ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
for (int i = 0; i < ndim(); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<T, Rank, PtrTraits, Tindex>(
reinterpret_cast<T *>(data<T>()), shape, stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<(Rank > 0), int>::type = 0>
TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
tview() const {
static_assert(Rank == -1 || Rank > 0, "error");
if (Rank > 0) {
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
}
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
for (int i = 0; i < Rank; ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<Rank == -1, int>::type = 0>
TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
tview() const {
static_assert(Rank == -1 || Rank > 0, "error");
if (Rank > 0) {
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
}
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
for (int i = 0; i < ndim(); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
}
// Returns a tensor that shares this tensor's storage but uses a new shape
// with row-major strides.
//
// At most one entry may be -1; it is replaced by size() divided by the
// product of the other entries. All other entries must be > 0, and the
// resulting element count must equal size().
//
// The inferred extent is computed only after every explicit entry has been
// validated, so a non-positive entry after the -1 is reported as an invalid
// argument instead of causing an integer division by zero.
template <class... Inds> Tensor view(Inds... newShapes) const {
  static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
  TensorShape shape{int(newShapes)...};
  bool found_minus_1 = false;
  size_t inferred_axis = 0;
  for (size_t i = 0; i < shape.ndim(); ++i) {
    if (!found_minus_1) {
      if (shape[i] == -1) {
        // Neutral placeholder so that shape.size() below is the product of
        // the explicitly given extents.
        shape[i] = 1;
        inferred_axis = i;
        found_minus_1 = true;
      } else {
        TV_ASSERT_INVALID_ARG(shape[i] > 0,
                              "shape except -1 must larger than 0");
      }
    } else {
      TV_ASSERT_INVALID_ARG(shape[i] > 0, "multiple -1 in your argument.");
    }
  }
  if (found_minus_1) {
    // Every entry is positive here, so the divisor is non-zero.
    shape[inferred_axis] = size() / shape.size();
  }
  TV_ASSERT_RT_ERR(shape.size() == size(), "error");
  Tensor res(*this);
  res.shape_ = shape;
  res.stride_ = shape.stride_rowmajor();
  return res;
}
// Returns a tensor sharing this tensor's storage, reinterpreted with the
// given shape and row-major strides. The new shape must describe exactly
// size() elements.
Tensor view(TensorShape shape) const {
  TV_ASSERT_RT_ERR(shape.size() == size(), "error");
  Tensor reshaped(*this);
  reshaped.stride_ = shape.stride_rowmajor();
  reshaped.shape_ = shape;
  return reshaped;
}
// View of this tensor with the shape produced by shape_.squeeze().
Tensor squeeze() const {
  const TensorShape squeezed = shape_.squeeze();
  return view(squeezed);
}

// View of this tensor with the shape produced by shape_.squeeze(axis).
// A negative axis counts from the end (axis + ndim()).
Tensor squeeze(int axis) const {
  const int resolved = axis < 0 ? int(ndim() + axis) : axis;
  return view(shape_.squeeze(resolved));
}
// View of this tensor with the shape produced by shape_.unsqueeze(axis).
// A negative axis counts from the end (axis + ndim()).
Tensor unsqueeze(int axis) const {
  const int resolved = axis < 0 ? int(ndim() + axis) : axis;
  return view(shape_.unsqueeze(resolved));
}
bool pinned() const { return storage_->pinned(); }
// Returns a tensor sharing this tensor's storage that covers rows
// [start, end) of the first axis. Negative start/end count from the end of
// the first axis. Only contiguous tensors are supported.
//
// Throws via TV_ASSERT_INVALID_ARG when the range is empty, reversed or
// outside [0, shape_[0]].
Tensor slice_first_axis(int start, int end) const {
  TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
  if (start < 0) {
    start = shape_[0] + start;
  }
  if (end < 0) {
    end = shape_[0] + end;
  }
  TV_ASSERT_INVALID_ARG(start >= 0, "start must not be before dim 0 begin");
  TV_ASSERT_INVALID_ARG(start < shape_[0], "start must small than dim 0");
  TV_ASSERT_INVALID_ARG(start < end, "start must small than end");
  TV_ASSERT_INVALID_ARG(end <= shape_[0], "end must not exceed dim 0");
  // Byte distance from the first row of this tensor to row `start`.
  size_t new_offset = start * shape_.prod(1) * itemsize();
  Tensor res(*this);
  TensorShape newshape(shape_);
  newshape[0] = end - start;
  res.shape_ = newshape;
  res.stride_ = stride_;
  // Accumulate on top of the existing offset so that slicing an already
  // sliced tensor still addresses the right rows of the shared storage.
  res.offset_ = offset_ + new_offset;
  return res;
}
bool empty() const { return storage_->empty(); }
DType dtype() const { return dtype_; }
int device() const { return storage_->device(); }
const Shape &shape() const { return shape_; }
int dim(int idx) const
{
TV_ASSERT_RT_ERR(idx < shape_.size(), "error");
return shape_[idx];
size_t ndim() const { return shape_.ndim(); }
const TensorShape &shape() const { return shape_; }
const TensorShape &stride() const { return stride_; }
// Returns the extent of axis idx. A negative idx counts from the last axis
// (-1 is the last axis). Throws via TV_ASSERT_RT_ERR when idx is out of
// range.
//
// Bounds are checked against the number of axes (shape_.ndim()), not against
// shape_.size(), which is the total element count.
int dim(int idx) const {
  const int rank = int(shape_.ndim());
  if (idx < 0) {
    TV_ASSERT_RT_ERR(rank + idx >= 0, idx, shape_);
    return shape_[rank + idx];
  } else {
    TV_ASSERT_RT_ERR(idx < rank, idx, shape_);
    return shape_[idx];
  }
}
const uint8_t *raw_data() const { return storage_->data(); }
const uint8_t *raw_data() const { return storage_->data() + offset_; }
size_t raw_size() const { return size() * itemsize(); }
size_t size() const { return shape_.size(); }
Tensor &zero_()
{
size_t itemsize() const { return detail::sizeof_dtype(dtype_); }
Tensor &zero_() {
writable_check();
storage_->zero_();
return *this;
}
uint8_t *raw_data() { return storage_->data(); }
template <typename T>
Tensor &fill_(T value)
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
auto ptr = reinterpret_cast<T *>(raw_data());
std::fill(ptr, ptr + size(), value);
uint8_t *raw_data() {
writable_check();
return storage_->data() + offset_;
}
template <typename T> Tensor &fill_(T value) {
writable_check();
TV_ASSERT_RT_ERR(device() == -1, "error");
Dispatch<detail::all_tensor_types_t>()(dtype_, [&](auto I) {
using Treal = decltype(I);
if (std::is_convertible<T, Treal>::value) {
auto ptr = reinterpret_cast<Treal *>(raw_data());
std::fill(ptr, ptr + size(), Treal(value));
} else {
TV_THROW_INVALID_ARG("not convertable from", type_s<T>, "to",
type_s<Treal>);
}
});
return *this;
}
template <typename T>
T *data()
{
template <typename T> T *data() {
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
writable_check();
return reinterpret_cast<T *>(raw_data());
}
template <typename T>
const T *data() const
{
template <typename T> const T *data() const {
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return reinterpret_cast<const T *>(raw_data());
}
void copy_(const Tensor &tensor)
{
void copy_(const Tensor &tensor) {
writable_check();
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype");
if (device() == -1 && tensor.device() == -1)
{
#ifdef SPCONV_CUDA
TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
detail::typeString(dtype()),
detail::typeString(tensor.dtype()));
if (device() == -1 && tensor.device() == -1) {
#ifdef TV_CUDA
host2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
#else
......@@ -379,88 +686,162 @@ struct Tensor
storage_->data());
#endif
}
#ifdef SPCONV_CUDA
else if (device() >= 0 && tensor.device() == -1)
{
// host2dev
#ifdef TV_CUDA
else if (device() >= 0 && tensor.device() == -1) {
host2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
else if (device() == -1 && tensor.device() >= 0)
{
// dev2host
} else if (device() == -1 && tensor.device() >= 0) {
dev2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
else if (device() >= 0 && tensor.device() >= 0)
{
// dev2dev
} else if (device() >= 0 && tensor.device() >= 0) {
dev2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
#endif
else
{
TV_ASSERT_RT_ERR(false, "only support cpu tensor");
else {
TV_THROW_RT_ERR("only support cpu tensor");
}
}
Tensor cpu() const
{
if (storage_->device() == -1)
{
return *this;
#ifdef TV_CUDA
void copy_(const Tensor &tensor, cudaStream_t stream) {
writable_check();
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
detail::typeString(dtype()),
detail::typeString(tensor.dtype()));
if (device() == -1 && tensor.device() == -1) {
host2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_), stream);
} else if (device() >= 0 && tensor.device() == -1) {
host2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_), stream);
} else if (device() == -1 && tensor.device() >= 0) {
dev2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_), stream);
} else if (device() >= 0 && tensor.device() >= 0) {
dev2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_), stream);
} else {
TV_THROW_RT_ERR("only support cpu tensor");
}
Tensor res(shape_, dtype_, -1, storage_->managed());
}
#endif
// Returns a copy of this tensor in host memory (device -1). The result never
// shares storage with *this: a tensor that is already on the host is cloned.
Tensor cpu() const {
  if (storage_->device() == -1) {
    // cpu() should always copy tensor.
    return clone();
  }
  // Argument order is (shape, stride, dtype, device, pinned, managed): pass
  // the managed flag in the managed slot instead of the pinned slot.
  Tensor res(shape_, stride_, dtype_, -1, /*pinned=*/false,
             storage_->managed());
  res.copy_(*this);
  return res;
}
template <typename T>
void copy_(const TensorView<T> &tensor, int device)
{
template <typename T> void copy_(const TensorView<T> &tensor, int device) {
writable_check();
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
Tensor src = from_blob(tensor, device);
return copy_(src);
}
// Shallow copy assignment: afterwards *this refers to the same storage as
// `tensor` and has the same dtype, shape, stride, offset and writable flag.
Tensor &operator=(const Tensor &tensor) {
  // Shared storage handle first, then the metadata describing the view.
  storage_ = tensor.storage_;
  dtype_ = tensor.dtype_;
  shape_ = tensor.shape_;
  stride_ = tensor.stride_;
  offset_ = tensor.offset_;
  writeable_ = tensor.writeable_;
  return *this;
}
// Shallow copy constructor: the new tensor refers to the same storage as
// `tensor` and copies its dtype, shape, stride, offset and writable flag.
Tensor(const Tensor &tensor) {
  storage_ = tensor.storage_;
  dtype_ = tensor.dtype_;
  shape_ = tensor.shape_;
  stride_ = tensor.stride_;
  offset_ = tensor.offset_;
  writeable_ = tensor.writeable_;
}
// Deep copy: allocates a new tensor with the same shape, stride, dtype and
// device (optionally pinned) and copies this tensor's content into it.
// Requires a non-empty, contiguous tensor.
Tensor clone(bool pinned = false) const {
  TV_ASSERT_RT_ERR(!empty(), "clone a empty tensor");
  TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
  const bool is_managed = storage_->managed();
  Tensor duplicate(shape_, stride_, dtype_, device(), pinned, is_managed);
  duplicate.copy_(*this);
  return duplicate;
}
// Returns a new tensor whose elements are this tensor's elements converted
// to `dtype`. When `dtype` equals the current dtype the result is clone().
// Otherwise the tensor must be a non-empty, contiguous CPU tensor.
Tensor astype(DType dtype) {
if (dtype == dtype_) {
return clone();
}
TV_ASSERT_INVALID_ARG(device() == -1, "only support cpu tensor");
TV_ASSERT_INVALID_ARG(!empty(), "can't be used in empty tensor");
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
auto tensor = Tensor();
// Double dispatch: the outer call resolves the destination element type,
// the inner call resolves the current element type.
Dispatch<detail::all_tensor_types_t>()(dtype, [&](auto Idst) {
using Tdst = decltype(Idst);
Dispatch<detail::all_tensor_types_t>()(dtype_, [&](auto Icur) {
using Tcur = decltype(Icur);
if (std::is_convertible<Tcur, Tdst>::value) {
auto ptr = data<Tcur>();
// Allocate the result with the same shape/stride/device/pinned/managed
// settings, then convert element by element through std::copy.
tensor = Tensor(shape_, stride_, dtype, device(), pinned(),
storage_->managed());
std::copy(ptr, ptr + size(), tensor.data<Tdst>());
} else {
TV_THROW_INVALID_ARG("not convertable from", type_s<Tcur>, "to",
type_s<Tdst>);
}
});
});
return tensor;
}
template <class... Ts, typename F> inline void dispatch(F &&f) {
return tv::dispatch<Ts...>(dtype_, std::forward<F>(f));
}
protected:
inline void writable_check() {
TV_ASSERT_RT_ERR(writeable_,
"you cant do non-const operation when not writable");
}
DType dtype_;
std::shared_ptr<detail::TensorStorage<uint8_t>> storage_;
Shape shape_;
TensorShape shape_;
size_t offset_ = 0;
TensorShape stride_;
private:
bool writeable_ = true;
bool contiguous_ = true;
};
inline Tensor from_blob(void *ptr, Shape shape, DType dtype, int device)
{
return Tensor(ptr, shape, dtype, device);
// Streams a textual representation of a CPU tensor to `os`.
// The element type is resolved at run time from tensor.dtype(); the text is
// produced by the repr() of an int64-indexed TensorView of the tensor. For
// float and double a precision of 4 is set on the scratch stream handed to
// repr(). Throws via TV_ASSERT_INVALID_ARG if the tensor is not on the CPU.
template <typename Os> Os &operator<<(Os &os, const Tensor &tensor) {
TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
Dispatch<detail::all_tensor_types_t>()(tensor.dtype(), [&](auto I) {
using T = decltype(I);
std::stringstream ss;
if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
ss << std::setprecision(4);
}
os << tensor.tview<T, -1, DefaultPtrTraits, int64_t>().repr(ss);
});
return os;
}
template <typename T>
Tensor from_blob(TensorView<T> tensor, int device)
{
return Tensor(tensor.data(), tensor.shape, type_v<T>, device);
// Builds a Tensor over a caller-supplied mutable buffer by forwarding
// `ptr`, `shape`, `dtype` and `device` to the matching Tensor constructor.
// NOTE(review): presumably the Tensor does not take ownership of `ptr` --
// confirm against the Tensor(void *, ...) constructor.
inline Tensor from_blob(void *ptr, TensorShape shape, DType dtype, int device) {
return Tensor(ptr, shape, dtype, device);
}
template <class... Ts, typename F>
void dispatch(DType t, F &&f)
{
static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
bool notFound = true;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
if (type_v<decltype(I)> == t)
{
std::forward<F>(f)(decltype(I)());
notFound = false;
}
});
if (notFound)
{
std::stringstream ss;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
ss << detail::TypeToString<decltype(I)>::value << " ";
});
TV_THROW_RT_ERR("unknown type", detail::typeString(t),
", available: ", ss.str());
}
// Const-buffer overload: builds a Tensor over a caller-supplied read-only
// buffer by forwarding all arguments to the Tensor(const void *, ...)
// constructor.
// NOTE(review): presumably the resulting Tensor is marked non-writable and
// does not own `ptr` -- confirm against that constructor.
inline Tensor from_blob(const void *ptr, TensorShape shape, DType dtype,
int device) {
return Tensor(ptr, shape, dtype, device);
}
} // namespace tv
\ No newline at end of file
// Copyright 2019 Yan Yan
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -17,19 +17,19 @@
#include <cassert>
#include <cstdlib>
#include "common.h"
#include "prettyprint.h"
#include <iostream>
#include <memory>
#include <sstream>
#include <type_traits>
#include <vector>
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
#include <cuda_runtime_api.h>
#endif
namespace tv {
#ifdef __NVCC__
#if (defined(__clang__) && defined(__CUDA__)) || defined(__NVCC__)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
......@@ -54,54 +54,6 @@ namespace tv {
} \
}
// Device-side check: when `expr` is false, the thread with threadIdx.x == 0
// prints the printf-style message given in the variadic arguments, then every
// thread evaluates assert(expr). Note that `expr` is evaluated twice.
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) \
printf(__VA_ARGS__); \
assert(expr); \
}
// Writes a single value to `ss` with no separator.
template <class SStream, class T> void sstream_print(SStream &ss, T val) {
  ss << val;
}
// Writes all values to `ss` in argument order, separated by single spaces
// (no trailing space).
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
  ss << val;
  // Braced-init-list expansion guarantees left-to-right evaluation, so each
  // remaining argument is emitted in order, preceded by one space.
  using expander = int[];
  (void)expander{0, ((void)(ss << " " << args), 0)...};
}
// Formats all arguments space-separated (via sstream_print) into a temporary
// buffer and writes the result to std::cout as one line, flushing the stream.
template <class... TArgs> void ssprint(TArgs... args) {
  std::stringstream line;
  sstream_print(line, args...);
  const auto text = line.str();
  std::cout << text << std::endl;
}
// Throws std::runtime_error whose message is "<file> <line>\n" followed by the
// macro arguments joined with spaces (tv::sstream_print).
// NOTE(review): the expansion is a bare { } block, so `MACRO(...);` inside an
// if/else without braces leaves a stray `;` before the `else`.
#define TV_THROW_RT_ERR(...) \
{ \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
}
// Runtime assertion: when `expr` is false, throws std::runtime_error through
// TV_THROW_RT_ERR with the remaining arguments as the message.
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) \
TV_THROW_RT_ERR(__VA_ARGS__); \
}
// Argument assertion: when `expr` is false, throws std::invalid_argument. The
// message is "<file> <line>\n", the stringified expression, a fixed suffix and
// then the remaining macro arguments joined with spaces.
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto __macro_err = cudaGetLastError(); \
......@@ -109,6 +61,7 @@ template <class... TArgs> void ssprint(TArgs... args) {
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::runtime_error(__macro_s.str()); \
} \
}
......@@ -122,11 +75,12 @@ template <class... TArgs> void ssprint(TArgs... args) {
__macro_s << "cuda execution failed with error " << __macro_err; \
__macro_s << " " << cudaGetErrorString(__macro_err) << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
TV_BACKTRACE_PRINT(__macro_s); \
throw std::runtime_error(__macro_s.str()); \
} \
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
struct GPU {
GPU(cudaStream_t s = 0) : mStream(s) {}
virtual cudaStream_t getStream() const { return mStream; }
......@@ -135,7 +89,18 @@ struct GPU {
#endif
struct CPU {};
// Default capacity (maximum number of dimensions) used as the MaxDim template
// argument of SimpleVector / ShapeBase. Define TV_MAX_DIM before including
// this header to override it.
#ifndef TV_MAX_DIM
#define TV_MAX_DIM 6
#endif
// Pointer-traits policy that maps an element type T to a plain `T *`.
template <typename T> struct DefaultPtrTraits {
  using type = T *;
};
#if defined(__CUDACC__) || defined(__HIPCC__)
// Pointer-traits policy that maps T to a `__restrict__`-qualified `T *`.
// Only declared when compiling with a CUDA or HIP compiler (see the
// surrounding #if).
template <typename T> struct RestrictPtrTraits {
typedef T *__restrict__ type;
};
#endif
/*
template <typename T>
constexpr size_t calc_align(size_t ndim)
......@@ -160,57 +125,73 @@ template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
public:
TV_HOST_DEVICE_INLINE SimpleVector(){};
TV_HOST_DEVICE_INLINE SimpleVector(size_t count, T init = T())
: size_(count) {
for (size_t i = 0; i < count; ++i) {
array_[i] = init;
}
};
template <typename Iterator> SimpleVector(Iterator first, Iterator last) {
size_ = 0;
for (; first != last; ++first) {
if (size_ >= MaxDim) {
TV_THROW_INVALID_ARG("iterator too long");
}
array_[size_++] = *first;
}
};
TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
TV_ASSERT(q.size() <= MaxDim);
mSize = 0;
size_ = 0;
for (T s : q) {
mArray[mSize++] = s;
array_[size_++] = s;
}
mSize = q.size();
size_ = q.size();
}
SimpleVector(const std::vector<T> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
array_[i] = arr[i];
}
mSize = arr.size();
size_ = arr.size();
}
TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
array_[i] = arr[i];
}
mSize = arr.size();
size_ = arr.size();
}
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
TV_ASSERT(idx >= 0 && idx < size_);
#endif
return mArray[idx];
return array_[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
TV_ASSERT(idx >= 0 && idx < size_);
#endif
return mArray[idx];
return array_[idx];
}
TV_HOST_DEVICE_INLINE void push_back(T s) {
#ifdef TV_DEBUG
TV_ASSERT(mSize < MaxDim);
TV_ASSERT(size_ < MaxDim);
#endif
mArray[mSize] = s;
mSize++;
array_[size_] = s;
size_++;
}
TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
TV_ASSERT(mSize > 0);
TV_ASSERT(size_ > 0);
#endif
mSize--;
size_--;
}
TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
TV_HOST_DEVICE_INLINE const T *data() const { return mArray; }
TV_HOST_DEVICE_INLINE size_t empty() const { return mSize == 0; }
TV_HOST_DEVICE_INLINE size_t size() const { return size_; }
TV_HOST_DEVICE_INLINE const T *data() const { return array_; }
TV_HOST_DEVICE_INLINE T *data() { return array_; }
TV_HOST_DEVICE_INLINE size_t empty() const { return size_ == 0; }
typedef size_t size_type;
......@@ -234,10 +215,10 @@ public:
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
return ptr_ != rhs.ptr_;
}
......@@ -265,10 +246,10 @@ public:
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
return ptr_ != rhs.ptr_;
}
......@@ -276,28 +257,28 @@ public:
pointer ptr_;
};
TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
TV_HOST_DEVICE_INLINE iterator begin() { return iterator(array_); }
TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
TV_HOST_DEVICE_INLINE iterator end() { return iterator(array_ + size_); }
TV_HOST_DEVICE_INLINE const_iterator begin() const {
return const_iterator(mArray);
return const_iterator(array_);
}
TV_HOST_DEVICE_INLINE const_iterator end() const {
return const_iterator(mArray + mSize);
return const_iterator(array_ + size_);
}
TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
return const_iterator(mArray);
return const_iterator(array_);
}
TV_HOST_DEVICE_INLINE const_iterator cend() const {
return const_iterator(mArray + mSize);
return const_iterator(array_ + size_);
}
protected:
T mArray[MaxDim];
size_t mSize = 0;
T array_[MaxDim];
size_t size_ = 0;
};
template <typename T, size_t MaxDim>
......@@ -323,28 +304,28 @@ struct Slice {
template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
SimpleVector<int, 3> slices{int(ints)...};
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
slices_[0] = -1;
slices_[1] = -1;
slices_[2] = -1;
for (size_t i = 0; i < slices.size(); ++i) {
mSlices[i] = slices[i];
slices_[i] = slices[i];
}
}
TV_HOST_DEVICE_INLINE Slice() {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
slices_[0] = -1;
slices_[1] = -1;
slices_[2] = -1;
}
template <typename T>
TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
slices_[0] = -1;
slices_[1] = -1;
slices_[2] = -1;
TV_ASSERT(slice.size() <= 3);
int idx = 0;
for (T s : slice) {
mSlices[idx] = int(s);
slices_[idx] = int(s);
++idx;
}
}
......@@ -352,90 +333,124 @@ struct Slice {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
return slices_[idx];
}
TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
return slices_[idx];
}
protected:
int mSlices[3];
int slices_[3];
};
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<int, MaxDim> vec)
: SimpleVector<int, MaxDim>(vec) {}
template <size_t MaxDim = TV_MAX_DIM, typename Tindex = int>
struct ShapeBase : public SimpleVector<Tindex, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<Tindex, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<Tindex> shape)
: SimpleVector<Tindex, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<Tindex, MaxDim> vec)
: SimpleVector<Tindex, MaxDim>(vec) {}
template <typename T, template <class...> class Container>
ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
ShapeBase(Container<T> shape) : SimpleVector<Tindex, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
: SimpleVector<int, MaxDim>(shape) {}
ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
: SimpleVector<Tindex, MaxDim>(shape) {}
ShapeBase(const std::vector<Tindex> &arr)
: SimpleVector<Tindex, MaxDim>(arr) {}
ShapeBase<MaxDim, Tindex> &
operator=(const ShapeBase<MaxDim, Tindex> &shape) = default;
TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start,
Tindex end) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end <= this->mSize && end > start);
TV_ASSERT(start >= 0 && end <= this->size_ && end > start);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) {
shape.push_back(this->mArray[i]);
ShapeBase<MaxDim, Tindex> shape;
for (Tindex i = start; i < end; ++i) {
shape.push_back(this->array_[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && start <= this->mSize);
TV_ASSERT(start >= 0 && start <= this->size_);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < this->mSize; ++i) {
shape.push_back(this->mArray[i]);
ShapeBase<MaxDim, Tindex> shape;
for (size_t i = start; i < this->size_; ++i) {
shape.push_back(this->array_[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE size_t size() const {
if (this->mSize == 0)
TV_HOST_DEVICE size_t size() const {
if (this->size_ == 0)
return 0;
size_t s = 1;
for (int i = 0; i < int(this->mSize); ++i) {
s *= this->mArray[i];
for (int i = 0; i < int(this->size_); ++i) {
s *= this->array_[i];
}
return s;
}
TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (this->mArray[i] != 1)
shape.push_back(this->mArray[i]);
TV_HOST_DEVICE_INLINE size_t ndim() const { return this->size_; }
TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> squeeze() const {
ShapeBase<MaxDim, Tindex> shape;
for (size_t i = 0; i < this->size_; ++i) {
if (this->array_[i] != 1)
shape.push_back(this->array_[i]);
}
if (shape.empty()) {
// dont support empty shape for now
shape.push_back(1);
}
return shape;
}
// Returns a copy of this shape with axis `dim` removed when its extent is 1.
// If the extent of axis `dim` is not 1 (or `dim` matches no axis) the shape is
// returned unchanged. MaxDim2 is the capacity of the returned shape.
template <size_t MaxDim2 = MaxDim>
TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> squeeze(int dim) const {
  static_assert(MaxDim2 >= MaxDim - 1, "error");
  ShapeBase<MaxDim2, Tindex> squeezed;
  for (size_t pos = 0; pos < this->size_; ++pos) {
    // Only the requested axis is a candidate, and only if its extent is 1.
    const bool drop = (pos == size_t(dim)) && (this->array_[pos] == 1);
    if (!drop) {
      squeezed.push_back(this->array_[pos]);
    }
  }
  return squeezed;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (i != dim || this->mArray[i] != 1)
shape.push_back(this->mArray[i]);
// Returns a copy of this shape with a new axis of extent 1 inserted at
// position `dim`, where 0 <= dim <= ndim. dim == ndim appends the new axis at
// the end; previously that case silently returned the shape unchanged because
// the insertion only happened inside the per-axis loop.
// Values of `dim` outside [0, ndim] leave the shape unchanged.
// MaxDim2 is the capacity of the returned shape.
// NOTE(review): the result has ndim + 1 axes; when ndim already equals
// MaxDim2 the final push_back is only range-checked under TV_DEBUG.
template <size_t MaxDim2 = MaxDim>
TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> unsqueeze(int dim) const {
  static_assert(MaxDim2 >= MaxDim - 1, "error");
  ShapeBase<MaxDim2, Tindex> shape;
  for (size_t i = 0; i < this->size_; ++i) {
    if (i == size_t(dim))
      shape.push_back(1);
    shape.push_back(this->array_[i]);
  }
  // Trailing insertion: the loop above never visits index == size_.
  if (size_t(dim) == this->size_)
    shape.push_back(1);
  return shape;
}
TV_HOST_DEVICE size_t prod() const {
TV_HOST_DEVICE size_t prod(Tindex start = 0) const {
size_t res = 1;
for (size_t i = 0; i < this->mSize; ++i) {
res *= this->mArray[i];
for (size_t i = start; i < this->size_; ++i) {
res *= this->array_[i];
}
return res;
}
// Computes the contiguous row-major strides of this shape: the last axis has
// stride 1 and each earlier axis has the product of all later extents.
// An empty shape yields an empty stride vector.
// MaxDim2 is the capacity of the returned vector (must be >= MaxDim).
//
// The loop counts down with an unsigned size_t counter (`i-- > 0`) instead of
// a Tindex counter compared against `>= 0`, which never terminated for
// unsigned index types. The method is const so it can be used on const shapes;
// it does not modify the shape.
template <size_t MaxDim2 = MaxDim>
TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> stride_rowmajor() const {
  static_assert(MaxDim2 >= MaxDim, "error");
  Tindex p = Tindex(1);
  ShapeBase<MaxDim2, Tindex> res(this->size_);
  for (size_t i = this->size_; i-- > 0;) {
    res[i] = p;
    p *= this->array_[i];
  }
  return res;
}
};
using Shape = ShapeBase<TV_MAX_DIM>;
using Shape = ShapeBase<TV_MAX_DIM, int>;
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
......@@ -446,7 +461,9 @@ TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
#ifdef TV_DEBUG
TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
......@@ -471,7 +488,9 @@ TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
......@@ -495,7 +514,9 @@ TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
const Index *shape) {
unsigned offset = 0;
unsigned m = 1;
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
for (int i = NDim - 1; i >= 0; --i) {
offset += m * indexes[i];
m *= shape[i];
......@@ -515,416 +536,501 @@ TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
return index;
}
template <int N> struct ArrayIndexRowMajor {
// mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return indexes[N - 1] +
shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
// Converts a flat row-major offset into per-axis coordinates.
//   index : flat offset to decompose.
//   output: receives `ndim` coordinates, output[i] for axis i.
//   shape : extents of the `ndim` axes.
// Axes are processed from last to first. Returns what remains of `index`
// after all axes have been divided out.
template <typename Index>
TV_HOST_DEVICE Index rowArrayIdxInv(Index index, Index *output,
                                    const Index *shape, int ndim) {
  for (int axis = ndim; axis-- > 0;) {
    output[axis] = index % shape[axis];
    index = (index - output[axis]) / shape[axis];
  }
  return index;
}
// Recursive row-major offset computation over N indexes: each level returns
// `index + shape[N - 1] * (result of the N - 1 level on the remaining
// indexes)`. Recursion ends in the ArrayIndexRowMajorReverse<1>
// specialization.
template <int N> struct ArrayIndexRowMajorReverse {
  // Variant taking the extents as a raw pointer.
  template <typename TShape, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T index,
                                            Ts... inds) {
    return index +
           shape[N - 1] * ArrayIndexRowMajorReverse<N - 1>::run(shape, inds...);
  }
  // Variant taking the extents as a Shape. It must recurse through runShape:
  // run() expects a pointer, so forwarding a `const Shape &` to it does not
  // compile once this template is instantiated.
  template <typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T index,
                                                 Ts... inds) {
    return index + shape[N - 1] *
                       ArrayIndexRowMajorReverse<N - 1>::runShape(shape,
                                                                  inds...);
  }
};
// Recursion terminator for ArrayIndexRowMajorReverse: with a single index left
// the offset is that index itself; `shape` is not read.
template <> struct ArrayIndexRowMajorReverse<1> {
template <typename TShape, typename T>
TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T idx) {
return idx;
}
template <typename T>
TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T idx) {
return idx;
}
};
// Row-major offset of an Ndim-dimensional index, evaluated as a Horner scheme:
// ((i0 * s1 + i1) * s2 + i2) ... N is the number of indexes still to consume;
// `start` carries the partial offset accumulated so far. Recursion ends in the
// <1, Ndim> specialization.
// Per the original author: this formulation compiles to almost the same code
// as the hand-written expression (check on https://godbolt.org/).
template <int N, int Ndim> struct ArrayIndexRowMajor {
  // Extents given as a raw pointer.
  template <typename TShape, typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
                                            T index, Ts... inds) {
    // Axis whose extent scales the partial offset before the next index.
    constexpr int next_axis = Ndim - N + 1;
    return ArrayIndexRowMajor<N - 1, Ndim>::run(
        shape, (index + start) * shape[next_axis], inds...);
  }
  // Extents given as a Shape.
  template <typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned
  runShape(const Shape &shape, Tinit start, T index, Ts... inds) {
    constexpr int next_axis = Ndim - N + 1;
    return ArrayIndexRowMajor<N - 1, Ndim>::runShape(
        shape, (index + start) * shape[next_axis], inds...);
  }
};
// Recursion terminator for ArrayIndexRowMajor: the last index is added to the
// accumulated partial offset `start`; `shape` is not read.
template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
template <typename TShape, typename Tinit, typename T>
TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
T idx) {
return start + idx;
}
template <typename Tinit, typename T>
TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
Tinit start, T idx) {
return start + idx;
}
};
template <> struct ArrayIndexRowMajor<0> {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
// Zero-index case of ArrayIndexRowMajor: with no indexes supplied the offset
// is always 0; both `shape` and `start` are ignored.
template <> struct ArrayIndexRowMajor<0, 0> {
template <typename TShape, typename Tinit>
TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start) {
return 0;
}
template <typename Tinit>
TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
Tinit start) {
return 0;
}
};
namespace detail {
template <typename T> constexpr const char *simpleTypeName(T val = T());
template <> constexpr const char *simpleTypeName(float val) {
return "float32";
}
template <> constexpr const char *simpleTypeName(double val) {
return "float64";
}
template <> constexpr const char *simpleTypeName(int val) { return "int32"; }
template <> constexpr const char *simpleTypeName(unsigned val) {
return "uint32";
}
template <> constexpr const char *simpleTypeName(long val) { return "int64"; }
template <> constexpr const char *simpleTypeName(unsigned long val) {
return "uint64";
// Strided offset of an Ndim-dimensional index: sum over k of
// inds[k] * stride[k]. N is the number of indexes still to consume, so the
// index handled at this level belongs to axis (Ndim - N); `start` carries the
// partial sum. Recursion ends in the <1, Ndim> specialization, which handles
// axis Ndim - 1.
//
// The stride subscript is `Ndim - N`. The previous `Ndim - N + 1` paired
// index k with stride[k + 1], which disagreed with the terminator (it uses
// stride[Ndim - 1] for the last index) and never read stride[0].
// Per the original author: this formulation compiles to almost the same code
// as the hand-written expression (check on https://godbolt.org/).
template <int N, int Ndim> struct ArrayIndexStride {
  template <typename TShape, typename Tinit, typename T, class... Ts>
  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
                                            T index, Ts... inds) {
    return ArrayIndexStride<N - 1, Ndim>::run(
        stride, start + index * stride[Ndim - N], inds...);
  }
};
// Recursion terminator for ArrayIndexStride: the last index is scaled by the
// stride of the last axis (stride[Ndim - 1]) and added to the partial sum
// `start`.
template <int Ndim> struct ArrayIndexStride<1, Ndim> {
template <typename TShape, typename Tinit, typename T>
TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
T idx) {
return start + idx * stride[Ndim - 1];
}
};
#if __cplusplus >= 201703L
// C++17 fold-expression form of the strided offset: returns the sum of
// stride[n] * ids[n] over every n in the explicitly supplied index pack N.
// The result has the stride element type T.
// NOTE(review): relies on std::get / std::forward_as_tuple from <tuple>;
// confirm that header is pulled in by one of this file's includes.
template <size_t... N, class T, class... Ts>
TV_HOST_DEVICE_INLINE T array_index_stride(const T *stride, Ts... ids) {
return ((stride[N] * std::get<N>(std::forward_as_tuple(ids...))) + ...);
}
}; // namespace detail
#endif
template <typename T, int Rank = -1> struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
// explicit TV_HOST_DEVICE_INLINE TensorView(const
// TensorView<std::remove_const_t<T>> &tview) : mPtr(tview.data()),
// mShape(tview.shape()) {}
template <class... Integers>
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Integers... shapes)
: mPtr(ptr) {
mShape = {int(shapes)...};
namespace detail {
// Maps an element type to its short display name, exposed as the static
// member `value`. The primary template is intentionally left undefined so
// that an unsupported type is a compile-time error.
template <typename T> struct TypeToString;

// A const-qualified type has the same name as its unqualified counterpart.
// This single partial specialization replaces one hand-written `const`
// specialization per supported type.
template <typename T> struct TypeToString<const T> : TypeToString<T> {};

template <> struct TypeToString<bool> {
  static constexpr const char *value = "bool";
};
template <> struct TypeToString<int32_t> {
  static constexpr const char *value = "int32";
};
template <> struct TypeToString<float> {
  static constexpr const char *value = "float";
};
template <> struct TypeToString<double> {
  static constexpr const char *value = "double";
};
template <> struct TypeToString<int16_t> {
  static constexpr const char *value = "int16";
};
template <> struct TypeToString<int8_t> {
  static constexpr const char *value = "int8";
};
template <> struct TypeToString<int64_t> {
  static constexpr const char *value = "int64";
};
template <> struct TypeToString<uint8_t> {
  static constexpr const char *value = "uint8";
};
template <> struct TypeToString<uint16_t> {
  static constexpr const char *value = "uint16";
};
template <> struct TypeToString<uint32_t> {
  static constexpr const char *value = "uint32";
};
template <> struct TypeToString<uint64_t> {
  static constexpr const char *value = "uint64";
};
} // namespace detail
// Variable-template shorthand for the display name of T:
// type_s<T> is detail::TypeToString<T>::value.
template <typename T>
constexpr const char *type_s = detail::TypeToString<T>::value;
namespace detail {
template <typename T, int Rank,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int>
struct TensorAccesserBase {
static constexpr int rank_value = Rank;
using ptr_t = typename PtrTraits<T>::type;
static_assert(Rank > 0, "error");
explicit TV_HOST_DEVICE_INLINE TensorAccesserBase(ptr_t ptr,
const Tindex *stride_ptr)
: ptr_(ptr), stride_ptr_(stride_ptr) {}
TV_HOST_DEVICE_INLINE ptr_t data() { return ptr_; }
TV_HOST_DEVICE_INLINE const ptr_t data() const { return ptr_; }
template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
static_assert(sizeof...(inds) == Rank, "error");
return ptr_[ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...)];
}
operator TensorView<const T>() {
return TensorView<const T>(mPtr, mShape);
} // conversion function
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(const TensorView<T, Rank> &tensor) {
TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
"\n");
T *ptr = mPtr;
const T *other_ptr = tensor.data();
for (size_t i = 0; i < size(); ++i)
*(ptr++) = *(other_ptr++);
return *this;
template <class... Inds>
TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
static_assert(sizeof...(inds) == Rank, "error");
return ptr_[ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...)];
}
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(std::initializer_list<T1> seq) {
TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
"\n");
T *ptr = mPtr;
for (const T1 &s : seq)
*(ptr++) = T(s);
return *this;
protected:
const Tindex *stride_ptr_;
ptr_t ptr_;
};
} // namespace detail
// Rank-`Rank` accessor over a data pointer and a pointer to per-axis strides.
// operator[] peels off the leading axis: it returns a rank-(Rank - 1) accessor
// whose data pointer is advanced by stride[0] * i and whose stride pointer
// skips the first entry. The rank-1 case is handled by a partial
// specialization.
template <typename T, int Rank,
          template <class> class PtrTraits = DefaultPtrTraits,
          typename Tindex = int>
struct TensorAccesser
    : public detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex> {
  using ptr_t = typename PtrTraits<T>::type;
  static_assert(Rank > 0, "error");

  // Stores `ptr` and `stride_ptr` in the base class; no data is copied here.
  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
                                                const Tindex *stride_ptr)
      : detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex>(ptr,
                                                               stride_ptr) {}

  // Sub-accessor for slice `i` along the leading axis.
  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
  operator[](int i) {
    using sub_accessor_t = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
    const Tindex *inner_strides = this->stride_ptr_ + 1;
    return sub_accessor_t(this->ptr_ + this->stride_ptr_[0] * i,
                          inner_strides);
  }

  // Same as above for a const accessor; the returned sub-accessor still
  // refers to the same (possibly mutable) data.
  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
  operator[](int i) const {
    using sub_accessor_t = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
    const Tindex *inner_strides = this->stride_ptr_ + 1;
    return sub_accessor_t(this->ptr_ + this->stride_ptr_[0] * i,
                          inner_strides);
  }
};
// Rank-1 specialization of TensorAccesser: operator[] ends the recursion and
// returns a reference to the element at offset stride[0] * i.
template <typename T, template <class> class PtrTraits, typename Tindex>
struct TensorAccesser<T, 1, PtrTraits, Tindex>
: public detail::TensorAccesserBase<T, 1, PtrTraits, Tindex> {
using ptr_t = typename PtrTraits<T>::type;
// Stores `ptr` and `stride_ptr` in the base class.
explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
const Tindex *stride_ptr)
: detail::TensorAccesserBase<T, 1, PtrTraits, Tindex>(ptr, stride_ptr) {}
TV_HOST_DEVICE_INLINE T &operator[](int i) {
return this->ptr_[this->stride_ptr_[0] * i];
}
// The const overload also returns `T &`: constness of the accessor does not
// propagate to the element.
TV_HOST_DEVICE_INLINE T &operator[](int i) const {
return this->ptr_[this->stride_ptr_[0] * i];
}
};
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int>
struct TensorView {
static constexpr int rank_value = Rank;
using ptr_t = typename PtrTraits<T>::type;
using tv_shape_t = ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex>;
using no_cv_type = typename std::remove_cv<T>::type;
static_assert(Rank == -1 || Rank > 0, "error");
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(ptr_t ptr, tv_shape_t shape)
: ptr_(ptr), shape_(shape), stride_(shape.stride_rowmajor()) {}
explicit TV_HOST_DEVICE_INLINE TensorView(ptr_t ptr, tv_shape_t shape,
tv_shape_t stride)
: ptr_(ptr), shape_(shape), stride_(stride) {}
operator TensorView<const no_cv_type, Rank, PtrTraits, Tindex>() {
return TensorView<const no_cv_type, Rank, PtrTraits, Tindex>(ptr_, shape_);
} // conversion function
template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
#ifdef TV_DEBUG
static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
#if defined TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
shape_.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
shape_[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
constexpr int Ndim = sizeof...(Inds);
return ptr_[ArrayIndexRowMajor<Ndim, Ndim>::runShape(shape_, 0, inds...)];
}
template <class... Inds>
TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
#ifdef TV_DEBUG
static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
#if defined TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
shape_.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
shape_[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
constexpr int Ndim = sizeof...(Inds);
return ptr_[ArrayIndexRowMajor<Ndim, Ndim>::runShape(shape_, 0, inds...)];
}
TV_HOST_DEVICE_INLINE T &operator()() {
static_assert(Rank == -1 || 0 == Rank, "error");
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
shape_.ndim());
#endif
#endif
return mPtr[0];
return ptr_[0];
}
TV_HOST_DEVICE_INLINE const T &operator()() const {
static_assert(Rank == -1 || 0 == Rank, "error");
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
shape_.ndim());
#endif
#endif
return mPtr[0];
return ptr_[0];
}
template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
static_assert(Rank == -1 || 1 == Rank, "error");
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#endif
TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, shape_[0]);
#endif
return mPtr[i1];
return ptr_[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
static_assert(Rank == -1 || 2 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
#endif
return mPtr[i1 * mShape[1] + i2];
return ptr_[i1 * shape_[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
static_assert(Rank == -1 || 3 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
return ptr_[(i1 * shape_[1] + i2) * shape_[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
static_assert(Rank == -1 || 4 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
return ptr_[((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4];
}
template <class T1> TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
static_assert(Rank == -1 || 1 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
#endif
return mPtr[i1];
return ptr_[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
static_assert(Rank == -1 || 2 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
#endif
return mPtr[i1 * mShape[1] + i2];
return ptr_[i1 * shape_[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
static_assert(Rank == -1 || 3 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
return ptr_[(i1 * shape_[1] + i2) * shape_[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
static_assert(Rank == -1 || 4 == Rank, "error");
#if defined TV_DEBUG
TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
shape_.ndim());
TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
return ptr_[((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4];
}
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
return ptr_[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
return ptr_[idx];
}
// TODO: this conflicts with operator[](SimpleVector<Slice> slice_vec).
/*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
accessor(Tindex idx) {
static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(
ptr_ + stride_[0] * idx, stride_.data() + 1);
}
TV_HOST_DEVICE_INLINE const T &operator[](const Shape index) const {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}*/
// Slice indexing: hands the slice list to _subview and returns the view it
// produces.
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
operator[](SimpleVector<Slice> slice_vec) {
  return this->_subview(slice_vec);
}
// Const counterpart of the slice-indexing operator: returns the _subview
// result as a const TensorView.
// NOTE(review): the _subview visible in this file is not const-qualified, so
// confirm this overload still instantiates when called on a const view.
TV_HOST_DEVICE_INLINE const TensorView<T, Rank>
operator[](SimpleVector<Slice> slice_vec) const {
  return this->_subview(slice_vec);
}
// True when the view holds no pointer.
TV_HOST_DEVICE_INLINE bool empty() const {
  return mPtr == nullptr;
}
// The underlying data pointer (mutable and read-only overloads).
TV_HOST_DEVICE_INLINE T *data() {
  return mPtr;
}
TV_HOST_DEVICE_INLINE const T *data() const {
  return mPtr;
}
// The shape object describing this view.
TV_HOST_DEVICE_INLINE const Shape &shape() const {
  return mShape;
}
// Entry idx of the shape.
TV_HOST_DEVICE_INLINE int dim(int idx) const {
  return mShape[idx];
}
// Number of dimensions, as reported by the shape.
TV_HOST_DEVICE_INLINE int ndim() const {
  return mShape.ndim();
}
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Inds... newShapes) {
Shape shapes{int(newShapes)...};
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
// Builds a TensorAccesser of the same rank from ptr_ and the stride array.
// Only compiles when Rank is a positive compile-time value.
TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() {
  using accessor_t = TensorAccesser<T, Rank, PtrTraits, Tindex>;
  static_assert(Rank > 0, "rank must higher than zero");
  return accessor_t(ptr_, stride_.data());
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Shape shapes) {
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
// Accessor for position idx along the first axis: the pointer is advanced by
// stride_[0] * idx and the stride array is advanced by one entry, giving an
// accessor of rank Rank - 1.
TV_HOST_DEVICE_INLINE
TensorAccesser<T, Rank - 1, PtrTraits, Tindex> accessor(Tindex idx) const {
  using sub_accessor_t = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
  static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
  return sub_accessor_t(ptr_ + stride_[0] * idx, stride_.data() + 1);
}
// Const overload of accessor(): builds a TensorAccesser of the same rank from
// ptr_ and the stride array. Only compiles when Rank is a positive
// compile-time value.
TV_HOST_DEVICE_INLINE
TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() const {
  // The diagnostic text goes on the static_assert; the accessor itself is
  // constructed from (ptr_, stride_.data()) exactly as in the non-const
  // overload.
  static_assert(Rank > 0, "rank must higher than zero");
  return TensorAccesser<T, Rank, PtrTraits, Tindex>(ptr_, stride_.data());
}
// True when the view holds no pointer.
TV_HOST_DEVICE_INLINE bool empty() const {
  return ptr_ == nullptr;
}
// The underlying data pointer.
TV_HOST_DEVICE_INLINE ptr_t data() {
  return ptr_;
}
// NOTE(review): `const ptr_t` makes the returned pointer value itself const;
// whether the pointee is const depends on ptr_t, which is not visible here.
TV_HOST_DEVICE_INLINE const ptr_t data() const {
  return ptr_;
}
// The shape object describing this view.
TV_HOST_DEVICE_INLINE const tv_shape_t &shape() const {
  return shape_;
}
// The stride object stored alongside the shape.
TV_HOST_DEVICE_INLINE const tv_shape_t &stride() const {
  return stride_;
}
// Entry idx of the shape.
TV_HOST_DEVICE_INLINE int dim(int idx) const {
  return shape_[idx];
}
// Number of dimensions, as reported by the shape.
TV_HOST_DEVICE_INLINE int ndim() const {
  return shape_.ndim();
}
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Inds... newShapes) const {
Shape shapes{int(newShapes)...};
for (size_t i = 0; i < shapes.ndim(); ++i) {
TV_HOST_DEVICE_INLINE
TensorView<T, Rank == -1 ? -1 : sizeof...(Inds), PtrTraits, Tindex>
view(Inds... newShapes) const {
ShapeBase<Rank == -1 ? TV_MAX_DIM : sizeof...(Inds), Tindex> shapes{
int(newShapes)...};
for (size_t i = 0; i < sizeof...(newShapes); ++i) {
if (shapes[i] == -1) {
shapes[i] = 1;
shapes[i] = size() / shapes.size();
......@@ -932,220 +1038,221 @@ template <typename T, int Rank = -1> struct TensorView {
}
}
TV_ASSERT(shapes.size() == size());
return TensorView<T, Rank>(mPtr, shapes);
return TensorView < T, Rank == -1 ? -1 : sizeof...(Inds), PtrTraits,
Tindex > (ptr_, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Shape shapes) const {
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
view(Shape shapes) const {
TV_ASSERT(shapes.size() == size());
return TensorView<T, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze() const {
return TensorView<T, Rank>(mPtr, mShape.squeeze());
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze(int dim) const {
return TensorView<T, Rank>(mPtr, mShape.squeeze(dim));
}
TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
template <class... Slices>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slice slice,
Slices... slices) const {
return subview<float, Slice, Slices...>(slice, slices...);
}
template <class T2 = float, class... Slices>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slices... slices) const {
Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
Shape new_shape{to_slice(slices)[0]...};
Shape start{to_slice(slices)[0]...};
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
return TensorView<T, -1, PtrTraits, Tindex>(ptr_, shapes);
}
// Returns a dynamically ranked (-1) view over the same pointer whose shape is
// the result of shape_.squeeze().
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex> squeeze() const {
  using dynamic_view_t = TensorView<T, -1, PtrTraits, Tindex>;
  return dynamic_view_t(ptr_, shape_.squeeze());
}
// Returns a view over the same pointer whose shape is shape_.squeeze<N>(dim).
// The result rank is Rank - 1 for statically ranked views and stays dynamic
// (-1) otherwise; N is TV_MAX_DIM in the dynamic case and Rank - 1 otherwise.
TV_HOST_DEVICE_INLINE
TensorView<T, Rank == -1 ? -1 : Rank - 1, PtrTraits, Tindex>
squeeze(int dim) const {
  // `.template` makes `squeeze<...>` parse as a member-template call. When
  // shape_'s type depends on the class template parameters (tv_shape_t is not
  // visible here; presumably it does), the '<' and '>' would otherwise be
  // read as comparison operators.
  return TensorView<T, Rank == -1 ? -1 : Rank - 1, PtrTraits, Tindex>(
      ptr_,
      shape_.template squeeze<(Rank == -1 ? TV_MAX_DIM : Rank - 1)>(dim));
}
// Forwards to shape_.size() (presumably the total element count; confirm
// against the shape type).
TV_HOST_DEVICE_INLINE size_t size() const {
  return shape_.size();
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(int id, Integers... ints) {
tv_shape_t start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<T, Rank>(mPtr + offset, reduced_shape);
return TensorView<T, Rank, PtrTraits, Tindex>(
ptr_ + rowArrayIdx(shape_, start),
shape_.subshape(sizeof...(ints) + 1));
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(int id, Integers... ints) {
Shape start = {id, ints...};
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(int id, Integers... ints) const {
tv_shape_t start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(sizeof...(ints) + 1));
return TensorView<T, Rank, PtrTraits, Tindex>(
ptr_ + rowArrayIdx(shape_, start),
shape_.subshape(sizeof...(ints) + 1));
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
subview_ints(SimpleVector<int> ids) const {
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(SimpleVector<int> ids) const {
Shape start = ids;
for (int i = ids.size(); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(ids.size()));
}
std::string print_vec(TensorView<T> tensor) const {
std::ostringstream ss;
ss << "[";
for (size_t i = 0; i < tensor.dim(0) - 1; ++i) {
ss << tensor(i) << ", ";
}
ss << tensor(tensor.dim(0) - 1) << "]";
return ss.str();
return TensorView<T, Rank, PtrTraits, Tindex>(
ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
}
std::string repr() const {
std::ostringstream ss;
template <typename Os>
std::string repr(Os &ss, int limit = 1000, int limit_axis = 6) const {
if (empty())
return "";
if (mShape.ndim() == 0) {
ss << *mPtr;
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
if (shape_.ndim() == 0) {
ss << "Tensor[" << type_s<T> << "]" << std::endl;
ss << *ptr_;
return ss.str();
}
Shape counter = mShape;
auto tensor_flat = this->view(-1);
bool enable_limit = size() > limit;
for (int i = 0; i < counter.ndim() - 1; ++i) {
counter[i] = 0;
// ss << "[";
SimpleVector<int64_t, TV_MAX_DIM> prev(ndim(), -1);
SimpleVector<int64_t, TV_MAX_DIM> nd_index(ndim());
SimpleVector<int64_t, TV_MAX_DIM> _shape;
for (auto s : shape()) {
_shape.push_back(s);
}
for (size_t i = 0; i < this->size() / this->dim(this->ndim() - 1); ++i) {
for (int i = 0; i < counter.ndim() - 1; ++i) {
if (counter[i] == 0) {
ss << "[";
ss << "Tensor[" << type_s<T> << "]: shape=" << shape()
<< ", stride=" << stride() << std::endl;
auto ndimValue = ndim();
for (int64_t i = 0; i < size(); ++i) {
rowArrayIdxInv(i, nd_index.data(), _shape.data(), ndimValue);
bool newline = false;
int end_count = 0;
for (int j = 0; j < ndimValue; ++j) {
if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
prev[j] != -1) {
ss << "]";
++end_count;
newline = true;
}
}
std::cout << "counter.ndim() " << counter.ndim() << std::endl;
auto counter_ = counter.subshape(0, counter.ndim() - 1);
std::cout << counter.subshape(0, counter.ndim() - 1) << std::endl;
ss << print_vec(this->subview_ints(counter_)) << "\n";
std::cout << "after counter.ndim() " << counter.ndim() << std::endl;
for (int i = 0; i < counter.ndim() - 1; ++i) {
if (counter[i] == this->dim(i) - 1) {
ss << "]";
if (prev[0] == -1) {
end_count = ndimValue;
}
if (newline) {
ss << "\n";
}
int starts_count = 0;
for (int j = 0; j < ndimValue; ++j) {
if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
++starts_count;
}
}
}
// ss << "]";
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
return ss.str();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
_subview(SimpleVector<Slice> slice_vec) {
Shape new_shape;
for (int i = 0; i < slice_vec.size(); ++i) {
new_shape.push_back(slice_vec[i][0]);
}
Shape start = new_shape;
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
if (starts_count > 0) {
for (int j = 0; j < ndimValue - end_count; ++j) {
ss << " ";
}
for (int j = 0; j < starts_count; ++j) {
ss << "[";
}
}
if (std::is_same<T, uint8_t>::value ||
std::is_same<T, const uint8_t>::value) {
ss << unsigned((*this)[i]);
} else {
new_shape[i] = 1; // reduce dim
ss << (*this)[i];
}
}
auto offset = rowArrayIdx(mShape, start);
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
if (nd_index[ndimValue - 1] != _shape[ndimValue - 1] - 1) {
ss << ",";
}
for (int j = 0; j < ndimValue; ++j) {
prev[j] = nd_index[j];
}
}
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
for (int j = 0; j < ndimValue; ++j) {
ss << "]";
}
return TensorView<T, Rank>(mPtr + offset, reduced_shape);
return ss.str();
}
std::string repr() const {
std::ostringstream ss;
return repr(ss);
}
protected:
// Converts a single index into a Slice built as {int(s), -1, -1}.
template <typename T1> TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
  int first = int(s);
  return Slice{first, -1, -1};
}
// Slice arguments pass through unchanged; a copy is returned.
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const {
  Slice copy(s);
  return copy;
}
T *mPtr = nullptr;
Shape mShape;
ptr_t ptr_ = nullptr;
tv_shape_t shape_;
tv_shape_t stride_;
};
template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<T, Rank> &dt) {
// Wraps a std::vector in a TensorView that points at arr.data(), with the
// shape initialised from {arr.size()}.
template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
  T *first = arr.data();
  return TensorView<T>(first, {arr.size()});
}
// Wraps a std::vector in a TensorView that points at arr.data() and uses the
// caller-supplied shape. The shape's prod() must equal arr.size(); otherwise
// TV_ASSERT_INVALID_ARG fires with the message "error".
template <typename T>
TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
  TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
  TensorView<T> wrapped(arr.data(), shape);
  return wrapped;
}
// Read-only variant: wraps a const std::vector in a TensorView<const T> that
// points at arr.data(), with the shape initialised from {arr.size()}.
template <typename T> TensorView<const T> vector2tv(const std::vector<T> &arr) {
  const T *first = arr.data();
  return TensorView<const T>(first, {arr.size()});
}
// Stream insertion for TensorView: writes dt.repr() to the stream and returns
// the stream.
template <typename Os, typename T, int Rank, template <class> class PtrTraits,
          typename Tindex>
Os &operator<<(Os &os, const TensorView<T, Rank, PtrTraits, Tindex> &dt) {
  const std::string text = dt.repr();
  os << text;
  return os;
}
template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
template <typename Os, typename T, int Rank, template <class> class PtrTraits,
typename Tindex>
Os &operator<<(Os &os, const TensorView<const T, Rank, PtrTraits, Tindex> &dt) {
os << dt.repr();
return os;
}
namespace detail {
// Maps a value type to the printf conversion used when printing one element.
// The primary template is only declared; its defaulted argument lets callers
// write printfTypeFormat<T>() and reach the specialisations below.
template <typename T> constexpr const char *printfTypeFormat(T val = T());

// Floating-point values are printed with two decimals.
template <> constexpr const char *printfTypeFormat<float>(float) {
  return "%.2f";
}
template <> constexpr const char *printfTypeFormat<double>(double) {
  return "%.2f";
}

// Integer types use the matching length modifier and signedness.
template <> constexpr const char *printfTypeFormat<int>(int) {
  return "%d";
}
template <> constexpr const char *printfTypeFormat<unsigned>(unsigned) {
  return "%u";
}
template <> constexpr const char *printfTypeFormat<long>(long) {
  return "%ld";
}
template <>
constexpr const char *printfTypeFormat<unsigned long>(unsigned long) {
  return "%lu";
}
} // namespace detail
// Trait mapping an element type to the printf conversion used to print one
// value of that type. Only the specialisations below are defined.
template <typename T> struct TypePrintfFormat;

// Floating-point types: two decimals.
template <> struct TypePrintfFormat<float> {
  static constexpr const char *value{"%.2f"};
};
template <> struct TypePrintfFormat<double> {
  static constexpr const char *value{"%.2f"};
};

// Signed integers up to 32 bits.
template <> struct TypePrintfFormat<int8_t> {
  static constexpr const char *value{"%d"};
};
template <> struct TypePrintfFormat<int16_t> {
  static constexpr const char *value{"%d"};
};
template <> struct TypePrintfFormat<int32_t> {
  static constexpr const char *value{"%d"};
};

// Unsigned integers up to 32 bits.
template <> struct TypePrintfFormat<uint8_t> {
  static constexpr const char *value{"%u"};
};
template <> struct TypePrintfFormat<uint16_t> {
  static constexpr const char *value{"%u"};
};
template <> struct TypePrintfFormat<uint32_t> {
  static constexpr const char *value{"%u"};
};

// 64-bit integers.
// NOTE(review): "%ld"/"%lu" match int64_t/uint64_t only where those are
// long/unsigned long; confirm for every target this header is built on.
template <> struct TypePrintfFormat<int64_t> {
  static constexpr const char *value{"%ld"};
};
template <> struct TypePrintfFormat<uint64_t> {
  static constexpr const char *value{"%lu"};
};

// bool is printed as an integer.
template <> struct TypePrintfFormat<bool> {
  static constexpr const char *value{"%d"};
};
template <typename T>
TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
const char *format) {
constexpr const char *type_printf_format_v = TypePrintfFormat<T>::value;
}; // namespace detail
template <typename T, int Rank, template <class> class PtrTraits,
typename Tindex>
TV_HOST_DEVICE void
printTensorView(const TensorView<T, Rank, PtrTraits, Tindex> &tensor,
const char *format) {
// used to print tensor in cuda kernel.
if (tensor.empty())
return;
if (tensor.ndim() == 0) {
......@@ -1153,51 +1260,69 @@ TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
printf("\n");
return;
}
Shape counter = tensor.shape();
auto tensor_flat = tensor.view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
printf("[");
}
for (size_t i = 0; i < tensor.size(); ++i) {
printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == tensor.dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
SimpleVector<int64_t, TV_MAX_DIM> prev(tensor.ndim(), -1);
SimpleVector<int64_t, TV_MAX_DIM> nd_index(tensor.ndim());
SimpleVector<int64_t, TV_MAX_DIM> shape(tensor.shape());
auto ndim = tensor.ndim();
for (int64_t i = 0; i < tensor.size(); ++i) {
rowArrayIdxInv(i, nd_index.data(), shape.data(), ndim);
bool newline = false;
int end_count = 0;
for (int j = 0; j < ndim; ++j) {
if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
prev[j] != -1) {
printf("]");
++end_count;
newline = true;
}
}
if (print_comma && i != tensor.size() - 1)
printf(", ");
for (int j = 0; j < inc_count; ++j) {
printf("]");
if (prev[0] == -1) {
end_count = ndim;
}
if (newline) {
printf("\n");
}
int starts_count = 0;
for (int j = 0; j < ndim; ++j) {
if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
++starts_count;
}
}
if (i != tensor.size() - 1) {
if (inc_count != 0)
printf("\n");
for (int j = 0; j < inc_count; ++j) {
printf("[");
if (starts_count > 0) {
for (int j = 0; j < ndim - end_count; ++j) {
printf(" ");
}
for (int j = 0; j < starts_count; ++j) {
printf("]");
}
}
printf(format, tensor[i]);
if (nd_index[ndim - 1] != shape[ndim - 1] - 1) {
printf(",");
}
for (int j = 0; j < ndim; ++j) {
prev[j] = nd_index[j];
}
}
for (int j = 0; j < ndim; ++j) {
printf("]");
}
printf("]\n");
printf("\n");
}
template <typename T>
TV_HOST_DEVICE void printTensorView(TensorView<T> tensor) {
template <typename T, int Rank, template <class> class PtrTraits,
typename Tindex>
TV_HOST_DEVICE void
printTensorView(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
using Traw = typename std::remove_const<T>::type;
return printTensorView(tensor, detail::printfTypeFormat<Traw>());
return printTensorView(tensor, detail::type_printf_format_v<Traw>);
}
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
using Traw = typename std::remove_const<T>::type;
return printTensorView(TensorView<const T>(ptr, shape),
detail::printfTypeFormat<Traw>());
detail::type_printf_format_v<Traw>);
}
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
......@@ -1205,7 +1330,7 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
return printTensorView(TensorView<const T>(ptr, shape), format);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
......@@ -1229,20 +1354,25 @@ void check(T result, char const *const func, const char *const file,
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#define checkCudaErrors(val) tv::check((val), #val, __FILE__, __LINE__)
// Copies `size` elements of type T (size * sizeof(T) bytes) from `src` to
// `dst` with cudaMemcpyAsync, using the cudaMemcpyHostToDevice kind on stream
// `s` (default 0).
// The call is wrapped in checkCudaErrors, which stringifies its argument, so
// the expression text below is part of what that macro receives.
template <typename T>
void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToDevice, s));
}
template <typename T>
void host2dev(TensorView<T> dst, const TensorView<const T> src,
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
cudaStream_t s = 0) {
host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void host2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<T, Rank, PtrTraits2, Tindex2> src,
cudaStream_t s = 0) {
host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
......@@ -1250,12 +1380,16 @@ template <typename T> void host2dev_sync(T *dst, const T *src, size_t size) {
checkCudaErrors(
cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyHostToDevice));
}
template <typename T>
void host2dev_sync(TensorView<T> dst, const TensorView<const T> src) {
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<const T, Rank, PtrTraits2, Tindex2> src) {
host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
}
template <typename T>
void host2dev_sync(TensorView<T> dst, const TensorView<T> src) {
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<T, Rank, PtrTraits2, Tindex2> src) {
host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
}
......@@ -1265,14 +1399,18 @@ void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToHost, s));
}
template <typename T>
void dev2host(TensorView<T> dst, const TensorView<const T> src,
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
cudaStream_t s = 0) {
dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void dev2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<T, Rank, PtrTraits2, Tindex2> src,
cudaStream_t s = 0) {
dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
......@@ -1282,13 +1420,18 @@ void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice, s));
}
template <typename T>
void dev2dev(TensorView<T> dst, const TensorView<const T> src,
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
cudaStream_t s = 0) {
dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
// dev2dev (view overload, mutable-element source): same forwarding as the
// const-source overload, copying min(dst.size(), src.size()) elements on
// stream s through the raw-pointer dev2dev.
// NOTE(review): this span is a rendered diff -- the first two lines are the
// pre-change signature, the next five are the signature that replaces it.
template <typename T>
void dev2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<T, Rank, PtrTraits2, Tindex2> src,
cudaStream_t s = 0) {
// Clamp to the smaller view so neither buffer is overrun.
dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
......@@ -1298,67 +1441,39 @@ void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToHost, s));
}
// host2host (view overload, const-element source): forwards to the raw-pointer
// host2host overload, copying min(dst.size(), src.size()) elements on stream s.
// NOTE(review): this span is a rendered diff -- the first two lines are the
// pre-change signature, the next four are the signature that replaces it.
template <typename T>
void host2host(TensorView<T> dst, const TensorView<const T> src,
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
cudaStream_t s = 0) {
// Clamp to the smaller view so neither buffer is overrun.
host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
// host2host (view overload, mutable-element source): same forwarding as the
// const-source overload, copying min(dst.size(), src.size()) elements on
// stream s through the raw-pointer host2host.
// NOTE(review): this span is a rendered diff -- the first two lines are the
// pre-change signature, the next five are the signature that replaces it.
template <typename T>
void host2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
template <typename T, int Rank, template <class> class PtrTraits1,
template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
const TensorView<T, Rank, PtrTraits2, Tindex2> src,
cudaStream_t s = 0) {
// Clamp to the smaller view so neither buffer is overrun.
host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
// zero_dev: sets all tensor.size() * sizeof(T) bytes behind tensor.data() to 0
// with cudaMemset; the call's status is passed to checkCudaErrors.
// NOTE(review): this span is a rendered diff -- the first line is the
// pre-change signature, the next three are the signature that replaces it.
template <typename T> void zero_dev(TensorView<T> tensor) {
template <typename T, int Rank, template <class> class PtrTraits,
typename Tindex>
void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
checkCudaErrors(cudaMemset(tensor.data(), 0, tensor.size() * sizeof(T)));
}
// zero_dev (stream overload): same byte-wise zero fill as the overload without
// a stream, but issued with cudaMemsetAsync on stream s; the call's status is
// passed to checkCudaErrors.
// NOTE(review): this span is a rendered diff -- the first line is the
// pre-change signature, the next three are the signature that replaces it.
template <typename T> void zero_dev(TensorView<T> tensor, cudaStream_t s) {
template <typename T, int Rank, template <class> class PtrTraits,
typename Tindex>
void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor, cudaStream_t s) {
checkCudaErrors(
cudaMemsetAsync(tensor.data(), 0, tensor.size() * sizeof(T), s));
}
// zero_host: assigns 0 to each of the tensor.size() elements starting at
// tensor.data() using std::fill (element-wise, not a byte-wise memset).
// NOTE(review): this span is a rendered diff -- the first line is the
// pre-change signature, the next three are the signature that replaces it.
template <typename T> void zero_host(TensorView<T> tensor) {
template <typename T, int Rank, template <class> class PtrTraits,
typename Tindex>
void zero_host(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
std::fill(tensor.data(), tensor.data() + tensor.size(), 0);
}
#endif
namespace detail {
// Compile-time lookup from a C++ scalar type to a short printable name,
// exposed as the static member `value`. The primary template is only
// declared, so just the types specialized below can be queried.
template <typename T> struct TypeToString;

// Boolean and floating-point types.
template <> struct TypeToString<bool> {
  static constexpr const char *value = "bool";
};
template <> struct TypeToString<float> {
  static constexpr const char *value = "float";
};
template <> struct TypeToString<double> {
  static constexpr const char *value = "double";
};

// Signed integers, narrowest first.
template <> struct TypeToString<int8_t> {
  static constexpr const char *value = "int8";
};
template <> struct TypeToString<int16_t> {
  static constexpr const char *value = "int16";
};
template <> struct TypeToString<int32_t> {
  static constexpr const char *value = "int32";
};
template <> struct TypeToString<int64_t> {
  static constexpr const char *value = "int64";
};

// Unsigned integers, narrowest first.
template <> struct TypeToString<uint8_t> {
  static constexpr const char *value = "uint8";
};
template <> struct TypeToString<uint16_t> {
  static constexpr const char *value = "uint16";
};
template <> struct TypeToString<uint32_t> {
  static constexpr const char *value = "uint32";
};
template <> struct TypeToString<uint64_t> {
  static constexpr const char *value = "uint64";
};
} // namespace detail
} // namespace tv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#ifdef TV_CUDA
#include <cuda_runtime_api.h>
#endif
#include <iostream>
namespace tv {
#ifdef TV_CUDA
/// Interval timer that calls cudaDeviceSynchronize() before every clock read.
///
/// Construction synchronizes and records the start point. Each report()
/// synchronizes again, returns the time elapsed since the previous start
/// point (converted to TimeT) and makes "now" the new start point.
///
/// NOTE(review): the status returned by cudaDeviceSynchronize() is not
/// inspected here, so a failed synchronization goes unnoticed by the timer.
///
/// @tparam TimeT std::chrono duration type used for the reported value.
template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
  CudaContextTimer() {
    cudaDeviceSynchronize();
    mCurTime = std::chrono::steady_clock::now();
  }

  /// @return elapsed time since construction or the previous report(), as a
  ///         tick count of TimeT.
  typename TimeT::rep report() {
    cudaDeviceSynchronize();
    // Read the clock once and use that single reading both as the end of this
    // interval and the start of the next one; reading it twice would drop the
    // time between the two reads from every measurement.
    const auto now = std::chrono::steady_clock::now();
    const auto elapsed = std::chrono::duration_cast<TimeT>(now - mCurTime);
    mCurTime = now;
    return elapsed.count();
  }

private:
  /// Start point of the interval currently being measured.
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
#endif
/// Host-side interval timer based on std::chrono::steady_clock.
///
/// Construction records the start point. Each report() returns the time
/// elapsed since the previous start point (converted to TimeT) and makes
/// "now" the new start point, so consecutive reports measure back-to-back
/// intervals.
///
/// @tparam TimeT std::chrono duration type used for the reported value.
template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
  CPUTimer() : mCurTime(std::chrono::steady_clock::now()) {}

  /// @return elapsed time since construction or the previous report(), as a
  ///         tick count of TimeT.
  typename TimeT::rep report() {
    // Read the clock once and use that single reading both as the end of this
    // interval and the start of the next one; reading it twice would drop the
    // time between the two reads from every measurement.
    const auto now = std::chrono::steady_clock::now();
    const auto elapsed = std::chrono::duration_cast<TimeT>(now - mCurTime);
    mCurTime = now;
    return elapsed.count();
  }

private:
  /// Start point of the interval currently being measured.
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
} // namespace tv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "mp_helper.h"
#include <tensorview/tensorview.h>
#include <ATen/ATen.h>
#include <torch/script.h>
#ifdef TV_CUDA
#include <ATen/cuda/CUDAContext.h>
#endif
namespace tv {
#ifdef TV_CUDA
struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream();
}
};
#endif
namespace detail {
template <typename T> struct TypeToTorchDtypeTraits;
template <> struct TypeToTorchDtypeTraits<int32_t> {
static constexpr decltype(torch::kInt32) value = torch::kInt32;
};
template <> struct TypeToTorchDtypeTraits<int16_t> {
static constexpr decltype(torch::kInt32) value = torch::kInt16;
};
template <> struct TypeToTorchDtypeTraits<int8_t> {
static constexpr decltype(torch::kInt8) value = torch::kInt8;
};
template <> struct TypeToTorchDtypeTraits<int64_t> {
static constexpr decltype(torch::kInt32) value = torch::kInt64;
};
template <> struct TypeToTorchDtypeTraits<uint8_t> {
static constexpr decltype(torch::kInt32) value = torch::kUInt8;
};
template <> struct TypeToTorchDtypeTraits<bool> {
static constexpr decltype(torch::kInt32) value = torch::kBool;
};
template <> struct TypeToTorchDtypeTraits<float> {
static constexpr decltype(torch::kInt32) value = torch::kFloat32;
};
template <> struct TypeToTorchDtypeTraits<double> {
static constexpr decltype(torch::kInt32) value = torch::kFloat64;
};
template <> struct TypeToTorchDtypeTraits<at::Half> {
static constexpr decltype(torch::kInt32) value = torch::kHalf;
};
using all_torch_types_t = std::tuple<float, double, int8_t, int16_t, int32_t,
int64_t, uint8_t, bool, at::Half>;
} // namespace detail
/// Variable-template shorthand for detail::TypeToTorchDtypeTraits<T>::value,
/// i.e. the torch dtype constant registered for element type T.
template <typename T>
constexpr decltype(torch::kInt32) torch_type_v{
    detail::TypeToTorchDtypeTraits<T>::value};
/// Runtime-to-compile-time dtype dispatch.
///
/// Walks the candidate types Ts... and, for each one whose registered torch
/// dtype equals `t`, invokes `f` with a default-constructed value of that
/// type (the value only carries the type; read it with decltype). If no
/// candidate matches, raises via TV_THROW_RT_ERR with the names of all
/// candidates.
///
/// @tparam Ts candidate element types; at least one is required.
/// @param t   dtype to look for.
/// @param f   generic callable taking one argument of the matched type.
template <class... Ts, typename F>
void dispatch_torch(at::ScalarType t, F &&f) {
  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
  using candidates_t = mp_list<Ts...>;

  bool matched = false;
  tv::mp_for_each<candidates_t>([=, &matched, &f](auto tag) {
    using candidate_t = decltype(tag);
    if (detail::TypeToTorchDtypeTraits<candidate_t>::value == t) {
      std::forward<F>(f)(candidate_t());
      matched = true;
    }
  });
  if (matched) {
    return;
  }

  // Nothing matched: collect the printable names of every candidate so the
  // error tells the caller which dtypes would have been accepted.
  std::stringstream names;
  tv::mp_for_each<candidates_t>([=, &names](auto tag) {
    names << tv::detail::TypeToString<decltype(tag)>::value << " ";
  });
  TV_THROW_RT_ERR("unknown type", t, ", available:", names.str());
}
/// Functor form of dispatch_torch whose candidate types come from a type
/// list: DispatchTorch<L<A, B, ...>>()(t, f) calls dispatch_torch<A, B, ...>(t, f).
/// Only the specialization for a template instantiation L<Args...> is defined.
template <class T> struct DispatchTorch;

template <template <class...> class T, class... Args>
struct DispatchTorch<T<Args...>> {
  /// Unpacks Args... and forwards both arguments to dispatch_torch.
  template <typename F> void operator()(at::ScalarType t, F &&f) {
    dispatch_torch<Args...>(t, std::forward<F>(f));
  }
};
/// Verifies that `tensor` stores elements of type T (cv-qualifiers on T are
/// ignored).
///
/// Dispatches on tensor.scalar_type() over detail::all_torch_types_t and
/// asserts, through TV_ASSERT_RT_ERR, that the C++ type registered for that
/// dtype is std::remove_cv_t<T>.
template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
  using expected_t = std::remove_cv_t<T>;
  auto verify = [&](auto I) {
    // `I` only carries the element type selected for the tensor's dtype.
    constexpr bool val = std::is_same<expected_t, decltype(I)>::value;
    TV_ASSERT_RT_ERR(val, "error");
  };
  DispatchTorch<detail::all_torch_types_t>()(tensor.scalar_type(), verify);
}
/// Wraps the storage of a torch tensor in a non-owning tv::TensorView.
///
/// The view is built from the tensor's data pointer and its sizes only, so it
/// does not keep the tensor alive and carries no stride information.
///
/// Checks performed before the view is created:
///  - the tensor's dtype must correspond to T (see check_torch_dtype);
///  - the tensor must be contiguous, because strides are not forwarded;
///  - when Rank > 0, tensor.dim() must equal Rank.
///
/// @tparam T         element type of the view; may be const-qualified.
/// @tparam Rank      expected number of dimensions; values <= 0 skip the
///                   dimension check.
/// @tparam PtrTraits pointer traits forwarded to TensorView.
/// @tparam Tindex    index type forwarded to TensorView.
/// @param tensor     tensor whose storage is viewed.
/// @return a TensorView over tensor's data with tensor's sizes as shape.
template <typename T, int Rank = -1,
          template <class> class PtrTraits = DefaultPtrTraits,
          typename Tindex = int>
TensorView<T, Rank, PtrTraits, Tindex> torch2tv(const torch::Tensor &tensor) {
  using tv_shape_t =
      typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
  check_torch_dtype<T>(tensor);
  // Strides are not passed on to the view (only pointer + shape are), so a
  // non-contiguous tensor would be read with the wrong element layout.
  // Reject it up front instead of silently producing a wrong view.
  TV_ASSERT_INVALID_ARG(
      tensor.is_contiguous(),
      "torch2tv requires a contiguous tensor, call .contiguous() first");
  if (Rank > 0) {
    TV_ASSERT_INVALID_ARG(tensor.dim() == Rank, "error");
  }
  tv_shape_t shape;
  for (auto i : tensor.sizes()) {
    shape.push_back(i);
  }
  // data_ptr<> is requested with the non-const element type; constness is
  // reintroduced by the TensorView<T, ...> being constructed.
  return tv::TensorView<T, Rank, PtrTraits, Tindex>(
      tensor.data_ptr<std::remove_const_t<T>>(), shape);
}
namespace detail {
// Printable name for torch's half-precision element type; this is the name
// dispatch_torch prints for at::Half in its list of available types.
template <> struct TypeToString<at::Half> {
  static constexpr const char *value = "half";
};
} // namespace detail
} // namespace tv
\ No newline at end of file
......@@ -13,18 +13,18 @@
// limitations under the License.
#pragma once
#include <spconv/mp_helper.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <ATen/ATen.h>
#include <torch/script.h>
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
#include <ATen/cuda/CUDAContext.h>
#endif
namespace tv {
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream();
......@@ -103,10 +103,10 @@ template <> struct TypeToString<at::Half> {
};
} // namespace detail
template <class... Ts, typename F>
void torch_dispatch(at::ScalarType t, F &&f) {
void dispatch_torch(at::ScalarType t, F &&f) {
static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
bool notFound = true;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
spconv::tv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
if (torch_type_v<decltype(I)> == t) {
std::forward<F>(f)(decltype(I)());
notFound = false;
......@@ -114,7 +114,7 @@ void torch_dispatch(at::ScalarType t, F &&f) {
});
if (notFound) {
std::stringstream ss;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
spconv::tv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
ss << tv::detail::TypeToString<decltype(I)>::value << " ";
});
TV_THROW_RT_ERR("unknown type", t, ", available: ", ss.str());
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment