format code with clang-format, better c++ code

19e73bbe · Yan Yan · c336139f · 19e73bbe · 19e73bbe · 19e73bbe
Commit 19e73bbe authored May 20, 2020 by Yan Yan
20 changed files
--- a/include/spconv/pillar_scatter_functor.h
+++ b/include/spconv/pillar_scatter_functor.h
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,14 +16,11 @@
 #define POINTPILLARS_SCATTER_FUNCTOR_H_
 #include <tensorview/tensorview.h>

-namespace spconv
-{
-namespace functor
-{
+namespace spconv {
+namespace functor {
 template <typename Device, typename T, typename Index>
-struct PointPillarScatter
-{
-    void operator()(const Device& d, tv::TensorView<T> canvas,
+struct PointPillarScatter {
+  void operator()(const Device &d, tv::TensorView<T> canvas,
                  tv::TensorView<const T> features,
                  tv::TensorView<const T> coors);
 };

--- a/include/spconv/pillar_scatter_ops.h
+++ b/include/spconv/pillar_scatter_ops.h
@@ -16,8 +16,8 @@
 #define PILLAR_SCATTER_OP_H_

 #include <spconv/pillar_scatter_functor.h>
+#include <tensorview/torch_utils.h>
 #include <torch/script.h>
-#include <torch_utils.h>
 #include <utility/timer.h>

 namespace spconv {
@@ -42,9 +42,10 @@ torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
      torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
                   features.options());
  TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
  functor::PointPillarScatter<tv::GPU, T, int> ftor;
-  ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas), tv::torch2tv<const T>(features.squeeze()),
+  ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
+       tv::torch2tv<const T>(features.squeeze()),
       tv::torch2tv<const T>(coors.squeeze()));
 #endif
  return canvas;

--- a/include/spconv/point2voxel.h
+++ b/include/spconv/point2voxel.h
@@ -29,7 +29,8 @@ using namespace pybind11::literals;

 template <typename DType, int NDim>
 int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
-                          py::array_t<DType> voxel_point_mask, py::array_t<int> coors,
+                          py::array_t<DType> voxel_point_mask,
+                          py::array_t<int> coors,
                          py::array_t<int> num_points_per_voxel,
                          py::array_t<int> coor_to_voxelidx,
                          std::vector<DType> voxel_size,
@@ -94,14 +95,12 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
 }

 template <typename DType, int NDim>
-int points_to_voxel_3d_np_mean(py::array_t<DType> points,
-                               py::array_t<DType> voxel_point_mask, py::array_t<DType> voxels,
-                               py::array_t<DType> means, py::array_t<int> coors,
-                               py::array_t<int> num_points_per_voxel,
-                               py::array_t<int> coor_to_voxelidx,
-                               std::vector<DType> voxel_size,
-                               std::vector<DType> coors_range, int max_points,
-                               int max_voxels) {
+int points_to_voxel_3d_np_mean(
+    py::array_t<DType> points, py::array_t<DType> voxel_point_mask,
+    py::array_t<DType> voxels, py::array_t<DType> means, py::array_t<int> coors,
+    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
+    std::vector<DType> voxel_size, std::vector<DType> coors_range,
+    int max_points, int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto means_rw = means.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
@@ -174,8 +173,8 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points,
 template <typename DType, int NDim>
 int points_to_voxel_3d_with_filtering(
    py::array_t<DType> points, py::array_t<DType> voxels,
-    py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask, py::array_t<DType> mins,
-    py::array_t<DType> maxs, py::array_t<int> coors,
+    py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
+    py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels, int block_factor, int block_size,

--- a/include/spconv/pool_ops.h
+++ b/include/spconv/pool_ops.h
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,14 +16,14 @@
 #define SPARSE_POOL_OP_H_

 #include <spconv/maxpool.h>
+#include <tensorview/torch_utils.h>
 #include <torch/script.h>
-#include <torch_utils.h>
 #include <utility/timer.h>

 namespace spconv {
 template <typename T>
 torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
-                          torch::Tensor indiceNum, int64_t numAct) {
+                            torch::Tensor indiceNum, int64_t numAct) {
  auto device = features.device().type();
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
@@ -43,8 +43,8 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
      forwardFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i), nHot);
-    } 
-#ifdef SPCONV_CUDA
+    }
+#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor;
      forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
@@ -53,7 +53,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
      TV_CHECK_CUDA_ERR();
    }
 #endif
-    else{
+    else {
      TV_ASSERT_INVALID_ARG(false, "unknown device type");
    }
    // totalTime += timer.report() / 1000.0;
@@ -63,17 +63,17 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
 }

 template <typename T>
-torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
-                                  torch::Tensor outFeatures,
-                                  torch::Tensor outGrad, torch::Tensor indicePairs,
-                                  torch::Tensor indiceNum) {
+torch::Tensor
+indiceMaxPoolBackward(torch::Tensor features, torch::Tensor outFeatures,
+                      torch::Tensor outGrad, torch::Tensor indicePairs,
+                      torch::Tensor indiceNum) {
  auto device = features.device().type();
  auto numInPlanes = features.size(1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
-    auto kernelVolume = indicePairs.size(0);
+  auto kernelVolume = indicePairs.size(0);
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
@@ -85,8 +85,8 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                   tv::torch2tv<const T>(features),
                   tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
                   tv::torch2tv<const int>(indicePairs).subview(i), nHot);
-    } 
-#ifdef SPCONV_CUDA
+    }
+#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor;
      backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures),
@@ -96,10 +96,9 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
      TV_CHECK_CUDA_ERR();
    }
 #endif
-    else{
+    else {
      TV_ASSERT_INVALID_ARG(false, "unknown device type");
    }
-
  }
  return inputGrad;
 }

--- a/include/spconv/reordering.cu.h
+++ b/include/spconv/reordering.cu.h
@@ -14,7 +14,7 @@

 #ifndef REORDERING_CU_H_
 #define REORDERING_CU_H_
-#include <tensorview/helper_kernel.cu.h>
+#include <tensorview/kernel_utils.h>

 // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
 namespace spconv {

--- a/include/spconv/reordering.h
+++ b/include/spconv/reordering.h
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,23 +16,21 @@
 #define SPARSE_REORDERING_FUNCTOR_H_
 #include <tensorview/tensorview.h>

-namespace spconv
-{
-namespace functor
-{
+namespace spconv {
+namespace functor {
 template <typename Device, typename T, typename Index>
-struct SparseGatherFunctor
-{
-    void operator()(const Device& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
-                    tv::TensorView<const Index> indices, int size);
+struct SparseGatherFunctor {
+  void operator()(const Device &d, tv::TensorView<T> buffer,
+                  tv::TensorView<const T> features,
+                  tv::TensorView<const Index> indices, int size);
 };

 template <typename Device, typename T, typename Index>
-struct SparseScatterAddFunctor
-{
-    void operator()(const Device& d, tv::TensorView<T> out_features,
-                    tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
-                    int size, bool stable=false);
+struct SparseScatterAddFunctor {
+  void operator()(const Device &d, tv::TensorView<T> out_features,
+                  tv::TensorView<const T> buffer,
+                  tv::TensorView<const Index> indices, int size,
+                  bool stable = false);
 };
 } // namespace functor
 } // namespace spconv

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -17,8 +17,8 @@

 #include <spconv/indice.h>
 #include <spconv/reordering.h>
+#include <tensorview/torch_utils.h>
 #include <torch/script.h>
-#include <torch_utils.h>
 #include <utility/timer.h>

 namespace spconv {
@@ -101,7 +101,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, false, useHash);
    }
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
    else if (indices.device().type() == torch::kCUDA) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
@@ -149,7 +149,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose);
    }
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
    else if (indices.device().type() == torch::kCUDA) {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
@@ -269,7 +269,7 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
          dilation32, outSpatialShape32, transpose);
      gridOut.fill_(-1);
    }
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
    else if (indices.device().type() == torch::kCUDA) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
@@ -299,7 +299,7 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
          transpose, true);
      gridOut.fill_(-1);
    }
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
    else if (indices.device().type() == torch::kCUDA) {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();

--- a/include/tensorview/common.h
+++ b/include/tensorview/common.h
+// Copyright 2019-2020 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#ifdef TV_USE_STACKTRACE
+// Select the Boost.Stacktrace backend before including its header. The whole
+// Windows test is parenthesized so the Cygwin exclusion applies to every
+// Windows macro, not only to __WIN32 ('&&' binds tighter than '||').
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32)) &&                 \
+    !defined(__CYGWIN__)
+#define BOOST_STACKTRACE_USE_WINDBG
+#else
+// Requires linking with -ldl and -lbacktrace on Linux.
+#define BOOST_STACKTRACE_USE_BACKTRACE
+#endif
+#include <boost/stacktrace.hpp>
+#endif
+
+namespace tv {
+
+// Terminal overload: streams the last (or only) value with no trailing
+// separator.
+template <class SStream, class T>
+void sstream_print(SStream &ss, T val) {
+  ss << val;
+}
+
+// Recursive overload: streams the first value followed by a single space, then
+// hands the remaining values to the next sstream_print overload.
+template <class SStream, class T, class... TArgs>
+void sstream_print(SStream &ss, T val, TArgs... args) {
+  ss << val;
+  ss << " ";
+  sstream_print(ss, args...);
+}
+
+// Formats all arguments, separated by single spaces, into a temporary buffer
+// and writes the result to std::cout as one line (std::endl also flushes).
+template <class... TArgs>
+void ssprint(TArgs... args) {
+  std::stringstream buffer;
+  sstream_print(buffer, args...);
+  const auto line = buffer.str();
+  std::cout << line << std::endl;
+}
+// TV_BACKTRACE_PRINT(ss): appends a newline and a boost stacktrace to ss when TV_USE_STACKTRACE is defined; expands to nothing otherwise.
+#ifdef TV_USE_STACKTRACE
+#define TV_BACKTRACE_PRINT(ss)                                                 \
+  ss << std::endl << boost::stacktrace::stacktrace();
+#else
+#define TV_BACKTRACE_PRINT(ss)
+#endif
+// TV_THROW_RT_ERR(...): throws std::runtime_error; the message is "<file> <line>", a newline, then the space-separated arguments and the optional backtrace.
+#define TV_THROW_RT_ERR(...)                                                   \
+  {                                                                            \
+    std::stringstream __macro_s;                                               \
+    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
+    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
+    TV_BACKTRACE_PRINT(__macro_s);                                             \
+    throw std::runtime_error(__macro_s.str());                                 \
+  }
+// TV_THROW_INVALID_ARG(...): same message layout as TV_THROW_RT_ERR, but throws std::invalid_argument.
+#define TV_THROW_INVALID_ARG(...)                                              \
+  {                                                                            \
+    std::stringstream __macro_s;                                               \
+    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
+    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
+    TV_BACKTRACE_PRINT(__macro_s);                                             \
+    throw std::invalid_argument(__macro_s.str());                              \
+  }
+// TV_ASSERT_RT_ERR(expr, ...): when expr is false, throws std::runtime_error; the message puts the stringified expr before the arguments.
+#define TV_ASSERT_RT_ERR(expr, ...)                                            \
+  {                                                                            \
+    if (!(expr)) {                                                             \
+      std::stringstream __macro_s;                                             \
+      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
+      __macro_s << #expr << " assert faild. ";                                 \
+      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
+      TV_BACKTRACE_PRINT(__macro_s);                                           \
+      throw std::runtime_error(__macro_s.str());                               \
+    }                                                                          \
+  }
+// TV_ASSERT_INVALID_ARG(expr, ...): same check and message as TV_ASSERT_RT_ERR, but throws std::invalid_argument.
+#define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
+  {                                                                            \
+    if (!(expr)) {                                                             \
+      std::stringstream __macro_s;                                             \
+      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
+      __macro_s << #expr << " assert faild. ";                                 \
+      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
+      TV_BACKTRACE_PRINT(__macro_s);                                           \
+      throw std::invalid_argument(__macro_s.str());                            \
+    }                                                                          \
+  }
+} // namespace tv
\ No newline at end of file
--- a/include/tensorview/cuda_utils.h
+++ b/include/tensorview/cuda_utils.h
+#pragma once
+// from pytorch.aten
+#include "tensorview.h"
+#include <type_traits>
+namespace tv {
+namespace cuda {
+
+// Ceiling division: for positive integer operands, returns the smallest q with
+// q * b >= a. The quotient is converted to int on return.
+template <typename T1, typename T2>
+inline int DivUp(const T1 a, const T2 b) {
+  const auto numerator = a + b - 1;
+  return numerator / b;
+}
+
+// Use 1024 threads per block, which requires cuda sm_2x or above
+constexpr int CUDA_NUM_THREADS = 1024;
+// Threads per block for N work items: N rounded up to a multiple of 32, capped
+// at CUDA_NUM_THREADS.
+
+inline int getNumThreads(const int N) {
+  return N > CUDA_NUM_THREADS ? CUDA_NUM_THREADS : DivUp(N, 32) * 32;
+}
+
+// Number of blocks needed to cover N work items when every block runs
+// getNumThreads(N) threads. A non-positive N is rejected through
+// TV_ASSERT_RT_ERR.
+inline int getBlocks(const int N) {
+  TV_ASSERT_RT_ERR(N > 0,
+                   "CUDA kernel launch blocks must be positive, but got N=", N);
+  const int threads_per_block = getNumThreads(N);
+  return DivUp(N, threads_per_block);
+}
+
+} // namespace cuda
+
+} // namespace tv
\ No newline at end of file
--- a/include/tensorview/eigen_utils.h
+++ b/include/tensorview/eigen_utils.h
+// Copyright 2019-2020 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "tensor.h"
+#include "tensorview.h"
+#include <eigen3/Eigen/Dense>
+
+namespace tv {
+
+// Wraps a 1-D or 2-D TensorView as a row-major Eigen::Map over view.data().
+// A 1-D view of length N is mapped as a single row (1 x N). When Row or Col is
+// not Eigen::Dynamic, the mapped extents must equal them. Every violated
+// precondition is reported through TV_ASSERT_INVALID_ARG.
+template <typename T, int Row = Eigen::Dynamic, int Col = Eigen::Dynamic>
+Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>>
+tv2eigen(TensorView<T> view) {
+  TV_ASSERT_INVALID_ARG(view.ndim() <= 2 && view.ndim() > 0, "error");
+  // Resolve the extents first: a 1-D view has no dim(1), so its only extent is
+  // the column count and the row count is fixed at 1.
+  const bool is_2d = view.ndim() == 2;
+  const int rows = is_2d ? view.dim(0) : 1;
+  const auto cols = is_2d ? view.dim(1) : view.dim(0);
+  if (Row != Eigen::Dynamic) {
+    TV_ASSERT_INVALID_ARG(rows == Row, "error");
+  }
+  if (Col != Eigen::Dynamic) {
+    TV_ASSERT_INVALID_ARG(cols == Col, "error");
+  }
+  return Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>>(view.data(),
+                                                                 rows, cols);
+}
+
+} // namespace tv
--- a/include/tensorview/helper_launch.h
+++ b/include/tensorview/helper_launch.h
-#pragma once
-// from pytorch.aten
-#include "tensorview.h"
-namespace tv
-{
-namespace launch
-{
-
-template <typename T1, typename T2>
-inline int DivUp(const T1 a, const T2 b) { return (a + b - 1) / b; }
-
-// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
-// CUDA: number of blocks for threads.
-inline int getBlocks(const int N)
-{
-    TV_ASSERT_RT_ERR(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
-    return DivUp(N, CUDA_NUM_THREADS);
-}
-} // namespace launch
-} // namespace tv
\ No newline at end of file
--- a/include/tensorview/helper_kernel.cu.h
+++ b/include/tensorview/helper_kernel.cu.h
 #pragma once
 // from tensorflow
-namespace tv
-{
-namespace detail
-{
+namespace tv {
+namespace detail {

-template <typename T>
-class KernelLoop
-{
-  struct Iterator
-  {
-    __forceinline__ __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
+template <typename T> class KernelLoop {
+  struct Iterator {
+    __forceinline__ __device__ Iterator(T index, T delta)
+        : index_(index), delta_(delta) {}
    __forceinline__ __device__ T operator*() const { return index_; }
-    __forceinline__ __device__ Iterator &operator++()
-    {
+    __forceinline__ __device__ Iterator &operator++() {
      index_ += delta_;
      return *this;
    }
-    __forceinline__ __device__ bool operator!=(const Iterator &other) const
-    {
+    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
-      if (!other.delta_)
-      {
+      if (!other.delta_) {
        return less;
      }
-      if (!delta_)
-      {
+      if (!delta_) {
        return greater;
      }
      return less || greater;
@@ -43,7 +35,9 @@ public:
  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

-  __forceinline__ __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
+  __forceinline__ __device__ Iterator begin() const {
+    return Iterator{begin_, delta_};
+  }
  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
@@ -53,29 +47,26 @@ private:
 };

 } // namespace detail
-template <typename T, int NumILP=1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count)
-{
+template <typename T, int NumILP = 1>
+__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
  return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
-                                  gridDim.x * blockDim.x * NumILP, count);
+                               gridDim.x * blockDim.x * NumILP, count);
 }

 // Helper to visit indices in the range 0 <= i < count using the y-coordinate.
 // Usage: for(int i : KernelLoopY(count)) { visit(i); }
-template <typename T, int NumILP=1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count)
-{
+template <typename T, int NumILP = 1>
+__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
  return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
-                                  gridDim.y * blockDim.y * NumILP, count);
+                               gridDim.y * blockDim.y * NumILP, count);
 }

 // Helper to visit indices in the range 0 <= i < count using the z-coordinate.
 // Usage: for(int i : KernelLoopZ(count)) { visit(i); }
-template <typename T, int NumILP=1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count)
-{
+template <typename T, int NumILP = 1>
+__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
  return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
-                                  gridDim.z * blockDim.z * NumILP, count);
+                               gridDim.z * blockDim.z * NumILP, count);
 }

 } // namespace tv
\ No newline at end of file
--- a/include/spconv/mp_helper.h
+++ b/include/spconv/mp_helper.h
@@ -3,7 +3,7 @@
 #include <type_traits>
 #include <utility>

-namespace spconv {
+namespace tv {
 template <class... T> struct mp_list {};

 template <class T, T... I>
@@ -11,9 +11,10 @@ using mp_list_c = mp_list<std::integral_constant<T, I>...>;

 namespace detail {

-template <class... T, class F>
-constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
-  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
+template <class... Ts, class F>
+constexpr F mp_for_each_impl(mp_list<Ts...>, F &&f) {
+  return (void)(std::initializer_list<int>{(f(Ts()), 0)...}),
+         std::forward<F>(f);
 }

 template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
@@ -42,6 +43,6 @@ using mp_rename = typename detail::mp_rename_impl<A, B>::type;
 template <class L, class F> constexpr F mp_for_each(F &&f) {
  return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
 }
-} // namespace spconv
+} // namespace tv

 #endif
\ No newline at end of file
--- a/include/tensorview/prettyprint.h
+++ b/include/tensorview/prettyprint.h
+//          Copyright Louis Delacroix 2010 - 2014.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+//
+// A pretty printing library for C++
+//
+// Usage:
+// Include this header, and operator<< will "just work".
+
+#ifndef H_PRETTY_PRINT
+#define H_PRETTY_PRINT
+
+#include <cstddef>
+#include <iterator>
+#include <memory>
+#include <ostream>
+#include <set>
+#include <tuple>
+#include <type_traits>
+#include <unordered_set>
+#include <utility>
+#include <valarray>
+
+namespace pretty_print {
+namespace detail {
+// Shared yes/no result types for the SFINAE probes: sizeof(yes) != sizeof(no).
+
+struct sfinae_base {
+  using yes = char;
+  using no = yes[2];
+};
+// SFINAE type trait: `value` is true when T::const_iterator names a type.
+template <typename T> struct has_const_iterator : private sfinae_base {
+private:
+  template <typename C> static yes &test(typename C::const_iterator *);
+  template <typename C> static no &test(...);
+
+public:
+  static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
+  using type = T;
+};
+// SFINAE trait: beg_value / end_value are true when T has a const begin() / end() member returning T::const_iterator.
+template <typename T> struct has_begin_end : private sfinae_base {
+private:
+  template <typename C>
+  static yes &
+  f(typename std::enable_if<
+      std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
+                                            const>(&C::begin)),
+                   typename C::const_iterator (C::*)() const>::value>::type *);
+
+  template <typename C> static no &f(...);
+
+  template <typename C>
+  static yes &
+  g(typename std::enable_if<
+      std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
+                                            const>(&C::end)),
+                   typename C::const_iterator (C::*)() const>::value,
+      void>::type *);
+
+  template <typename C> static no &g(...);
+
+public:
+  static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
+  static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
+};
+
+} // namespace detail
+
+// Holds the three delimiter strings for one character type: `prefix` is
+// written before the elements, `delimiter` between them, `postfix` after them.
+template <typename TChar> struct delimiters_values {
+  using char_type = TChar;
+  const char_type *prefix;
+  const char_type *delimiter;
+  const char_type *postfix;
+};
+
+// Associates a container type T and a character type TChar with the
+// delimiters_values used to print it; `values` is only declared here.
+template <typename T, typename TChar> struct delimiters {
+  using type = delimiters_values<TChar>;
+  static const type values;
+};
+
+// Functor to print containers. You can use this directly if you want
+// to specify a non-default delimiters type. The printing logic can
+// be customized by specializing the nested template.
+
+template <typename T, typename TChar = char,
+          typename TCharTraits = ::std::char_traits<TChar>,
+          typename TDelimiters = delimiters<T, TChar>>
+struct print_container_helper {
+  using delimiters_type = TDelimiters;
+  using ostream_type = std::basic_ostream<TChar, TCharTraits>;
+
+  template <typename U> struct printer {
+    static void print_body(const U &c, ostream_type &stream) {
+      using std::begin;
+      using std::end;
+
+      auto it = begin(c);
+      const auto the_end = end(c);
+
+      if (it != the_end) {
+        for (;;) {
+          stream << *it;
+
+          if (++it == the_end)
+            break;
+
+          if (delimiters_type::values.delimiter != NULL)
+            stream << delimiters_type::values.delimiter;
+        }
+      }
+    }
+  };
+
+  print_container_helper(const T &container) : container_(container) {}
+
+  inline void operator()(ostream_type &stream) const {
+    if (delimiters_type::values.prefix != NULL)
+      stream << delimiters_type::values.prefix;
+
+    printer<T>::print_body(container_, stream);
+
+    if (delimiters_type::values.postfix != NULL)
+      stream << delimiters_type::values.postfix;
+  }
+
+private:
+  const T &container_;
+};
+
+// Specialization for pairs
+
+template <typename T, typename TChar, typename TCharTraits,
+          typename TDelimiters>
+template <typename T1, typename T2>
+struct print_container_helper<T, TChar, TCharTraits,
+                              TDelimiters>::printer<std::pair<T1, T2>> {
+  using ostream_type =
+      typename print_container_helper<T, TChar, TCharTraits,
+                                      TDelimiters>::ostream_type;
+
+  static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
+    stream << c.first;
+    if (print_container_helper<T, TChar, TCharTraits,
+                               TDelimiters>::delimiters_type::values
+            .delimiter != NULL)
+      stream << print_container_helper<T, TChar, TCharTraits,
+                                       TDelimiters>::delimiters_type::values
+                    .delimiter;
+    stream << c.second;
+  }
+};
+
+// Specialization for tuples
+
+template <typename T, typename TChar, typename TCharTraits,
+          typename TDelimiters>
+template <typename... Args>
+struct print_container_helper<T, TChar, TCharTraits,
+                              TDelimiters>::printer<std::tuple<Args...>> {
+  using ostream_type =
+      typename print_container_helper<T, TChar, TCharTraits,
+                                      TDelimiters>::ostream_type;
+  using element_type = std::tuple<Args...>;
+
+  template <std::size_t I> struct Int {};
+
+  static void print_body(const element_type &c, ostream_type &stream) {
+    tuple_print(c, stream, Int<0>());
+  }
+
+  static void tuple_print(const element_type &, ostream_type &,
+                          Int<sizeof...(Args)>) {}
+
+  static void
+  tuple_print(const element_type &c, ostream_type &stream,
+              typename std::conditional<sizeof...(Args) != 0, Int<0>,
+                                        std::nullptr_t>::type) {
+    stream << std::get<0>(c);
+    tuple_print(c, stream, Int<1>());
+  }
+
+  template <std::size_t N>
+  static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
+    if (print_container_helper<T, TChar, TCharTraits,
+                               TDelimiters>::delimiters_type::values
+            .delimiter != NULL)
+      stream << print_container_helper<T, TChar, TCharTraits,
+                                       TDelimiters>::delimiters_type::values
+                    .delimiter;
+
+    stream << std::get<N>(c);
+
+    tuple_print(c, stream, Int<N + 1>());
+  }
+};
+
+// Writes a print_container_helper to `stream` by invoking the helper's call
+// operator on it, then returns `stream` so insertions can be chained.
+template <typename T, typename TChar, typename TCharTraits,
+          typename TDelimiters>
+inline std::basic_ostream<TChar, TCharTraits> &operator<<(
+    std::basic_ostream<TChar, TCharTraits> &stream,
+    const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
+  helper(stream);
+  return stream;
+}
+
+// Primary is_container trait: true when T has a nested const_iterator type and
+// const begin()/end() members returning it. Specialize it to derive from
+// std::true_type for any other type that should be treated as a container.
+template <typename T>
+struct is_container
+    : public std::integral_constant<bool,
+                                    detail::has_const_iterator<T>::value &&
+                                        detail::has_begin_end<T>::beg_value &&
+                                        detail::has_begin_end<T>::end_value> {};
+
+template <typename T, std::size_t N>
+struct is_container<T[N]> : std::true_type {};
+
+template <std::size_t N> struct is_container<char[N]> : std::false_type {};
+
+template <typename T> struct is_container<std::valarray<T>> : std::true_type {};
+
+template <typename T1, typename T2>
+struct is_container<std::pair<T1, T2>> : std::true_type {};
+
+template <typename... Args>
+struct is_container<std::tuple<Args...>> : std::true_type {};
+
+// Default delimiters: used for every type that has no more specific
+// specialization. The three strings are "[", ", " and "]" (wide versions for
+// wchar_t streams).
+
+// Narrow-character streams.
+template <typename T> struct delimiters<T, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T>
+const delimiters_values<char> delimiters<T, char>::values = {
+    "[", ", ", "]"};
+
+// Wide-character streams.
+template <typename T> struct delimiters<T, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T>
+const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {
+    L"[", L", ", L"]"};
+
+// Delimiters for (multi)set and unordered_(multi)set: all four set-like
+// containers use "{", ", " and "}" (wide versions for wchar_t streams).
+
+// --- std::set ---
+
+template <typename T, typename TComp, typename TAllocator>
+struct delimiters<std::set<T, TComp, TAllocator>, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T, typename TComp, typename TAllocator>
+const delimiters_values<char>
+    delimiters<std::set<T, TComp, TAllocator>, char>::values = {
+        "{", ", ", "}"};
+
+template <typename T, typename TComp, typename TAllocator>
+struct delimiters<std::set<T, TComp, TAllocator>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T, typename TComp, typename TAllocator>
+const delimiters_values<wchar_t>
+    delimiters<std::set<T, TComp, TAllocator>, wchar_t>::values = {
+        L"{", L", ", L"}"};
+
+// --- std::multiset ---
+
+template <typename T, typename TComp, typename TAllocator>
+struct delimiters<std::multiset<T, TComp, TAllocator>, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T, typename TComp, typename TAllocator>
+const delimiters_values<char>
+    delimiters<std::multiset<T, TComp, TAllocator>, char>::values = {
+        "{", ", ", "}"};
+
+template <typename T, typename TComp, typename TAllocator>
+struct delimiters<std::multiset<T, TComp, TAllocator>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T, typename TComp, typename TAllocator>
+const delimiters_values<wchar_t>
+    delimiters<std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
+        L"{", L", ", L"}"};
+
+// --- std::unordered_set ---
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+struct delimiters<std::unordered_set<T, THash, TEqual, TAllocator>, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+const delimiters_values<char>
+    delimiters<std::unordered_set<T, THash, TEqual, TAllocator>,
+               char>::values = {"{", ", ", "}"};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+struct delimiters<std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+const delimiters_values<wchar_t>
+    delimiters<std::unordered_set<T, THash, TEqual, TAllocator>,
+               wchar_t>::values = {L"{", L", ", L"}"};
+
+// --- std::unordered_multiset ---
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+struct delimiters<std::unordered_multiset<T, THash, TEqual, TAllocator>,
+                  char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+const delimiters_values<char>
+    delimiters<std::unordered_multiset<T, THash, TEqual, TAllocator>,
+               char>::values = {"{", ", ", "}"};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+struct delimiters<std::unordered_multiset<T, THash, TEqual, TAllocator>,
+                  wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+const delimiters_values<wchar_t>
+    delimiters<std::unordered_multiset<T, THash, TEqual, TAllocator>,
+               wchar_t>::values = {L"{", L", ", L"}"};
+
+// Delimiters for pair and tuple: both use "(", ", " and ")" (wide versions
+// for wchar_t streams).
+
+// --- std::pair ---
+
+template <typename T1, typename T2> struct delimiters<std::pair<T1, T2>, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T1, typename T2>
+const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
+    "(", ", ", ")"};
+
+template <typename T1, typename T2>
+struct delimiters<std::pair<T1, T2>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T1, typename T2>
+const delimiters_values<wchar_t>
+    delimiters<std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
+
+// --- std::tuple ---
+
+template <typename... Args> struct delimiters<std::tuple<Args...>, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename... Args>
+const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
+    "(", ", ", ")"};
+
+template <typename... Args> struct delimiters<std::tuple<Args...>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename... Args>
+const delimiters_values<wchar_t>
+    delimiters<std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
+
+// Type-erasing helper class for easy use of custom delimiters.
+// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
+// and MyDelims needs to be defined for TChar.
+// Usage: "cout << pretty_print::custom_delims<MyDelims>(x)".
+
+// Abstract interface behind the type erasure: one stream() overload per
+// supported stream type, each returning the stream it was given.
+struct custom_delims_base {
+  virtual ~custom_delims_base() = default;
+  virtual std::ostream &stream(std::ostream &) = 0;
+  virtual std::wostream &stream(std::wostream &) = 0;
+};
+
+// Concrete custom_delims_base that prints a `T` with the `Delims` delimiter
+// set. It stores only a reference to the wrapped object, so that object must
+// stay alive for as long as the wrapper is used.
+template <typename T, typename Delims>
+struct custom_delims_wrapper : custom_delims_base {
+  custom_delims_wrapper(const T &t_) : t(t_) {}
+
+  // Narrow-stream output.
+  std::ostream &stream(std::ostream &s) override {
+    using helper_type =
+        print_container_helper<T, char, std::char_traits<char>, Delims>;
+    return s << helper_type(t);
+  }
+
+  // Wide-stream output.
+  std::wostream &stream(std::wostream &s) override {
+    using helper_type =
+        print_container_helper<T, wchar_t, std::char_traits<wchar_t>, Delims>;
+    return s << helper_type(t);
+  }
+
+private:
+  const T &t;
+};
+
+// User-facing handle for custom delimiters: wraps any container in a
+// heap-allocated custom_delims_wrapper, erasing the container type so that
+// only `Delims` remains in the handle's type.
+template <typename Delims> struct custom_delims {
+  template <typename Container>
+  custom_delims(const Container &c)
+      : base(new custom_delims_wrapper<Container, Delims>(c)) {}
+
+  std::unique_ptr<custom_delims_base> base;
+};
+
+// Streams a custom_delims handle by forwarding to the stream() overload of
+// the erased wrapper that matches the stream type.
+template <typename TChar, typename TCharTraits, typename Delims>
+inline auto operator<<(std::basic_ostream<TChar, TCharTraits> &s,
+                       const custom_delims<Delims> &p)
+    -> std::basic_ostream<TChar, TCharTraits> & {
+  return p.base->stream(s);
+}
+
+// A wrapper for a C-style array given as pointer-plus-size. It exposes the
+// range [a, a + n) through begin()/end() without copying or owning it.
+// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
+
+template <typename T> struct array_wrapper_n {
+  using const_iterator = const T *;
+  using value_type = T;
+
+  array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
+
+  const_iterator begin() const { return _array; }
+  const_iterator end() const { return begin() + _n; }
+
+private:
+  const T *const _array; // first element; not owned
+  size_t _n;             // number of elements
+};
+
+// A wrapper for hash-table based containers that offer local iterators to each
+// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl;  (Prints the
+// bucket with index 4, i.e. the fifth bucket, of container m.)
+
+template <typename T> struct bucket_print_wrapper {
+  using const_iterator = typename T::const_local_iterator;
+  using size_type = typename T::size_type;
+
+  bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
+
+  // Local-iterator range of the selected bucket.
+  const_iterator begin() const { return m_map.cbegin(n); }
+  const_iterator end() const { return m_map.cend(n); }
+
+private:
+  const T &m_map;    // container being inspected; held by reference
+  const size_type n; // index of the bucket to print
+};
+
+} // namespace pretty_print
+
+// Global accessor functions for the convenience wrappers
+
+// Wraps the first `n` elements starting at `a` in an array_wrapper_n so the
+// range can be handed to the stream operators.
+template <typename T>
+inline auto pretty_print_array(const T *const a, size_t n)
+    -> pretty_print::array_wrapper_n<T> {
+  using wrapper_type = pretty_print::array_wrapper_n<T>;
+  return wrapper_type(a, n);
+}
+
+// Wraps bucket number `n` of the hash container `m` in a
+// bucket_print_wrapper. The wrapper keeps a reference to `m`.
+template <typename T>
+auto bucket_print(const T &m, typename T::size_type n)
+    -> pretty_print::bucket_print_wrapper<T> {
+  using wrapper_type = pretty_print::bucket_print_wrapper<T>;
+  return wrapper_type(m, n);
+}
+
+// Main magic entry point: An overload snuck into namespace std.
+// Can we do better?
+// NOTE(review): adding function overloads to namespace std is not sanctioned
+// by the C++ standard; it is kept because callers rely on this overload being
+// found for standard containers.
+
+namespace std {
+// Prints a container to the stream using default delimiters. The overload
+// only takes part in overload resolution when
+// ::pretty_print::is_container<T>::value is true.
+
+template <typename T, typename TChar, typename TCharTraits>
+inline auto operator<<(basic_ostream<TChar, TCharTraits> &stream,
+                       const T &container) ->
+    typename enable_if<::pretty_print::is_container<T>::value,
+                       basic_ostream<TChar, TCharTraits> &>::type {
+  using helper_type =
+      ::pretty_print::print_container_helper<T, TChar, TCharTraits>;
+  return stream << helper_type(container);
+}
+} // namespace std
+
+#endif // H_PRETTY_PRINT
--- a/include/tensorview/pybind_utils.h
+++ b/include/tensorview/pybind_utils.h
+// Copyright 2019-2020 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "tensor.h"
+#include "tensorview.h"
+
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <pybind11/functional.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+namespace py = pybind11;
+namespace tv {
+
+// Returns true when the array's flags include py::array::c_style.
+template <typename Tarr> bool is_c_stype(const Tarr &arr) {
+  const auto c_style_bits = arr.flags() & py::array::c_style;
+  return c_style_bits != 0;
+}
+
+// Creates a mutable TensorView over the buffer of a typed numpy array. No
+// data is copied: the view points at arr.mutable_data().
+// The array must be C-contiguous; when Rank >= 0 its number of dimensions
+// must equal Rank. Both conditions are checked with TV_ASSERT_INVALID_ARG.
+template <typename T, int Rank = -1>
+TensorView<T, Rank> arrayt2tv(py::array_t<T> arr) {
+  TV_ASSERT_INVALID_ARG(is_c_stype(arr), "array must be c-contiguous array");
+  Shape dims;
+  const auto rank = arr.ndim();
+  for (int axis = 0; axis < rank; ++axis) {
+    dims.push_back(arr.shape(axis));
+  }
+  if (Rank >= 0) {
+    TV_ASSERT_INVALID_ARG(dims.ndim() == Rank, "error");
+  }
+  return TensorView<T, Rank>(arr.mutable_data(), dims);
+}
+
+// Creates a read-only TensorView over the buffer of a typed numpy array. No
+// data is copied: the view points at arr.data().
+// The array must be C-contiguous; when Rank >= 0 its number of dimensions
+// must equal Rank. Both conditions are checked with TV_ASSERT_INVALID_ARG.
+//
+// The return type carries Rank, matching both the value constructed below and
+// the mutable counterpart arrayt2tv. For the default Rank == -1 this is the
+// same type as before.
+template <typename T, int Rank = -1>
+TensorView<const T, Rank> carrayt2tv(py::array_t<T> arr) {
+  TV_ASSERT_INVALID_ARG(is_c_stype(arr), "array must be c-contiguous array");
+  Shape shape;
+  for (int i = 0; i < arr.ndim(); ++i) {
+    shape.push_back(arr.shape(i));
+  }
+  if (Rank >= 0) {
+    TV_ASSERT_INVALID_ARG(shape.ndim() == Rank, "error");
+  }
+  return TensorView<const T, Rank>(arr.data(), shape);
+}
+
+// Maps a numpy array's dtype to the matching tv::DType, using the dtype kind
+// character ('b' bool, 'i' signed, 'u' unsigned, 'f' floating point) and the
+// item size in bytes. Unsupported combinations are reported through
+// TV_THROW_RT_ERR.
+template <typename Tarr> tv::DType get_array_tv_dtype(const Tarr &arr) {
+  switch (arr.dtype().kind()) {
+  case 'b':
+    return tv::bool_;
+  case 'i': {
+    switch (arr.itemsize()) {
+    case 1:
+      return tv::int8;
+    case 2:
+      return tv::int16;
+    case 4:
+      return tv::int32;
+    case 8:
+      return tv::int64;
+    default:
+      break;
+    }
+    // Unsupported width: leave the outer switch instead of falling through
+    // into the unsigned case.
+    break;
+  }
+  case 'u': {
+    switch (arr.itemsize()) {
+    case 1:
+      return tv::uint8;
+    case 2:
+      return tv::uint16;
+    case 4:
+      return tv::uint32;
+    case 8:
+      return tv::uint64;
+    default:
+      break;
+    }
+    // Unsupported width: do not fall through into the floating-point case.
+    break;
+  }
+  case 'f': {
+    switch (arr.itemsize()) {
+    case 2:
+      return tv::float16;
+    case 4:
+      return tv::float32;
+    case 8:
+      return tv::float64;
+    default:
+      break;
+    }
+    break;
+  }
+  default:
+    break;
+  }
+  TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
+}
+
+// Wraps the buffer of a numpy array in a tv::Tensor via tv::from_blob,
+// passing -1 as the device. The tv dtype is derived from the array at run
+// time. The array must be C-contiguous (checked with TV_ASSERT_INVALID_ARG).
+template <typename Tarr> Tensor array2tensor(Tarr &arr) {
+  TV_ASSERT_INVALID_ARG(is_c_stype(arr), "array must be c-contiguous array");
+  TensorShape dims;
+  const auto rank = arr.ndim();
+  for (int axis = 0; axis < rank; ++axis) {
+    dims.push_back(arr.shape(axis));
+  }
+  return tv::from_blob(arr.mutable_data(), dims, get_array_tv_dtype(arr), -1);
+}
+
+// Typed variant of array2tensor: wraps the buffer of a py::array_t<T> in a
+// tv::Tensor via tv::from_blob, passing -1 as the device. The tv dtype is
+// taken from T at compile time (tv::type_v<T>). The array must be
+// C-contiguous (checked with TV_ASSERT_INVALID_ARG).
+template <typename T> Tensor arrayt2tensor(py::array_t<T> &arr) {
+  TV_ASSERT_INVALID_ARG(is_c_stype(arr), "array must be c-contiguous array");
+  TensorShape dims;
+  const auto rank = arr.ndim();
+  for (int axis = 0; axis < rank; ++axis) {
+    dims.push_back(arr.shape(axis));
+  }
+  return tv::from_blob(arr.mutable_data(), dims, tv::type_v<T>, -1);
+}
+
+// Converts a tv dtype enumerator into the numpy dtype object of the same
+// name. Values without a mapping are reported through TV_THROW_INVALID_ARG.
+template <typename TDType> py::dtype tv_dtype_to_py(TDType d) {
+  const char *numpy_name = nullptr;
+  switch (d) {
+  case float32:
+    numpy_name = "float32";
+    break;
+  case float64:
+    numpy_name = "float64";
+    break;
+  case float16:
+    numpy_name = "float16";
+    break;
+  case int32:
+    numpy_name = "int32";
+    break;
+  case int16:
+    numpy_name = "int16";
+    break;
+  case int8:
+    numpy_name = "int8";
+    break;
+  case int64:
+    numpy_name = "int64";
+    break;
+  case uint32:
+    numpy_name = "uint32";
+    break;
+  case uint16:
+    numpy_name = "uint16";
+    break;
+  case uint8:
+    numpy_name = "uint8";
+    break;
+  case uint64:
+    numpy_name = "uint64";
+    break;
+  case bool_:
+    numpy_name = "bool_";
+    break;
+  default:
+    break;
+  }
+  if (numpy_name != nullptr) {
+    return py::dtype(numpy_name);
+  }
+  TV_THROW_INVALID_ARG("unknown dtype", d);
+}
+
+// Builds a numpy array with the dtype, shape and contents of a CPU tensor.
+// (It is a template only so that it can be defined in a header.)
+// The tensor must live on the CPU (device() == -1); this is checked with
+// TV_ASSERT_INVALID_ARG.
+template <typename Ttensor> py::array tensor2array(Ttensor &tensor) {
+  // This function creates Python objects, so it must not be called while the
+  // GIL is released.
+  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
+  auto shape = tensor.shape();
+  // Keep every extent at pybind11's own size type; storing the extents in
+  // `int` would silently truncate dimensions that do not fit in 32 bits.
+  std::vector<py::ssize_t> shape_vec(shape.begin(), shape.end());
+  auto dtype = tv_dtype_to_py(tensor.dtype());
+  // Constructing py::array without a base object copies the content from the
+  // pointer. That is intended: ownership cannot be handed from the C++
+  // tv::Tensor to numpy, and the tensor may be destroyed before the array.
+  // The empty strides argument requests default C-contiguous strides.
+  return py::array(dtype, shape_vec, {}, tensor.raw_data());
+}
+
+} // namespace tv
--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,19 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+/*
+tv::Tensor is a lightweight, header-only tensor container
+without templates or annoying dependencies. No algorithms are implemented.
+It should only be used when you want a simple, non-template container but
+don't want to link against libtorch.
+
+If you can use libtorch, don't use tv::Tensor.
+*/
+
 #pragma once
+#include "mp_helper.h"
 #include "tensorview.h"
+#include <cstring>
+#include <iomanip>
 #include <memory>
-#include <spconv/mp_helper.h>
-#ifdef SPCONV_CUDA
+#include <type_traits>
+#ifdef TV_CUDA
+#include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #endif

-namespace tv
-{
-enum DType
-{
+namespace tv {
+enum DType {
  float32,
  int32,
  int16,
@@ -39,51 +50,46 @@ enum DType
  uint64
 };

-namespace detail
-{
+namespace detail {

-template <typename T>
-class TensorStorage
-{
+using all_tensor_types_t =
+    std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
+               uint16_t, uint32_t, uint64_t, bool>;
+
+template <typename T> class TensorStorage {
 public:
-  TensorStorage(size_t size, int device = -1, bool managed = false)
-      : mSize(size), device_(device), managed_(managed)
-  {
-    if (size == 0)
-    {
+  TensorStorage(size_t size, int device = -1, bool managed = false,
+                bool pinned = false)
+      : mSize(size), device_(device), managed_(managed), pinned_(pinned) {
+    if (size == 0) {
      mPtr = nullptr;
-    }
-    else
-    {
-      if (device == -1)
-      {
-#ifdef SPCONV_CUDA
-        checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
+    } else {
+      if (device == -1) {
+        if (pinned_) {
+#ifdef TV_CUDA
+          checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
 #else
-        mPtr = new T[size];
+          TV_THROW_INVALID_ARG("you need to define TV_CUDA to use pinned");
 #endif
-      }
-      else
-      {
-#ifdef SPCONV_CUDA
+        } else {
+          mPtr = new T[size];
+        }
+      } else {
+#ifdef TV_CUDA
        int deviceCount;
        cudaGetDeviceCount(&deviceCount);
-        if (device >= deviceCount)
-        {
-          TV_ASSERT_INVALID_ARG("you provide device ", device,
-                                " but you only have ", deviceCount, " device.");
+        if (device >= deviceCount) {
+          TV_THROW_INVALID_ARG("you provide device ", device,
+                               " but you only have ", deviceCount, " device.");
        }
        cudaSetDevice(device);
-        if (managed)
-        {
+        if (managed) {
          checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
-        }
-        else
-        {
+        } else {
          checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
        }
 #else
-        TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
+        TV_THROW_INVALID_ARG("don't compiled with cuda");
 #endif
      }
    }
@@ -91,27 +97,23 @@ public:
  TensorStorage(T *ptr, size_t size, int device)
      : mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}

-  virtual ~TensorStorage()
-  {
-    if (empty())
-    {
+  virtual ~TensorStorage() {
+    if (empty()) {
      return;
    }
-    if (from_blob_)
-    {
+    if (from_blob_) {
      return;
    }
-    if (device_ == -1)
-    {
-#ifdef SPCONV_CUDA
-      cudaFreeHost(mPtr);
-#else
-      delete[] mPtr;
+    if (device_ == -1) {
+      if (pinned_) {
+#ifdef TV_CUDA
+        cudaFreeHost(mPtr);
 #endif
-    }
-    else
-    {
-#ifdef SPCONV_CUDA
+      } else {
+        delete[] mPtr;
+      }
+    } else {
+#ifdef TV_CUDA
      cudaFree(mPtr);
 #endif
    }
@@ -124,36 +126,33 @@ public:

  bool empty() const { return mPtr == nullptr || mSize == 0; }
  bool managed() const { return managed_; }
+  bool pinned() const { return pinned_; }
+
  int device() const { return device_; }
-  void zero_()
-  {
-    if (device_ == -1)
-    {
+  // Sets every byte of the buffer to zero. mSize counts elements of T (the
+  // constructor allocates size * sizeof(T) bytes), so the byte count passed
+  // to memset/cudaMemset is mSize * sizeof(T).
+  void zero_() {
+    if (device_ == -1) {
-      std::memset(data(), 0, mSize);
+      std::memset(data(), 0, mSize * sizeof(T));
      // std::fill(data(), data() + mSize, 0);
-    }
-    else
-    {
-#ifdef SPCONV_CUDA
+    } else {
+#ifdef TV_CUDA
-      checkCudaErrors(cudaMemset(data(), 0, mSize / sizeof(T)));
+      checkCudaErrors(cudaMemset(data(), 0, mSize * sizeof(T)));
 #else
-      TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
+      TV_THROW_INVALID_ARG("don't compiled with cuda");
 #endif
    }
  }

 private:
-  T *mPtr = nullptr;
  size_t mSize = 0;
-  int device_ = -1;
+  T *mPtr = nullptr;
  bool from_blob_ = false;
+  int device_ = -1;
  bool managed_ = false;
+  bool pinned_ = false;
 };

-size_t sizeof_dtype(DType dtype)
-{
-  switch (dtype)
-  {
+template <typename T> size_t sizeof_dtype(T dtype) {
+  switch (dtype) {
  case float32:
    return sizeof(float);
  case int8:
@@ -176,20 +175,16 @@ size_t sizeof_dtype(DType dtype)
    return sizeof(uint32_t);
  case uint64:
    return sizeof(uint64_t);
-#ifdef SPCONV_CUDA
  case float16:
-    return sizeof(__half);
-#endif
+    return 2;
  default:
    TV_THROW_RT_ERR("unsupported dtype");
  }
  return 0;
 }

-std::string typeString(DType t)
-{
-  switch (t)
-  {
+template <typename T> std::string typeString(T t) {
+  switch (t) {
  case DType::bool_:
    return "bool";
  case DType::float32:
@@ -212,165 +207,477 @@ std::string typeString(DType t)
    return "uint32";
  case DType::uint64:
    return "uint64";
-#ifdef SPCONV_CUDA
  case DType::float16:
    return "half";
-#endif
  default:
    return "";
  }
 }

-template <typename T>
-struct TypeToDtypeTraits;
+template <typename T> struct TypeToDtypeTraits;
+
+template <> struct TypeToDtypeTraits<int32_t> {
+  static constexpr DType dtype = int32;
+};
+
+#ifdef TV_CUDA
+template <> struct TypeToDtypeTraits<__half> {
+  static constexpr DType dtype = float16;
+};
+#endif

-template <>
-struct TypeToDtypeTraits<int32_t>
-{
+template <> struct TypeToDtypeTraits<float> {
+  static constexpr DType dtype = float32;
+};
+template <> struct TypeToDtypeTraits<double> {
+  static constexpr DType dtype = float64;
+};
+template <> struct TypeToDtypeTraits<int16_t> {
+  static constexpr DType dtype = int16;
+};
+template <> struct TypeToDtypeTraits<int8_t> {
+  static constexpr DType dtype = int8;
+};
+template <> struct TypeToDtypeTraits<int64_t> {
+  static constexpr DType dtype = int64;
+};
+template <> struct TypeToDtypeTraits<uint8_t> {
+  static constexpr DType dtype = uint8;
+};
+template <> struct TypeToDtypeTraits<uint16_t> {
+  static constexpr DType dtype = uint16;
+};
+template <> struct TypeToDtypeTraits<uint32_t> {
+  static constexpr DType dtype = uint32;
+};
+template <> struct TypeToDtypeTraits<uint64_t> {
+  static constexpr DType dtype = uint64;
+};
+template <> struct TypeToDtypeTraits<bool> {
+  static constexpr DType dtype = bool_;
+};
+template <> struct TypeToDtypeTraits<const int32_t> {
  static constexpr DType dtype = int32;
 };

-#ifdef SPCONV_CUDA
-template <>
-struct TypeToDtypeTraits<__half>
-{
+#ifdef TV_CUDA
+template <> struct TypeToDtypeTraits<const __half> {
  static constexpr DType dtype = float16;
 };
 #endif

-template <>
-struct TypeToDtypeTraits<float>
-{
+template <> struct TypeToDtypeTraits<const float> {
  static constexpr DType dtype = float32;
 };
-template <>
-struct TypeToDtypeTraits<double>
-{
+template <> struct TypeToDtypeTraits<const double> {
  static constexpr DType dtype = float64;
 };
-template <>
-struct TypeToDtypeTraits<int16_t>
-{
+template <> struct TypeToDtypeTraits<const int16_t> {
  static constexpr DType dtype = int16;
 };
-template <>
-struct TypeToDtypeTraits<int8_t>
-{
+template <> struct TypeToDtypeTraits<const int8_t> {
  static constexpr DType dtype = int8;
 };
-template <>
-struct TypeToDtypeTraits<int64_t>
-{
+template <> struct TypeToDtypeTraits<const int64_t> {
  static constexpr DType dtype = int64;
 };
-template <>
-struct TypeToDtypeTraits<uint8_t>
-{
+template <> struct TypeToDtypeTraits<const uint8_t> {
  static constexpr DType dtype = uint8;
 };
-template <>
-struct TypeToDtypeTraits<uint16_t>
-{
+template <> struct TypeToDtypeTraits<const uint16_t> {
  static constexpr DType dtype = uint16;
 };
-template <>
-struct TypeToDtypeTraits<uint32_t>
-{
+template <> struct TypeToDtypeTraits<const uint32_t> {
  static constexpr DType dtype = uint32;
 };
-template <>
-struct TypeToDtypeTraits<uint64_t>
-{
+template <> struct TypeToDtypeTraits<const uint64_t> {
  static constexpr DType dtype = uint64;
 };
+template <> struct TypeToDtypeTraits<const bool> {
+  static constexpr DType dtype = bool_;
+};

 } // namespace detail

-template <class T>
-constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
+template <class T> constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;

-struct Tensor
-{
+// Runtime-to-compile-time dtype dispatch. Walks the candidate types Ts... and,
+// for each one whose type_v equals `t`, calls `f` with a value-initialized
+// instance of that type (the callee recovers the type with decltype).
+// If no candidate matches, the failure is reported through TV_THROW_RT_ERR
+// together with the names of all candidates.
+template <class... Ts, typename F> void dispatch(DType t, F &&f) {
+  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
+  bool matched = false;
+  mp_for_each<mp_list<Ts...>>([t, &matched, &f](auto candidate) {
+    using candidate_t = decltype(candidate);
+    if (type_v<candidate_t> == t) {
+      std::forward<F>(f)(candidate_t());
+      matched = true;
+    }
+  });
+  if (matched) {
+    return;
+  }
+  // Nothing matched: collect the candidate names for the error message.
+  std::stringstream names;
+  mp_for_each<mp_list<Ts...>>([&names](auto candidate) {
+    names << detail::TypeToString<decltype(candidate)>::value << " ";
+  });
+  TV_THROW_RT_ERR("unknown type", detail::typeString(t),
+                  ", available:", names.str());
+}
+
+// Runtime-to-compile-time dispatch on a scalar value. Walks the compile-time
+// candidates Is... and calls `f` with the candidate constant that compares
+// equal to `idx`. If no candidate matches, the failure is reported through
+// TV_THROW_RT_ERR together with the list of candidates.
+template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
+  static_assert(sizeof...(Is) > 0,
+                "you need to provide at least one candidate");
+  bool matched = false;
+  mp_for_each<mp_list_c<T, Is...>>([idx, &matched, &f](auto candidate) {
+    if (T(candidate) == idx) {
+      std::forward<F>(f)(candidate);
+      matched = true;
+    }
+  });
+  if (matched) {
+    return;
+  }
+  // Nothing matched: list the candidates in the error message.
+  std::stringstream available;
+  mp_for_each<mp_list_c<T, Is...>>(
+      [&available](auto candidate) { available << T(candidate) << " "; });
+  TV_THROW_RT_ERR("unknown value", idx, ", available:", available.str());
+}
+
+// Integer version of dispatch_scalar, used for kernel parameter selection.
+// Calls `f` with the compile-time candidate from Is... that equals `idx`; if
+// none does, the failure is reported through TV_THROW_RT_ERR together with
+// the list of candidates.
+template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
+  static_assert(sizeof...(Is) > 0,
+                "you need to provide at least one candidate");
+  bool matched = false;
+  mp_for_each<mp_list_c<int, Is...>>([idx, &matched, &f](auto candidate) {
+    if (int(candidate) == idx) {
+      std::forward<F>(f)(candidate);
+      matched = true;
+    }
+  });
+  if (matched) {
+    return;
+  }
+  // Nothing matched: list the candidates in the error message.
+  std::stringstream available;
+  mp_for_each<mp_list_c<int, Is...>>(
+      [&available](auto candidate) { available << int(candidate) << " "; });
+  TV_THROW_RT_ERR("unknown value", idx, ", available:", available.str());
+}
+
+/*
+template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
+  return dispatch_scalar<int, Is...>(idx, f);
+}
+*/
+
+// Functor form of dispatch(): Dispatch<L<Ts...>> takes its candidate types
+// from the template arguments of any type-list-like class template L.
+template <class T> struct Dispatch;
+
+template <template <class...> class T, class... Args>
+struct Dispatch<T<Args...>> {
+  // Forwards to dispatch<Args...>(t, f).
+  template <typename F> inline void operator()(DType t, F &&f) {
+    dispatch<Args...>(t, std::forward<F>(f));
+  }
+};
+
+// Functor form of dispatch_int(): DispatchInt<L<Is...>> takes its candidate
+// values from the integer template arguments of a class template L.
+template <class T> struct DispatchInt;
+
+template <template <int...> class T, int... Ints>
+struct DispatchInt<T<Ints...>> {
+  // Forwards to dispatch_int<Ints...>(t, f).
+  template <typename F> inline void operator()(int t, F &&f) {
+    dispatch_int<Ints...>(t, std::forward<F>(f));
+  }
+};
+constexpr size_t kTensorMaxDim = 10;
+using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
+
+struct Tensor {
  Tensor() {}
-  Tensor(Shape shape, DType dtype, int device = -1, bool managed = false)
-      : dtype_(dtype)
-  {
+  Tensor(TensorShape shape, TensorShape stride, DType dtype, int device = -1,
+         bool pinned = false, bool managed = false)
+      : dtype_(dtype) {
+    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
+    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
+        shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
+    shape_ = shape;
+    stride_ = stride;
+  }
+
+  Tensor(TensorShape shape, DType dtype, int device = -1, bool pinned = false,
+         bool managed = false)
+      : dtype_(dtype) {
+    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
+    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
+        shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
+    shape_ = shape;
+    stride_ = shape.stride_rowmajor();
+  }
+  Tensor(void *ptr, TensorShape shape, TensorShape stride, DType dtype,
+         int device = -1)
+      : dtype_(dtype) {
+    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
-        shape.size() * detail::sizeof_dtype(dtype), device, managed);
+        reinterpret_cast<uint8_t *>(ptr),
+        shape.size() * detail::sizeof_dtype(dtype), device);
    shape_ = shape;
+    stride_ = stride;
  }
-  Tensor(void *ptr, Shape shape, DType dtype, int device = -1) : dtype_(dtype)
-  {
+  Tensor(void *ptr, TensorShape shape, DType dtype, int device = -1)
+      : dtype_(dtype) {
+    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
        reinterpret_cast<uint8_t *>(ptr),
        shape.size() * detail::sizeof_dtype(dtype), device);
    shape_ = shape;
+    stride_ = shape.stride_rowmajor();
  }

-  template <typename T>
-  TensorView<T> tview()
-  {
+  // Wraps existing read-only memory with an explicit stride. The storage is
+  // created through the from-blob TensorStorage constructor, and the tensor
+  // is marked as not writeable.
+  Tensor(const void *ptr, TensorShape shape, TensorShape stride, DType dtype,
+         int device = -1)
+      : dtype_(dtype), writeable_(false) {
+    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
+    // TensorStorage takes a mutable byte pointer, hence the const_cast.
+    auto *bytes = reinterpret_cast<uint8_t *>(const_cast<void *>(ptr));
+    const auto num_bytes = shape.size() * detail::sizeof_dtype(dtype);
+    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
+        bytes, num_bytes, device);
+    shape_ = shape;
+    stride_ = stride;
+  }
+  // Wraps existing read-only memory using the row-major stride of `shape`.
+  // The storage is created through the from-blob TensorStorage constructor,
+  // and the tensor is marked as not writeable.
+  Tensor(const void *ptr, TensorShape shape, DType dtype, int device = -1)
+      : dtype_(dtype), writeable_(false) {
+    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
+    // TensorStorage takes a mutable byte pointer, hence the const_cast.
+    auto *bytes = reinterpret_cast<uint8_t *>(const_cast<void *>(ptr));
+    const auto num_bytes = shape.size() * detail::sizeof_dtype(dtype);
+    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
+        bytes, num_bytes, device);
+    shape_ = shape;
+    stride_ = shape.stride_rowmajor();
+  }
+
+  // Convenience constructors: build a 1-D tensor whose single extent is
+  // init.size() and copy the listed values into it. One overload per
+  // supported element type (int32, int64, float32, float64).
+  Tensor(std::initializer_list<int32_t> init)
+      : Tensor({int(init.size())}, tv::int32) {
+    std::copy_n(init.begin(), init.size(), data<int32_t>());
+  }
+  Tensor(std::initializer_list<int64_t> init)
+      : Tensor({int(init.size())}, tv::int64) {
+    std::copy_n(init.begin(), init.size(), data<int64_t>());
+  }
+  Tensor(std::initializer_list<float> init)
+      : Tensor({int(init.size())}, tv::float32) {
+    std::copy_n(init.begin(), init.size(), data<float>());
+  }
+  Tensor(std::initializer_list<double> init)
+      : Tensor({int(init.size())}, tv::float64) {
+    std::copy_n(init.begin(), init.size(), data<double>());
+  }
+
+  template <typename T, int Rank = -1,
+            template <class> class PtrTraits = DefaultPtrTraits,
+            typename Tindex = int,
+            typename std::enable_if<(Rank > 0), int>::type = 0>
+  TensorView<T, Rank, PtrTraits, Tindex> tview() {
+    using tv_shape_t =
+        typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
+    writable_check();
+    static_assert(Rank == -1 || Rank > 0, "error");
    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-    TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
-    return TensorView<T>(reinterpret_cast<T *>(storage_->data()), shape_);
+    tv_shape_t shape(Rank), stride(Rank);
+    for (int i = 0; i < Rank; ++i) {
+      shape[i] = shape_[i];
+      stride[i] = stride_[i];
+    }
+    return TensorView<T, Rank, PtrTraits, Tindex>(
+        reinterpret_cast<T *>(data<T>()), shape, stride);
  }
-  template <typename T>
-  TensorView<T> tview() const
-  {
-    TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
+  template <typename T, int Rank = -1,
+            template <class> class PtrTraits = DefaultPtrTraits,
+            typename Tindex = int,
+            typename std::enable_if<Rank == -1, int>::type = 0>
+  TensorView<T, Rank, PtrTraits, Tindex> tview() {
+    using tv_shape_t =
+        typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
+    writable_check();
+    static_assert(Rank == -1 || Rank > 0, "error");
    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-    return TensorView<const std::remove_const_t<T>>(
-        reinterpret_cast<const std::remove_const_t<T> *>(storage_->data()),
-        shape_);
+    ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
+    for (int i = 0; i < ndim(); ++i) {
+      shape[i] = shape_[i];
+      stride[i] = stride_[i];
+    }
+    return TensorView<T, Rank, PtrTraits, Tindex>(
+        reinterpret_cast<T *>(data<T>()), shape, stride);
  }
+
+  // Read-only, fixed-rank view (overload selected when Rank > 0).
+  // Asserts that Rank equals ndim() and that T matches the stored dtype, then
+  // copies the first Rank extents and strides into the view.
+  template <typename T, int Rank = -1,
+            template <class> class PtrTraits = DefaultPtrTraits,
+            typename Tindex = int,
+            typename std::enable_if<(Rank > 0), int>::type = 0>
+  TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
+  tview() const {
+    using elem_t = const std::remove_const_t<T>;
+    using view_t = TensorView<elem_t, Rank, PtrTraits, Tindex>;
+    static_assert(Rank == -1 || Rank > 0, "error");
+    // The enable_if above guarantees Rank > 0 in this overload, so the rank
+    // check is unconditional.
+    TV_ASSERT_RT_ERR(Rank == ndim(), "error");
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+
+    ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex> dims(Rank), strides(Rank);
+    for (int axis = 0; axis < Rank; ++axis) {
+      dims[axis] = shape_[axis];
+      strides[axis] = stride_[axis];
+    }
+    return view_t(reinterpret_cast<elem_t *>(data<T>()), dims, strides);
+  }
+  // Read-only typed view with a runtime rank (selected when Rank == -1).
+  // Fails at runtime when T does not match the stored dtype. The view points
+  // into this tensor's buffer; no copy is made.
+  template <typename T, int Rank = -1,
+            template <class> class PtrTraits = DefaultPtrTraits,
+            typename Tindex = int,
+            typename std::enable_if<Rank == -1, int>::type = 0>
+  TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
+  tview() const {
+    using const_elem_t = const std::remove_const_t<T>;
+    using view_t = TensorView<const_elem_t, Rank, PtrTraits, Tindex>;
+    using dims_t = ShapeBase<TV_MAX_DIM, Tindex>;
+    static_assert(Rank == -1 || Rank > 0, "error");
+    // Rank is fixed to -1 by enable_if, so there is no static rank to verify.
+    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+
+    dims_t shape(ndim());
+    dims_t stride(ndim());
+    for (int axis = 0; axis < ndim(); ++axis) {
+      shape[axis] = shape_[axis];
+      stride[axis] = stride_[axis];
+    }
+    return view_t(reinterpret_cast<const_elem_t *>(data<T>()), shape, stride);
+  }
+
+  // Returns a tensor sharing this storage but with the given dimensions.
+  // At most one dimension may be -1; it is inferred from the element count.
+  // Every other dimension must be > 0. Throws std::invalid_argument on a bad
+  // dimension list and a runtime error when the element counts do not match.
+  template <class... Inds> Tensor view(Inds... newShapes) const {
+    static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
+    TensorShape shape{int(newShapes)...};
+    // Validate every dimension first so the inferred axis is never computed
+    // from an unchecked (possibly zero or negative) extent.
+    int infer_axis = -1;
+    size_t known_elems = 1;
+    for (size_t i = 0; i < shape.ndim(); ++i) {
+      if (shape[i] == -1) {
+        TV_ASSERT_INVALID_ARG(infer_axis < 0, "multiple -1 in your argument.");
+        infer_axis = int(i);
+      } else {
+        TV_ASSERT_INVALID_ARG(shape[i] > 0,
+                              "shape except -1 must larger than 0");
+        known_elems *= size_t(shape[i]);
+      }
+    }
+    if (infer_axis >= 0) {
+      // known_elems >= 1 here, so the division is always defined.
+      shape[infer_axis] = size() / known_elems;
+    }
+    // Also rejects an inferred axis when size() is not evenly divisible.
+    TV_ASSERT_RT_ERR(shape.size() == size(), "error");
+    Tensor res(*this);
+    res.shape_ = shape;
+    res.stride_ = shape.stride_rowmajor();
+    return res;
+  }
+
+  // Reinterprets this tensor with a new shape holding the same number of
+  // elements. The result shares storage with *this and receives row-major
+  // strides computed from the new shape.
+  Tensor view(TensorShape shape) const {
+    TV_ASSERT_RT_ERR(shape.size() == size(), "error");
+    Tensor reshaped(*this);
+    reshaped.stride_ = shape.stride_rowmajor();
+    reshaped.shape_ = shape;
+    return reshaped;
+  }
+
+  // View of this tensor reshaped to shape_.squeeze().
+  Tensor squeeze() const { return view(shape_.squeeze()); }
+
+  // View with `axis` removed when its extent is 1. A negative axis is
+  // offset by ndim() before use.
+  Tensor squeeze(int axis) const {
+    const int resolved = axis < 0 ? static_cast<int>(ndim() + axis) : axis;
+    return view(shape_.squeeze(resolved));
+  }
+
+  // View with an extra extent-1 axis inserted in front of `axis`. A negative
+  // axis is offset by the current ndim() before use.
+  Tensor unsqueeze(int axis) const {
+    const int resolved = axis < 0 ? static_cast<int>(ndim() + axis) : axis;
+    return view(shape_.unsqueeze(resolved));
+  }
+
+  // Forwards to the pinned() flag of the underlying storage.
+  bool pinned() const { return storage_->pinned(); }
+
+  // Returns a view of rows [start, end) along axis 0 that shares storage with
+  // this tensor. Negative start/end count from the end of axis 0.
+  // Throws std::invalid_argument when the tensor is not contiguous, has no
+  // axes, or the normalized range is empty or falls outside axis 0.
+  Tensor slice_first_axis(int start, int end) const {
+    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
+    TV_ASSERT_INVALID_ARG(ndim() > 0, "tensor must have at least one axis");
+    if (start < 0) {
+      start = shape_[0] + start;
+    }
+    if (end < 0) {
+      end = shape_[0] + end;
+    }
+    // A start that is still negative would wrap to a huge byte offset below.
+    TV_ASSERT_INVALID_ARG(start >= 0, "start out of range");
+    TV_ASSERT_INVALID_ARG(start < shape_[0], "start must small than dim 0");
+    TV_ASSERT_INVALID_ARG(start < end, "start must small than end");
+    TV_ASSERT_INVALID_ARG(end <= shape_[0], "end must not exceed dim 0");
+    // Byte offset of the first selected row, relative to this view.
+    size_t new_offset = size_t(start) * shape_.prod(1) * itemsize();
+    Tensor res(*this);
+    TensorShape newshape(shape_);
+    newshape[0] = end - start;
+    res.shape_ = newshape;
+    res.stride_ = stride_;
+    // Accumulate onto the existing offset so slicing a slice stays correct.
+    res.offset_ = offset_ + new_offset;
+    return res;
+  }
+
  bool empty() const { return storage_->empty(); }
  DType dtype() const { return dtype_; }
  int device() const { return storage_->device(); }
-  const Shape &shape() const { return shape_; }
-  int dim(int idx) const
-  {
-    TV_ASSERT_RT_ERR(idx < shape_.size(), "error");
-    return shape_[idx];
+  size_t ndim() const { return shape_.ndim(); }
+
+  const TensorShape &shape() const { return shape_; }
+  const TensorShape &stride() const { return stride_; }
+
+  // Extent of axis `idx`. Negative indices count from the last axis
+  // (-1 is the last axis). Throws a runtime error when idx is out of range.
+  int dim(int idx) const {
+    // shape_.size() is the element count for a shape; the number of axes is
+    // ndim(), which is what an axis index must be checked against.
+    const int num_axes = int(shape_.ndim());
+    if (idx < 0) {
+      TV_ASSERT_RT_ERR(idx + num_axes >= 0, idx, shape_);
+      return shape_[idx + num_axes];
+    } else {
+      TV_ASSERT_RT_ERR(idx < num_axes, idx, shape_);
+      return shape_[idx];
+    }
  }
-  const uint8_t *raw_data() const { return storage_->data(); }
+  const uint8_t *raw_data() const { return storage_->data() + offset_; }
+  size_t raw_size() const { return size() * itemsize(); }
  size_t size() const { return shape_.size(); }
-  Tensor &zero_()
-  {
+  size_t itemsize() const { return detail::sizeof_dtype(dtype_); }
+  Tensor &zero_() {
+    writable_check();
    storage_->zero_();
    return *this;
  }
-  uint8_t *raw_data() { return storage_->data(); }
-  template <typename T>
-  Tensor &fill_(T value)
-  {
-    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-    auto ptr = reinterpret_cast<T *>(raw_data());
-    std::fill(ptr, ptr + size(), value);
+  uint8_t *raw_data() {
+    writable_check();
+    return storage_->data() + offset_;
+  }
+  // Sets every element to `value`, converted to the tensor's stored dtype.
+  // Requires a writable tensor whose device() is -1; throws
+  // std::invalid_argument via TV_THROW_INVALID_ARG when T is not convertible
+  // to the stored element type. Returns *this.
+  template <typename T> Tensor &fill_(T value) {
+    writable_check();
+    TV_ASSERT_RT_ERR(device() == -1, "error");
+    // Dispatch on the runtime dtype to recover the concrete element type.
+    Dispatch<detail::all_tensor_types_t>()(dtype_, [&](auto I) {
+      using Treal = decltype(I);
+      if (std::is_convertible<T, Treal>::value) {
+        auto ptr = reinterpret_cast<Treal *>(raw_data());
+        std::fill(ptr, ptr + size(), Treal(value));
+      } else {
+        TV_THROW_INVALID_ARG("not convertable from", type_s<T>, "to",
+                             type_s<Treal>);
+      }
+    });
    return *this;
  }

-  template <typename T>
-  T *data()
-  {
+  template <typename T> T *data() {
    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
+    writable_check();
    return reinterpret_cast<T *>(raw_data());
  }

-  template <typename T>
-  const T *data() const
-  {
+  template <typename T> const T *data() const {
    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
    return reinterpret_cast<const T *>(raw_data());
  }

-  void copy_(const Tensor &tensor)
-  {
+  void copy_(const Tensor &tensor) {
+    writable_check();
+    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
    TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
    TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
-    TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype");
-    if (device() == -1 && tensor.device() == -1)
-    {
-#ifdef SPCONV_CUDA
+    TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
+                     detail::typeString(dtype()),
+                     detail::typeString(tensor.dtype()));
+    if (device() == -1 && tensor.device() == -1) {
+#ifdef TV_CUDA
      host2host(storage_->data(), tensor.raw_data(),
                size() * detail::sizeof_dtype(dtype_));
 #else
@@ -379,88 +686,162 @@ struct Tensor
                storage_->data());
 #endif
    }
-#ifdef SPCONV_CUDA
-    else if (device() >= 0 && tensor.device() == -1)
-    {
-      // host2dev
+#ifdef TV_CUDA
+    else if (device() >= 0 && tensor.device() == -1) {
      host2dev(storage_->data(), tensor.raw_data(),
               size() * detail::sizeof_dtype(dtype_));
-    }
-    else if (device() == -1 && tensor.device() >= 0)
-    {
-      // dev2host
+    } else if (device() == -1 && tensor.device() >= 0) {
      dev2host(storage_->data(), tensor.raw_data(),
               size() * detail::sizeof_dtype(dtype_));
-    }
-    else if (device() >= 0 && tensor.device() >= 0)
-    {
-      // dev2dev
+    } else if (device() >= 0 && tensor.device() >= 0) {
      dev2dev(storage_->data(), tensor.raw_data(),
              size() * detail::sizeof_dtype(dtype_));
    }
 #endif
-    else
-    {
-      TV_ASSERT_RT_ERR(false, "only support cpu tensor");
+    else {
+      TV_THROW_RT_ERR("only support cpu tensor");
    }
  }

-  Tensor cpu() const
-  {
-    if (storage_->device() == -1)
-    {
-      return *this;
+#ifdef TV_CUDA
+  // Copies the contents of `tensor` into this tensor on the given CUDA
+  // stream. Both tensors must be non-empty with equal element count and
+  // dtype; this tensor must be writable and contiguous. The transfer helper
+  // is chosen from the (destination, source) device pair, where -1 means host.
+  void copy_(const Tensor &tensor, cudaStream_t stream) {
+    writable_check();
+    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
+    TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
+    TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
+    TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
+                     detail::typeString(dtype()),
+                     detail::typeString(tensor.dtype()));
+    // Use raw_data() for the destination so offset_ (set by
+    // slice_first_axis) is honored, matching the source side.
+    uint8_t *dst = raw_data();
+    const uint8_t *src = tensor.raw_data();
+    const size_t nbytes = size() * detail::sizeof_dtype(dtype_);
+    if (device() == -1 && tensor.device() == -1) {
+      host2host(dst, src, nbytes, stream);
+    } else if (device() >= 0 && tensor.device() == -1) {
+      host2dev(dst, src, nbytes, stream);
+    } else if (device() == -1 && tensor.device() >= 0) {
+      dev2host(dst, src, nbytes, stream);
+    } else if (device() >= 0 && tensor.device() >= 0) {
+      dev2dev(dst, src, nbytes, stream);
+    } else {
+      TV_THROW_RT_ERR("only support cpu tensor");
    }
-    Tensor res(shape_, dtype_, -1, storage_->managed());
+  }
+#endif
+
+  // Returns a host copy of this tensor. A tensor already on the host
+  // (device -1) is cloned; otherwise a new tensor is built with device -1
+  // and filled via copy_().
+  Tensor cpu() const {
+    if (storage_->device() == -1) {
+      // cpu() should always copy tensor.
+      return clone();
+    }
+    // NOTE(review): clone() passes (..., device, pinned, managed) to a
+    // six-argument constructor; here managed() is the fifth argument.
+    // Confirm it is not being interpreted as the pinned flag.
+    Tensor res(shape_, stride_, dtype_, -1, storage_->managed());
    res.copy_(*this);
    return res;
  }

-  template <typename T>
-  void copy_(const TensorView<T> &tensor, int device)
-  {
+  template <typename T> void copy_(const TensorView<T> &tensor, int device) {
+    writable_check();
+    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
    Tensor src = from_blob(tensor, device);
    return copy_(src);
  }

+  // Shallow copy assignment: the storage pointer is shared, not duplicated.
+  // All view metadata is copied, including the contiguity flag, so the
+  // assigned tensor keeps the same "contiguous only" restrictions as the
+  // source.
+  Tensor &operator=(const Tensor &tensor) {
+    dtype_ = tensor.dtype_;
+    storage_ = tensor.storage_;
+    shape_ = tensor.shape_;
+    writeable_ = tensor.writeable_;
+    contiguous_ = tensor.contiguous_;
+    offset_ = tensor.offset_;
+    stride_ = tensor.stride_;
+    return *this;
+  }
+
+  // Shallow copy constructor: the storage pointer is shared, not duplicated.
+  // All view metadata is copied, including the contiguity flag, so the copy
+  // keeps the same "contiguous only" restrictions as the source.
+  Tensor(const Tensor &tensor) {
+    dtype_ = tensor.dtype_;
+    storage_ = tensor.storage_;
+    shape_ = tensor.shape_;
+    writeable_ = tensor.writeable_;
+    contiguous_ = tensor.contiguous_;
+    offset_ = tensor.offset_;
+    stride_ = tensor.stride_;
+  }
+
+  // Deep copy: allocates a new tensor with the same shape, stride, dtype and
+  // device, then copies the contents over with copy_(). `pinned` is passed
+  // through to the new tensor's constructor. Requires a non-empty,
+  // contiguous tensor.
+  Tensor clone(bool pinned = false) const {
+    TV_ASSERT_RT_ERR(!empty(), "clone a empty tensor");
+    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
+    Tensor duplicate(shape_, stride_, dtype_, device(), pinned,
+                     storage_->managed());
+    duplicate.copy_(*this);
+    return duplicate;
+  }
+
+  // Returns a new tensor holding this tensor's elements converted to `dtype`.
+  // When `dtype` equals the current dtype the result is clone(). Otherwise
+  // the tensor must be on the host (device -1), non-empty and contiguous;
+  // elements are converted one by one with std::copy. Throws
+  // std::invalid_argument when the source type is not convertible.
+  Tensor astype(DType dtype) {
+    if (dtype == dtype_) {
+      return clone();
+    }
+    TV_ASSERT_INVALID_ARG(device() == -1, "only support cpu tensor");
+    TV_ASSERT_INVALID_ARG(!empty(), "can't be used in empty tensor");
+    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
+    auto tensor = Tensor();
+    // Outer dispatch resolves the destination element type, inner dispatch
+    // the current one, so the copy below runs with both types known.
+    Dispatch<detail::all_tensor_types_t>()(dtype, [&](auto Idst) {
+      using Tdst = decltype(Idst);
+      Dispatch<detail::all_tensor_types_t>()(dtype_, [&](auto Icur) {
+        using Tcur = decltype(Icur);
+        if (std::is_convertible<Tcur, Tdst>::value) {
+          // Non-const data<>() is used here, so a read-only tensor fails
+          // writable_check() even though it is only read.
+          auto ptr = data<Tcur>();
+          tensor = Tensor(shape_, stride_, dtype, device(), pinned(),
+                          storage_->managed());
+          std::copy(ptr, ptr + size(), tensor.data<Tdst>());
+        } else {
+          TV_THROW_INVALID_ARG("not convertable from", type_s<Tcur>, "to",
+                               type_s<Tdst>);
+        }
+      });
+    });
+    return tensor;
+  }
+
+  // Invokes `f` through tv::dispatch, selecting among the candidate types
+  // Ts... by this tensor's runtime dtype.
+  template <class... Ts, typename F> inline void dispatch(F &&f) {
+    tv::dispatch<Ts...>(dtype_, std::forward<F>(f));
+  }
+
 protected:
+  // Guard used by mutating members: raises a runtime error when the tensor
+  // is marked read-only (writeable_ == false).
+  inline void writable_check() {
+    TV_ASSERT_RT_ERR(writeable_,
+                     "you cant do non-const operation when not writable");
+  }
+
  DType dtype_;
  std::shared_ptr<detail::TensorStorage<uint8_t>> storage_;
-  Shape shape_;
+  TensorShape shape_;
+  size_t offset_ = 0;
+  TensorShape stride_;
+
+private:
+  bool writeable_ = true;
+  bool contiguous_ = true;
 };

-inline Tensor from_blob(void *ptr, Shape shape, DType dtype, int device)
-{
-  return Tensor(ptr, shape, dtype, device);
+// Streams a textual representation of a host tensor (device -1) into `os`.
+// The element type is resolved from the runtime dtype; float and double
+// tensors are formatted with a precision of 4 on the helper stream that is
+// handed to TensorView::repr().
+template <typename Os> Os &operator<<(Os &os, const Tensor &tensor) {
+  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
+  Dispatch<detail::all_tensor_types_t>()(tensor.dtype(), [&](auto I) {
+    using T = decltype(I);
+    std::stringstream ss;
+    if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
+      ss << std::setprecision(4);
+    }
+    // NOTE(review): presumably repr(ss) returns something streamable (e.g.
+    // the formatted string) - confirm against TensorView::repr.
+    os << tensor.tview<T, -1, DefaultPtrTraits, int64_t>().repr(ss);
+  });
+  return os;
 }

-template <typename T>
-Tensor from_blob(TensorView<T> tensor, int device)
-{
-  return Tensor(tensor.data(), tensor.shape, type_v<T>, device);
+inline Tensor from_blob(void *ptr, TensorShape shape, DType dtype, int device) {
+  return Tensor(ptr, shape, dtype, device);
 }

-template <class... Ts, typename F>
-void dispatch(DType t, F &&f)
-{
-  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
-  bool notFound = true;
-  spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
-    if (type_v<decltype(I)> == t)
-    {
-      std::forward<F>(f)(decltype(I)());
-      notFound = false;
-    }
-  });
-  if (notFound)
-  {
-    std::stringstream ss;
-    spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
-      ss << detail::TypeToString<decltype(I)>::value << " ";
-    });
-    TV_THROW_RT_ERR("unknown type", detail::typeString(t),
-                    ", available: ", ss.str());
-  }
+inline Tensor from_blob(const void *ptr, TensorShape shape, DType dtype,
+                        int device) {
+  return Tensor(ptr, shape, dtype, device);
 }

 } // namespace tv
\ No newline at end of file
--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -17,19 +17,19 @@
 #include <cassert>
 #include <cstdlib>

+#include "common.h"
 #include "prettyprint.h"
 #include <iostream>
 #include <memory>
 #include <sstream>
 #include <type_traits>
 #include <vector>
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
 #include <cuda_runtime_api.h>
 #endif
-
 namespace tv {

-#ifdef __NVCC__
+#if (defined(__clang__) && defined(__CUDA__)) || defined(__NVCC__)

 #define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
 #define TV_DEVICE_INLINE __forceinline__ __device__
@@ -54,54 +54,6 @@ namespace tv {
    }                                                                          \
  }

-#define TV_DEVICE_REQUIRE(expr, ...)                                           \
-  {                                                                            \
-    if (!(expr) && threadIdx.x == 0)                                           \
-      printf(__VA_ARGS__);                                                     \
-    assert(expr);                                                              \
-  }
-
-template <class SStream, class T> void sstream_print(SStream &ss, T val) {
-  ss << val;
-}
-
-template <class SStream, class T, class... TArgs>
-void sstream_print(SStream &ss, T val, TArgs... args) {
-  ss << val << " ";
-  sstream_print(ss, args...);
-}
-
-template <class... TArgs> void ssprint(TArgs... args) {
-  std::stringstream ss;
-  sstream_print(ss, args...);
-  std::cout << ss.str() << std::endl;
-}
-
-#define TV_THROW_RT_ERR(...)                                                   \
-  {                                                                            \
-    std::stringstream __macro_s;                                               \
-    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
-    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
-    throw std::runtime_error(__macro_s.str());                                 \
-  }
-
-#define TV_ASSERT_RT_ERR(expr, ...)                                            \
-  {                                                                            \
-    if (!(expr))                                                               \
-      TV_THROW_RT_ERR(__VA_ARGS__);                                            \
-  }
-
-#define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
-  {                                                                            \
-    if (!(expr)) {                                                             \
-      std::stringstream __macro_s;                                             \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << #expr << " assert faild. ";                                 \
-      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
-      throw std::invalid_argument(__macro_s.str());                            \
-    }                                                                          \
-  }
-
 #define TV_CHECK_CUDA_ERR()                                                    \
  {                                                                            \
    auto __macro_err = cudaGetLastError();                                     \
@@ -109,6 +61,7 @@ template <class... TArgs> void ssprint(TArgs... args) {
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
      __macro_s << "cuda execution failed with error " << __macro_err;         \
+      TV_BACKTRACE_PRINT(__macro_s);                                           \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }
@@ -122,11 +75,12 @@ template <class... TArgs> void ssprint(TArgs... args) {
      __macro_s << "cuda execution failed with error " << __macro_err;         \
      __macro_s << " " << cudaGetErrorString(__macro_err) << "\n";             \
      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
+      TV_BACKTRACE_PRINT(__macro_s);                                           \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }

-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
 struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
  virtual cudaStream_t getStream() const { return mStream; }
@@ -135,7 +89,18 @@ struct GPU {
 #endif
 struct CPU {};

+#ifndef TV_MAX_DIM
 #define TV_MAX_DIM 6
+#endif
+
+template <typename T> struct DefaultPtrTraits { typedef T *type; };
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <typename T> struct RestrictPtrTraits {
+  typedef T *__restrict__ type;
+};
+#endif
+
 /*
 template <typename T>
 constexpr size_t calc_align(size_t ndim)
@@ -160,57 +125,73 @@ template <typename T, size_t MaxDim = TV_MAX_DIM>
 struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
 public:
  TV_HOST_DEVICE_INLINE SimpleVector(){};
+  // Creates a vector of `count` elements, each initialized to `init`.
+  // `count` must not exceed MaxDim, the capacity of the inline array.
+  TV_HOST_DEVICE_INLINE SimpleVector(size_t count, T init = T())
+      : size_(count) {
+    // Same capacity guard as the other constructors; without it a large
+    // count writes past the end of array_.
+    TV_ASSERT(count <= MaxDim);
+    for (size_t i = 0; i < count; ++i) {
+      array_[i] = init;
+    }
+  };
+  // Fills the vector from the range [first, last). Throws
+  // std::invalid_argument via TV_THROW_INVALID_ARG when the range holds more
+  // than MaxDim elements.
+  template <typename Iterator> SimpleVector(Iterator first, Iterator last) {
+    size_t filled = 0;
+    while (first != last) {
+      if (filled >= MaxDim) {
+        TV_THROW_INVALID_ARG("iterator too long");
+      }
+      array_[filled] = *first;
+      ++filled;
+      ++first;
+    }
+    size_ = filled;
+  };
  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
    TV_ASSERT(q.size() <= MaxDim);
-    mSize = 0;
+    size_ = 0;
    for (T s : q) {
-      mArray[mSize++] = s;
+      array_[size_++] = s;
    }
-    mSize = q.size();
+    size_ = q.size();
  }
  SimpleVector(const std::vector<T> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
-      mArray[i] = arr[i];
+      array_[i] = arr[i];
    }
-    mSize = arr.size();
+    size_ = arr.size();
  }
  TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
-      mArray[i] = arr[i];
+      array_[i] = arr[i];
    }
-    mSize = arr.size();
+    size_ = arr.size();
  }
  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
 #ifdef TV_DEBUG
-    TV_ASSERT(idx >= 0 && idx < mSize);
+    TV_ASSERT(idx >= 0 && idx < size_);
 #endif
-    return mArray[idx];
+    return array_[idx];
  }
  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
 #ifdef TV_DEBUG
-    TV_ASSERT(idx >= 0 && idx < mSize);
+    TV_ASSERT(idx >= 0 && idx < size_);
 #endif
-    return mArray[idx];
+    return array_[idx];
  }
  TV_HOST_DEVICE_INLINE void push_back(T s) {
 #ifdef TV_DEBUG
-    TV_ASSERT(mSize < MaxDim);
+    TV_ASSERT(size_ < MaxDim);
 #endif
-    mArray[mSize] = s;
-    mSize++;
+    array_[size_] = s;
+    size_++;
  }
  TV_HOST_DEVICE_INLINE void pop_back() {
 #ifdef TV_DEBUG
-    TV_ASSERT(mSize > 0);
+    TV_ASSERT(size_ > 0);
 #endif
-    mSize--;
+    size_--;
  }

-  TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
-  TV_HOST_DEVICE_INLINE const T *data() const { return mArray; }
-  TV_HOST_DEVICE_INLINE size_t empty() const { return mSize == 0; }
+  TV_HOST_DEVICE_INLINE size_t size() const { return size_; }
+  TV_HOST_DEVICE_INLINE const T *data() const { return array_; }
+  TV_HOST_DEVICE_INLINE T *data() { return array_; }
+  TV_HOST_DEVICE_INLINE size_t empty() const { return size_ == 0; }

  typedef size_t size_type;

@@ -234,10 +215,10 @@ public:
    }
    TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
-    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
+    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
-    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
+    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

@@ -265,10 +246,10 @@ public:
    }
    TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
-    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
+    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
-    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
+    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

@@ -276,28 +257,28 @@ public:
    pointer ptr_;
  };

-  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
+  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(array_); }

-  TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
+  TV_HOST_DEVICE_INLINE iterator end() { return iterator(array_ + size_); }

  TV_HOST_DEVICE_INLINE const_iterator begin() const {
-    return const_iterator(mArray);
+    return const_iterator(array_);
  }

  TV_HOST_DEVICE_INLINE const_iterator end() const {
-    return const_iterator(mArray + mSize);
+    return const_iterator(array_ + size_);
  }
  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
-    return const_iterator(mArray);
+    return const_iterator(array_);
  }

  TV_HOST_DEVICE_INLINE const_iterator cend() const {
-    return const_iterator(mArray + mSize);
+    return const_iterator(array_ + size_);
  }

 protected:
-  T mArray[MaxDim];
-  size_t mSize = 0;
+  T array_[MaxDim];
+  size_t size_ = 0;
 };

 template <typename T, size_t MaxDim>
@@ -323,28 +304,28 @@ struct Slice {
  template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    SimpleVector<int, 3> slices{int(ints)...};
-    mSlices[0] = -1;
-    mSlices[1] = -1;
-    mSlices[2] = -1;
+    slices_[0] = -1;
+    slices_[1] = -1;
+    slices_[2] = -1;
    for (size_t i = 0; i < slices.size(); ++i) {
-      mSlices[i] = slices[i];
+      slices_[i] = slices[i];
    }
  }

  TV_HOST_DEVICE_INLINE Slice() {
-    mSlices[0] = -1;
-    mSlices[1] = -1;
-    mSlices[2] = -1;
+    slices_[0] = -1;
+    slices_[1] = -1;
+    slices_[2] = -1;
  }
  template <typename T>
  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
-    mSlices[0] = -1;
-    mSlices[1] = -1;
-    mSlices[2] = -1;
+    slices_[0] = -1;
+    slices_[1] = -1;
+    slices_[2] = -1;
    TV_ASSERT(slice.size() <= 3);
    int idx = 0;
    for (T s : slice) {
-      mSlices[idx] = int(s);
+      slices_[idx] = int(s);
      ++idx;
    }
  }
@@ -352,90 +333,124 @@ struct Slice {
 #ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
 #endif
-    return mSlices[idx];
+    return slices_[idx];
  }
  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
 #ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
 #endif
-    return mSlices[idx];
+    return slices_[idx];
  }

 protected:
-  int mSlices[3];
+  int slices_[3];
 };

-template <size_t MaxDim = TV_MAX_DIM>
-struct ShapeBase : public SimpleVector<int, MaxDim> {
-  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
-  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
-      : SimpleVector<int, MaxDim>(shape) {}
-  TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<int, MaxDim> vec)
-      : SimpleVector<int, MaxDim>(vec) {}
+template <size_t MaxDim = TV_MAX_DIM, typename Tindex = int>
+struct ShapeBase : public SimpleVector<Tindex, MaxDim> {
+  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<Tindex, MaxDim>(){};
+  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<Tindex> shape)
+      : SimpleVector<Tindex, MaxDim>(shape) {}
+  TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<Tindex, MaxDim> vec)
+      : SimpleVector<Tindex, MaxDim>(vec) {}
  template <typename T, template <class...> class Container>
-  ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
+  ShapeBase(Container<T> shape) : SimpleVector<Tindex, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
-      : SimpleVector<int, MaxDim>(shape) {}
-  ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
-
-  ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
-  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
+      : SimpleVector<Tindex, MaxDim>(shape) {}
+  ShapeBase(const std::vector<Tindex> &arr)
+      : SimpleVector<Tindex, MaxDim>(arr) {}
+
+  ShapeBase<MaxDim, Tindex> &
+  operator=(const ShapeBase<MaxDim, Tindex> &shape) = default;
+  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start,
+                                                    Tindex end) const {
 #ifdef TV_DEBUG
-    TV_ASSERT(start >= 0 && end <= this->mSize && end > start);
+    TV_ASSERT(start >= 0 && end <= this->size_ && end > start);
 #endif
-    ShapeBase<MaxDim> shape;
-    for (int i = start; i < end; ++i) {
-      shape.push_back(this->mArray[i]);
+    ShapeBase<MaxDim, Tindex> shape;
+    for (Tindex i = start; i < end; ++i) {
+      shape.push_back(this->array_[i]);
    }
    return shape;
  }
-  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
+  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start) const {
 #ifdef TV_DEBUG
-    TV_ASSERT(start >= 0 && start <= this->mSize);
+    TV_ASSERT(start >= 0 && start <= this->size_);
 #endif
-    ShapeBase<MaxDim> shape;
-    for (int i = start; i < this->mSize; ++i) {
-      shape.push_back(this->mArray[i]);
+    ShapeBase<MaxDim, Tindex> shape;
+    for (size_t i = start; i < this->size_; ++i) {
+      shape.push_back(this->array_[i]);
    }
    return shape;
  }

-  TV_HOST_DEVICE_INLINE size_t size() const {
-    if (this->mSize == 0)
+  TV_HOST_DEVICE size_t size() const {
+    if (this->size_ == 0)
      return 0;
    size_t s = 1;
-    for (int i = 0; i < int(this->mSize); ++i) {
-      s *= this->mArray[i];
+    for (int i = 0; i < int(this->size_); ++i) {
+      s *= this->array_[i];
    }
    return s;
  }
-  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
-  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
-    ShapeBase<MaxDim> shape;
-    for (int i = 0; i < this->mSize; ++i) {
-      if (this->mArray[i] != 1)
-        shape.push_back(this->mArray[i]);
+  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->size_; }
+
+  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> squeeze() const {
+    ShapeBase<MaxDim, Tindex> shape;
+    for (size_t i = 0; i < this->size_; ++i) {
+      if (this->array_[i] != 1)
+        shape.push_back(this->array_[i]);
+    }
+    if (shape.empty()) {
+      // dont support empty shape for now
+      shape.push_back(1);
+    }
+    return shape;
+  }
+  template <size_t MaxDim2 = MaxDim>
+  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> squeeze(int dim) const {
+    static_assert(MaxDim2 >= MaxDim - 1, "error");
+
+    ShapeBase<MaxDim2, Tindex> shape;
+    for (size_t i = 0; i < this->size_; ++i) {
+      if (i != size_t(dim) || this->array_[i] != 1)
+        shape.push_back(this->array_[i]);
    }
    return shape;
  }
-  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
-    ShapeBase<MaxDim> shape;
-    for (int i = 0; i < this->mSize; ++i) {
-      if (i != dim || this->mArray[i] != 1)
-        shape.push_back(this->mArray[i]);
+  // Returns a copy of this shape with an extent-1 axis inserted in front of
+  // position `dim`. `dim == ndim()` appends the new axis at the end.
+  template <size_t MaxDim2 = MaxDim>
+  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> unsqueeze(int dim) const {
+    static_assert(MaxDim2 >= MaxDim - 1, "error");
+    ShapeBase<MaxDim2, Tindex> shape;
+    for (size_t i = 0; i < this->size_; ++i) {
+      if (i == size_t(dim))
+        shape.push_back(1);
+      shape.push_back(this->array_[i]);
    }
+    // The loop only inserts in front of an existing axis; handle appending
+    // after the last axis here so the request is not silently ignored.
+    if (size_t(dim) == this->size_)
+      shape.push_back(1);
    return shape;
  }
-  TV_HOST_DEVICE size_t prod() const {
+
+  TV_HOST_DEVICE size_t prod(Tindex start = 0) const {
    size_t res = 1;
-    for (size_t i = 0; i < this->mSize; ++i) {
-      res *= this->mArray[i];
+    for (size_t i = start; i < this->size_; ++i) {
+      res *= this->array_[i];
+    }
+    return res;
+  }
+  // Row-major (C-order) strides, in elements, for this shape: the last axis
+  // has stride 1 and each earlier axis has the product of all later extents.
+  // Declared const so it can be called on const shapes.
+  template <size_t MaxDim2 = MaxDim>
+  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> stride_rowmajor() const {
+    static_assert(MaxDim2 >= MaxDim, "error");
+    Tindex p = Tindex(1);
+    ShapeBase<MaxDim2, Tindex> res(this->size_);
+    // Count down with size_t so the loop also terminates when Tindex is an
+    // unsigned type (where `i >= 0` would always be true).
+    for (size_t i = this->size_; i-- > 0;) {
+      res[i] = p;
+      p *= this->array_[i];
    }
    return res;
  }
 };

-using Shape = ShapeBase<TV_MAX_DIM>;
+using Shape = ShapeBase<TV_MAX_DIM, int>;

 template <class... Inds>
 TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
@@ -446,7 +461,9 @@ TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
 #ifdef TV_DEBUG
  TV_ASSERT(sizeof...(indexes) == shape.size());
 #endif
+#if defined(__CUDA_ARCH__)
 #pragma unroll
+#endif
  for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
@@ -471,7 +488,9 @@ TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
  unsigned offset = 0;
  unsigned m = 1;
  int indexes_vec[sizeof...(indexes)] = {indexes...};
+#if defined(__CUDA_ARCH__)
 #pragma unroll
+#endif
  for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
@@ -495,7 +514,9 @@ TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
                                           const Index *shape) {
  unsigned offset = 0;
  unsigned m = 1;
+#if defined(__CUDA_ARCH__)
 #pragma unroll
+#endif
  for (int i = NDim - 1; i >= 0; --i) {
    offset += m * indexes[i];
    m *= shape[i];
@@ -515,416 +536,501 @@ TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
  return index;
 }

-template <int N> struct ArrayIndexRowMajor {
-  // mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
-  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
-                                            const Shape &indexes) {
-    return indexes[N - 1] +
-           shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
+// Decomposes the flat row-major offset `index` into per-dimension
+// coordinates, with the rank supplied at run time. Coordinates for the first
+// `ndim` entries of `shape` are written to `output`, processing the last
+// dimension first. Returns what remains of `index` once every extent has
+// been divided out.
+template <typename Index>
+TV_HOST_DEVICE Index rowArrayIdxInv(Index index, Index *output,
+                                    const Index *shape, int ndim) {
+  int axis = ndim;
+  while (axis-- > 0) {
+    // Remainder is the coordinate along this axis.
+    output[axis] = index % shape[axis];
+    index -= output[axis];
+    index /= shape[axis];
+  }
+  return index;
+}
+
+// Compile-time recursion computing a row-major flat offset from N indices
+// that are passed in reverse order: the first index argument belongs to the
+// last dimension (shape[N - 1]), the next one to shape[N - 2], and so on.
+// The recursion ends at the ArrayIndexRowMajorReverse<1> specialization.
+template <int N> struct ArrayIndexRowMajorReverse {
+  // Overload for a shape given as a raw pointer to extents.
+  template <typename TShape, typename T, class... Ts>
+  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T index,
+                                            Ts... inds) {
+    return index +
+           shape[N - 1] * ArrayIndexRowMajorReverse<N - 1>::run(shape, inds...);
+  }
+  // Overload for a shape given as a Shape object. It must recurse through
+  // runShape: run() expects a pointer and cannot be called with a Shape.
+  template <typename T, class... Ts>
+  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T index,
+                                                 Ts... inds) {
+    return index + shape[N - 1] *
+                       ArrayIndexRowMajorReverse<N - 1>::runShape(shape,
+                                                                  inds...);
+  }
+};
+
+// Terminal case of the reverse-order recursion: with a single index left the
+// offset is that index itself; the shape argument is not read.
+template <> struct ArrayIndexRowMajorReverse<1> {
+  template <typename TShape, typename T>
+  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T idx) {
+    return idx;
+  }
+  template <typename T>
+  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T idx) {
+    return idx;
+  }
+};
+
+// Compile-time recursion computing a row-major flat offset from Ndim indices
+// given in natural order (first dimension first). N is the number of indices
+// still to be consumed; callers start with N == Ndim and start == 0.
+// Each step folds one index into the running value:
+//   start' = (index + start) * shape[Ndim - N + 1]
+// i.e. the accumulated offset is scaled by the extent of the next dimension.
+// The ArrayIndexRowMajor<1, Ndim> specialization adds the final index.
+template <int N, int Ndim> struct ArrayIndexRowMajor {
+  // The author notes that this form compiles to almost the same code as the
+  // hand-written expression; see https://godbolt.org/ to compare.
+  // Overload for a shape given as a raw pointer to extents.
+  template <typename TShape, typename Tinit, typename T, class... Ts>
+  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
+                                            T index, Ts... inds) {
+    return ArrayIndexRowMajor<N - 1, Ndim>::run(
+        shape, (index + start) * shape[Ndim - N + 1], inds...);
+  }
+  // Overload for a shape given as a Shape object.
+  template <typename Tinit, typename T, class... Ts>
+  TV_HOST_DEVICE_INLINE static unsigned
+  runShape(const Shape &shape, Tinit start, T index, Ts... inds) {
+    return ArrayIndexRowMajor<N - 1, Ndim>::runShape(
+        shape, (index + start) * shape[Ndim - N + 1], inds...);
+  }
+};
+
+// Terminal case of the row-major recursion: one index is left, so the result
+// is the accumulated value plus that index. The shape is not read here.
+template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
+  template <typename TShape, typename Tinit, typename T>
+  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
+                                            T idx) {
+    return start + idx;
+  }
+  template <typename Tinit, typename T>
+  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
+                                                 Tinit start, T idx) {
+    return start + idx;
  }
 };

-template <> struct ArrayIndexRowMajor<0> {
-  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
-                                            const Shape &indexes) {
+// Zero-index case: with no indices supplied the offset is always 0; neither
+// the shape nor the start value is read.
+template <> struct ArrayIndexRowMajor<0, 0> {
+  template <typename TShape, typename Tinit>
+  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start) {
+    return 0;
+  }
+  template <typename Tinit>
+  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
+                                                 Tinit start) {
    return 0;
  }
 };

-namespace detail {
-template <typename T> constexpr const char *simpleTypeName(T val = T());
-template <> constexpr const char *simpleTypeName(float val) {
-  return "float32";
-}
-template <> constexpr const char *simpleTypeName(double val) {
-  return "float64";
-}
-template <> constexpr const char *simpleTypeName(int val) { return "int32"; }
-template <> constexpr const char *simpleTypeName(unsigned val) {
-  return "uint32";
-}
-template <> constexpr const char *simpleTypeName(long val) { return "int64"; }
-template <> constexpr const char *simpleTypeName(unsigned long val) {
-  return "uint64";
+// Compile-time recursion computing a flat offset for strided storage:
+//   offset = start + sum over k of index[k] * stride[k]
+// with indices given in natural order (first dimension first). N is the
+// number of indices still to be consumed; callers start with N == Ndim.
+// The ArrayIndexStride<1, Ndim> specialization handles the last index.
+template <int N, int Ndim> struct ArrayIndexStride {
+  template <typename TShape, typename Tinit, typename T, class... Ts>
+  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
+                                            T index, Ts... inds) {
+    // The index consumed at this step is the (Ndim - N)-th one, so it pairs
+    // with stride[Ndim - N]. (An offset of +1 here would pair every index
+    // with the stride of the following dimension and never use stride[0].)
+    return ArrayIndexStride<N - 1, Ndim>::run(
+        stride, start + index * stride[Ndim - N], inds...);
+  }
+};
+
+// Terminal case of the strided recursion: the last index is multiplied by
+// the stride of the last dimension and added to the accumulated offset.
+template <int Ndim> struct ArrayIndexStride<1, Ndim> {
+  template <typename TShape, typename Tinit, typename T>
+  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
+                                            T idx) {
+    return start + idx * stride[Ndim - 1];
+  }
+};
+
+// C++17-only alternative built on a fold expression. The positions N... must
+// be given explicitly by the caller, e.g. array_index_stride<0, 1, 2>(s, i,
+// j, k); the result is the sum of stride[N] * ids[N] over every listed N.
+// NOTE(review): relies on std::get / std::forward_as_tuple, so <tuple> must
+// be included by this header - confirm.
+#if __cplusplus >= 201703L
+template <size_t... N, class T, class... Ts>
+TV_HOST_DEVICE_INLINE T array_index_stride(const T *stride, Ts... ids) {
+  return ((stride[N] * std::get<N>(std::forward_as_tuple(ids...))) + ...);
 }
-}; // namespace detail
+#endif

-template <typename T, int Rank = -1> struct TensorView {
-  TV_HOST_DEVICE_INLINE TensorView() {}
-  explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
-      : mPtr(ptr), mShape(shape) {}
-  // explicit TV_HOST_DEVICE_INLINE TensorView(const
-  // TensorView<std::remove_const_t<T>> &tview) : mPtr(tview.data()),
-  // mShape(tview.shape()) {}
-  template <class... Integers>
-  explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Integers... shapes)
-      : mPtr(ptr) {
-    mShape = {int(shapes)...};
+namespace detail {
+// Maps a scalar type to a short printable name, exposed as the static member
+// `value`. The primary template is left undefined on purpose: asking for the
+// name of a type that has no specialization below is a compile-time error.
+template <typename T> struct TypeToString;
+// A const-qualified type has the same name as the unqualified type, so one
+// partial specialization covers every `const X` instead of a hand-written
+// copy per type.
+template <typename T> struct TypeToString<const T> : TypeToString<T> {};
+template <> struct TypeToString<bool> {
+  static constexpr const char *value = "bool";
+};
+template <> struct TypeToString<int32_t> {
+  static constexpr const char *value = "int32";
+};
+template <> struct TypeToString<float> {
+  static constexpr const char *value = "float";
+};
+template <> struct TypeToString<double> {
+  static constexpr const char *value = "double";
+};
+template <> struct TypeToString<int16_t> {
+  static constexpr const char *value = "int16";
+};
+template <> struct TypeToString<int8_t> {
+  static constexpr const char *value = "int8";
+};
+template <> struct TypeToString<int64_t> {
+  static constexpr const char *value = "int64";
+};
+template <> struct TypeToString<uint8_t> {
+  static constexpr const char *value = "uint8";
+};
+template <> struct TypeToString<uint16_t> {
+  static constexpr const char *value = "uint16";
+};
+template <> struct TypeToString<uint32_t> {
+  static constexpr const char *value = "uint32";
+};
+template <> struct TypeToString<uint64_t> {
+  static constexpr const char *value = "uint64";
+};
+} // namespace detail
+
+// Variable-template shorthand for detail::TypeToString<T>::value, e.g.
+// type_s<float> is "float".
+template <typename T>
+constexpr const char *type_s = detail::TypeToString<T>::value;
+
+namespace detail {
+
+// Common part of the strided accessors: a non-owning pair of a data pointer
+// and a pointer to the strides of the remaining `Rank` dimensions. Neither
+// pointer is owned; both must stay valid while the accessor is used.
+template <typename T, int Rank,
+          template <class> class PtrTraits = DefaultPtrTraits,
+          typename Tindex = int>
+struct TensorAccesserBase {
+  static constexpr int rank_value = Rank;
+  using ptr_t = typename PtrTraits<T>::type;
+
+  static_assert(Rank > 0, "error");
+
+  // Initializers are listed in member declaration order (stride_ptr_ first,
+  // then ptr_) so the written order matches the order in which they run.
+  explicit TV_HOST_DEVICE_INLINE TensorAccesserBase(ptr_t ptr,
+                                                    const Tindex *stride_ptr)
+      : stride_ptr_(stride_ptr), ptr_(ptr) {}
+
+  TV_HOST_DEVICE_INLINE ptr_t data() { return ptr_; }
+  TV_HOST_DEVICE_INLINE const ptr_t data() const { return ptr_; }
+
+  // Element access with exactly `Rank` indices; the offset is computed from
+  // the strides by ArrayIndexStride.
+  template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
+    static_assert(sizeof...(inds) == Rank, "error");
+    return ptr_[ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...)];
  }
-  operator TensorView<const T>() {
-    return TensorView<const T>(mPtr, mShape);
-  } // conversion function

-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
-  assign(const TensorView<T, Rank> &tensor) {
-    TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
-               "\n");
-    T *ptr = mPtr;
-    const T *other_ptr = tensor.data();
-    for (size_t i = 0; i < size(); ++i)
-      *(ptr++) = *(other_ptr++);
-    return *this;
+  template <class... Inds>
+  TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
+    static_assert(sizeof...(inds) == Rank, "error");
+    return ptr_[ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...)];
  }

-  template <typename T1>
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
-  assign(std::initializer_list<T1> seq) {
-    TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
-               "\n");
-    T *ptr = mPtr;
-    for (const T1 &s : seq)
-      *(ptr++) = T(s);
-    return *this;
+protected:
+  const Tindex *stride_ptr_;
+  ptr_t ptr_;
+};
+} // namespace detail
+
+// Strided accessor over `Rank` dimensions. Indexing with [] fixes the first
+// dimension and yields an accessor of rank Rank - 1 whose data pointer is
+// advanced by i * stride[0] and whose stride pointer skips the first stride.
+template <typename T, int Rank,
+          template <class> class PtrTraits = DefaultPtrTraits,
+          typename Tindex = int>
+struct TensorAccesser
+    : public detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex> {
+  using ptr_t = typename PtrTraits<T>::type;
+  static_assert(Rank > 0, "error");
+  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
+                                                const Tindex *stride_ptr)
+      : detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex>(ptr,
+                                                               stride_ptr) {}
+
+  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
+  operator[](int i) {
+    ptr_t sub_ptr = this->ptr_ + this->stride_ptr_[0] * i;
+    const Tindex *sub_stride = this->stride_ptr_ + 1;
+    return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(sub_ptr, sub_stride);
+  }
+  // Const overload: same slicing, usable on a const accessor.
+  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
+  operator[](int i) const {
+    ptr_t sub_ptr = this->ptr_ + this->stride_ptr_[0] * i;
+    const Tindex *sub_stride = this->stride_ptr_ + 1;
+    return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(sub_ptr, sub_stride);
+  }
+};
+
+// Rank-1 specialization: [] returns a reference to the element at offset
+// i * stride[0] instead of a lower-rank accessor. The const overload also
+// returns a non-const T&.
+template <typename T, template <class> class PtrTraits, typename Tindex>
+struct TensorAccesser<T, 1, PtrTraits, Tindex>
+    : public detail::TensorAccesserBase<T, 1, PtrTraits, Tindex> {
+  using ptr_t = typename PtrTraits<T>::type;
+
+  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
+                                                const Tindex *stride_ptr)
+      : detail::TensorAccesserBase<T, 1, PtrTraits, Tindex>(ptr, stride_ptr) {}
+
+  TV_HOST_DEVICE_INLINE T &operator[](int i) {
+    return this->ptr_[this->stride_ptr_[0] * i];
+  }
+  TV_HOST_DEVICE_INLINE T &operator[](int i) const {
+    return this->ptr_[this->stride_ptr_[0] * i];
  }
+};
+
+template <typename T, int Rank = -1,
+          template <class> class PtrTraits = DefaultPtrTraits,
+          typename Tindex = int>
+struct TensorView {
+  static constexpr int rank_value = Rank;
+  using ptr_t = typename PtrTraits<T>::type;
+  using tv_shape_t = ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex>;
+  using no_cv_type = typename std::remove_cv<T>::type;
+  static_assert(Rank == -1 || Rank > 0, "error");
+
+  TV_HOST_DEVICE_INLINE TensorView() {}
+  explicit TV_HOST_DEVICE_INLINE TensorView(ptr_t ptr, tv_shape_t shape)
+      : ptr_(ptr), shape_(shape), stride_(shape.stride_rowmajor()) {}
+
+  explicit TV_HOST_DEVICE_INLINE TensorView(ptr_t ptr, tv_shape_t shape,
+                                            tv_shape_t stride)
+      : ptr_(ptr), shape_(shape), stride_(stride) {}
+
+  // Implicit conversion to a read-only view over the same data. The stride
+  // is forwarded together with the shape so that a view constructed with
+  // explicit strides keeps them; the two-argument constructor would replace
+  // them with row-major strides derived from the shape.
+  operator TensorView<const no_cv_type, Rank, PtrTraits, Tindex>() {
+    return TensorView<const no_cv_type, Rank, PtrTraits, Tindex>(ptr_, shape_,
+                                                                 stride_);
  } // conversion function

  template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
-#ifdef TV_DEBUG
+    static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
+#if defined TV_DEBUG
    int idxes[sizeof...(Inds)]{int(inds)...};
-    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
+    TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
               "you provide %d indexes, but dim is %d\n", sizeof...(inds),
-               mShape.ndim());
+               shape_.ndim());
    for (int i = 0; i < sizeof...(inds); ++i) {
-      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
+      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
-                 mShape[i]);
+                 shape_[i]);
    }
 #endif
-    return mPtr[rowArrayIdx(mShape, int(inds)...)];
+    constexpr int Ndim = sizeof...(Inds);
+    return ptr_[ArrayIndexRowMajor<Ndim, Ndim>::runShape(shape_, 0, inds...)];
  }
  template <class... Inds>
  TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
-#ifdef TV_DEBUG
+    static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
+#if defined TV_DEBUG
    int idxes[sizeof...(Inds)]{int(inds)...};
-    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
+    TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
               "you provide %d indexes, but dim is %d\n", sizeof...(inds),
-               mShape.ndim());
+               shape_.ndim());
    for (int i = 0; i < sizeof...(inds); ++i) {
-      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
+      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
-                 mShape[i]);
+                 shape_[i]);
    }
 #endif
-    return mPtr[rowArrayIdx(mShape, int(inds)...)];
+    constexpr int Ndim = sizeof...(Inds);
+    return ptr_[ArrayIndexRowMajor<Ndim, Ndim>::runShape(shape_, 0, inds...)];
  }
  TV_HOST_DEVICE_INLINE T &operator()() {
+    static_assert(Rank == -1 || 0 == Rank, "error");
 #if defined TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mPtr != nullptr,
-                      "you want get value but the view is empty.%s", "\n");
-    TV_DEVICE_REQUIRE(mShape.ndim() == 0,
-                      "you provide 0 indexes, but dim is %ld\n", mShape.ndim());
-#else
-    TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
+    TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
               "\n");
-    TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
-               mShape.ndim());
+    TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
+               shape_.ndim());
 #endif
-#endif
-    return mPtr[0];
+    return ptr_[0];
  }
  TV_HOST_DEVICE_INLINE const T &operator()() const {
+    static_assert(Rank == -1 || 0 == Rank, "error");
 #if defined TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mPtr != nullptr,
-                      "you want get value but the view is empty.%s", "\n");
-    TV_DEVICE_REQUIRE(mShape.ndim() == 0,
-                      "you provide 0 indexes, but dim is %ld\n", mShape.ndim());
-#else
-    TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
+    TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
               "\n");
-    TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
-               mShape.ndim());
+    TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
+               shape_.ndim());
 #endif
-#endif
-    return mPtr[0];
+    return ptr_[0];
  }
-
  template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
+    static_assert(Rank == -1 || 1 == Rank, "error");
 #if defined TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
-                      "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
-    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
-#else
-    TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
-               mShape.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
-#endif
+    TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
+               shape_.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, i1, shape_[0]);
 #endif
-    return mPtr[i1];
+    return ptr_[i1];
  }
  template <class T1, class T2>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
-#ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mShape.ndim() == 2,
-                      "you provide 2 indexes, but dim is %ld\n", mShape.ndim());
-    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
-                      mShape[0]);
-    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
-                      mShape[1]);
-#else
-    TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
-               mShape.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
-#endif
+    static_assert(Rank == -1 || 2 == Rank, "error");
+#if defined TV_DEBUG
+    TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
+               shape_.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
 #endif
-    return mPtr[i1 * mShape[1] + i2];
+    return ptr_[i1 * shape_[1] + i2];
  }
  template <class T1, class T2, class T3>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
-#ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mShape.ndim() == 3,
-                      "you provide 3 indexes, but dim is %ld\n", mShape.ndim());
-    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
-                      mShape[0]);
-    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
-                      mShape[1]);
-    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
-                      mShape[2]);
-#else
-    TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
-               mShape.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
-    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
-               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
-#endif
+    static_assert(Rank == -1 || 3 == Rank, "error");
+#if defined TV_DEBUG
+    TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
+               shape_.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
+    TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
+               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
 #endif
-    return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
+    return ptr_[(i1 * shape_[1] + i2) * shape_[2] + i3];
  }
  template <class T1, class T2, class T3, class T4>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
-#ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mShape.ndim() == 4,
-                      "you provide 4 indexes, but dim is %ld\n", mShape.ndim());
-    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
-                      mShape[0]);
-    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
-                      mShape[1]);
-    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
-                      mShape[2]);
-    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
-                      mShape[3]);
-#else
-    TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
-               mShape.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
-    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
-               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
-    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
-               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
-#endif
+    static_assert(Rank == -1 || 4 == Rank, "error");
+#if defined TV_DEBUG
+    TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
+               shape_.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
+    TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
+               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
+    TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
+               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
 #endif
-    return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
+    return ptr_[((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4];
  }

  template <class T1> TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
-#ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
-                      "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
-    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
-                      mShape[0]);
-#else
-    TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
-               mShape.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
-#endif
+    static_assert(Rank == -1 || 1 == Rank, "error");
+#if defined TV_DEBUG
+    TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
+               shape_.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
 #endif
-    return mPtr[i1];
+    return ptr_[i1];
  }
  template <class T1, class T2>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
-#ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mShape.ndim() == 2,
-                      "you provide 2 indexes, but dim is %ld\n", mShape.ndim());
-    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
-                      mShape[0]);
-    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
-                      mShape[1]);
-#else
-    TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
-               mShape.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
-
-#endif
+    static_assert(Rank == -1 || 2 == Rank, "error");
+#if defined TV_DEBUG
+    TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
+               shape_.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
 #endif
-    return mPtr[i1 * mShape[1] + i2];
+    return ptr_[i1 * shape_[1] + i2];
  }
  template <class T1, class T2, class T3>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
-#ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mShape.ndim() == 3,
-                      "you provide 3 indexes, but dim is %ld\n", mShape.ndim());
-    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
-                      mShape[0]);
-    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
-                      mShape[1]);
-    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
-                      mShape[2]);
-#else
-    TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
-               mShape.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
-    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
-               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
-#endif
+    static_assert(Rank == -1 || 3 == Rank, "error");
+#if defined TV_DEBUG
+    TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
+               shape_.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
+    TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
+               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
 #endif
-    return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
+    return ptr_[(i1 * shape_[1] + i2) * shape_[2] + i3];
  }
  template <class T1, class T2, class T3, class T4>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
-#ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(mShape.ndim() == 4,
-                      "you provide 4 indexes, but dim is %ld\n", mShape.ndim());
-    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
-                      mShape[0]);
-    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
-                      mShape[1]);
-    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
-                      mShape[2]);
-    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
-                      "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
-                      mShape[3]);
-#else
-    TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
-               mShape.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
-    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
-               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
-    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
-               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
-#endif
+    static_assert(Rank == -1 || 4 == Rank, "error");
+#if defined TV_DEBUG
+    TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
+               shape_.ndim());
+    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
+               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
+    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
+               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
+    TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
+               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
+    TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
+               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
 #endif
-    return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
+    return ptr_[((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4];
  }

  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
 #ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
-                      "index(%d) out-of-range: [0, %ld)\n", int(idx), size());
-#else
    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
               int(idx), size());
 #endif
-#endif
-    return mPtr[idx];
+    return ptr_[idx];
  }
+
  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
 #ifdef TV_DEBUG
-#if defined(__CUDA_ARCH__)
-    TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
-                      "index(%d) out-of-range: [0, %ld)\n", int(idx), size());
-#else
    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
               int(idx), size());
 #endif
-#endif
-    return mPtr[idx];
+    return ptr_[idx];
  }

-  // TODO: this is conflcit with operator[](SimpleVector<Slice> slice_vec).
-  /*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
-    int idx = rowArrayIdx(mShape, index);
-#ifdef TV_DEBUG
-    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
-                int(idx), size());
-#endif
-    return mPtr[idx];
+  // Accessor for the sub-tensor at position `idx` of the first dimension:
+  // the data pointer is advanced by stride_[0] * idx and the remaining
+  // strides start at the second entry. Only available when Rank > 1.
+  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
+  accessor(Tindex idx) {
+    static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
+    return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(
+        ptr_ + stride_[0] * idx, stride_.data() + 1);
  }
-  TV_HOST_DEVICE_INLINE const T &operator[](const Shape index) const {
-    int idx = rowArrayIdx(mShape, index);
-#ifdef TV_DEBUG
-    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
-                int(idx), size());
-#endif
-    return mPtr[idx];
-  }*/
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank>
-  operator[](SimpleVector<Slice> slice_vec) {
-    return _subview(slice_vec);
-  }
-  TV_HOST_DEVICE_INLINE const TensorView<T, Rank>
-  operator[](SimpleVector<Slice> slice_vec) const {
-    return _subview(slice_vec);
-  }
-  TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
-  TV_HOST_DEVICE_INLINE T *data() { return mPtr; }
-  TV_HOST_DEVICE_INLINE const T *data() const { return mPtr; }
-  TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
-  TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
-  TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
-  template <class... Inds>
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Inds... newShapes) {
-    Shape shapes{int(newShapes)...};
-    TV_ASSERT(shapes.size() == size());
-    mShape = shapes;
-    return *this;
+  // Accessor over the whole view. It stores a pointer into this view's
+  // stride storage, so the view must outlive the accessor. Requires a
+  // static Rank greater than zero.
+  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() {
+    static_assert(Rank > 0, "rank must higher than zero");
+    return TensorAccesser<T, Rank, PtrTraits, Tindex>(ptr_, stride_.data());
  }
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Shape shapes) {
-    TV_ASSERT(shapes.size() == size());
-    mShape = shapes;
-    return *this;
+  // Const overload of accessor(idx): accessor for the sub-tensor at position
+  // `idx` of the first dimension. Only available when Rank > 1.
+  TV_HOST_DEVICE_INLINE
+  TensorAccesser<T, Rank - 1, PtrTraits, Tindex> accessor(Tindex idx) const {
+    static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
+    return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(
+        ptr_ + stride_[0] * idx, stride_.data() + 1);
+  }
+  // Const overload of accessor(): accessor over the whole view. It stores a
+  // pointer into this view's stride storage, so the view must outlive the
+  // accessor. Requires a static Rank greater than zero.
+  TV_HOST_DEVICE_INLINE
+  TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() const {
+    // The diagnostic text belongs to the static_assert; the TensorAccesser
+    // constructor takes exactly (pointer, stride pointer).
+    static_assert(Rank > 0, "rank must higher than zero");
+    return TensorAccesser<T, Rank, PtrTraits, Tindex>(ptr_, stride_.data());
  }
+
+  TV_HOST_DEVICE_INLINE bool empty() const { return ptr_ == nullptr; }
+  TV_HOST_DEVICE_INLINE ptr_t data() { return ptr_; }
+  TV_HOST_DEVICE_INLINE const ptr_t data() const { return ptr_; }
+  TV_HOST_DEVICE_INLINE const tv_shape_t &shape() const { return shape_; }
+  TV_HOST_DEVICE_INLINE const tv_shape_t &stride() const { return stride_; }
+
+  TV_HOST_DEVICE_INLINE int dim(int idx) const { return shape_[idx]; }
+  TV_HOST_DEVICE_INLINE int ndim() const { return shape_.ndim(); }
  template <class... Inds>
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Inds... newShapes) const {
-    Shape shapes{int(newShapes)...};
-    for (size_t i = 0; i < shapes.ndim(); ++i) {
+  TV_HOST_DEVICE_INLINE
+      TensorView<T, Rank == -1 ? -1 : sizeof...(Inds), PtrTraits, Tindex>
+      view(Inds... newShapes) const {
+    ShapeBase<Rank == -1 ? TV_MAX_DIM : sizeof...(Inds), Tindex> shapes{
+        int(newShapes)...};
+    for (size_t i = 0; i < sizeof...(newShapes); ++i) {
      if (shapes[i] == -1) {
        shapes[i] = 1;
        shapes[i] = size() / shapes.size();
@@ -932,220 +1038,221 @@ template <typename T, int Rank = -1> struct TensorView {
      }
    }
    TV_ASSERT(shapes.size() == size());
-    return TensorView<T, Rank>(mPtr, shapes);
+    return TensorView < T, Rank == -1 ? -1 : sizeof...(Inds), PtrTraits,
+           Tindex > (ptr_, shapes);
  }
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Shape shapes) const {
+  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
+  view(Shape shapes) const {
    TV_ASSERT(shapes.size() == size());
-    return TensorView<T, Rank>(mPtr, shapes);
-  }
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze() const {
-    return TensorView<T, Rank>(mPtr, mShape.squeeze());
-  }
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze(int dim) const {
-    return TensorView<T, Rank>(mPtr, mShape.squeeze(dim));
-  }
-  TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
-
-  template <class... Slices>
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slice slice,
-                                                    Slices... slices) const {
-    return subview<float, Slice, Slices...>(slice, slices...);
-  }
-  template <class T2 = float, class... Slices>
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slices... slices) const {
-    Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
-    Shape new_shape{to_slice(slices)[0]...};
-    Shape start{to_slice(slices)[0]...};
-    TV_ASSERT(new_shape.ndim() <= mShape.ndim());
-    TV_ASSERT(new_shape.ndim() != 0);
-    size_t idxsize = new_shape.ndim();
-    for (size_t i = idxsize; i < mShape.ndim(); ++i) {
-      new_shape.push_back(0);
+    return TensorView<T, -1, PtrTraits, Tindex>(ptr_, shapes);
+  }
+  // View over the same data pointer using the shape returned by
+  // shape_.squeeze(); the result always has dynamic rank (-1).
+  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex> squeeze() const {
+    using dynamic_view_t = TensorView<T, -1, PtrTraits, Tindex>;
+    return dynamic_view_t(ptr_, shape_.squeeze());
+  }
+  // Returns a view over the same data pointer whose shape is
+  // shape_.squeeze<N>(dim). A static-rank view yields Rank - 1; a dynamic-rank
+  // (-1) view stays dynamic and uses TV_MAX_DIM as the shape capacity.
+  TV_HOST_DEVICE_INLINE
+  TensorView<T, Rank == -1 ? -1 : Rank - 1, PtrTraits, Tindex>
+  squeeze(int dim) const {
+    // shape_ has a type that depends on the class template parameters, so the
+    // `template` keyword is needed for `<` to open a template argument list;
+    // without it the expression is parsed as a chain of comparisons.
+    constexpr auto kSqueezedMaxDim = Rank == -1 ? TV_MAX_DIM : Rank - 1;
+    return TensorView<T, (Rank == -1 ? -1 : Rank - 1), PtrTraits, Tindex>(
+        ptr_, shape_.template squeeze<kSqueezedMaxDim>(dim));
+  }
+  TV_HOST_DEVICE_INLINE size_t size() const { return shape_.size(); }
+
+  template <class... Integers>
+  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
+  subview(int id, Integers... ints) {
+    tv_shape_t start = {id, ints...};
+    for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
      start.push_back(0);
    }
-#pragma unroll
-    for (size_t i = 0; i < sizeof...(Slices); ++i) {
-      if (slice_vec[i][1] != -1) {
-        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
-        TV_ASSERT(new_shape[i] >= 0);
-      } else {
-        new_shape[i] = 1; // reduce dim
-      }
-    }
-    auto offset = rowArrayIdx(mShape, start);
-#pragma unroll
-    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
-      new_shape[i] = mShape[i];
-      TV_ASSERT(new_shape[i] >= 0);
-    }
-    Shape reduced_shape;
-#pragma unroll
-    for (size_t i = 0; i < sizeof...(Slices); ++i) {
-      if (slice_vec[i][1] != -1) {
-        reduced_shape.push_back(new_shape[i]);
-      }
-    }
-#pragma unroll
-    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
-      reduced_shape.push_back(new_shape[i]);
-    }
-    return TensorView<T, Rank>(mPtr + offset, reduced_shape);
+    return TensorView<T, Rank, PtrTraits, Tindex>(
+        ptr_ + rowArrayIdx(shape_, start),
+        shape_.subshape(sizeof...(ints) + 1));
  }

  template <class... Integers>
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(int id, Integers... ints) {
-    Shape start = {id, ints...};
+  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
+  subview(int id, Integers... ints) const {
+    tv_shape_t start = {id, ints...};
    for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
      start.push_back(0);
    }
-    return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
-                               mShape.subshape(sizeof...(ints) + 1));
+    return TensorView<T, Rank, PtrTraits, Tindex>(
+        ptr_ + rowArrayIdx(shape_, start),
+        shape_.subshape(sizeof...(ints) + 1));
  }

-  TV_HOST_DEVICE_INLINE TensorView<T, Rank>
-  subview_ints(SimpleVector<int> ids) const {
+  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
+  subview(SimpleVector<int> ids) const {
    Shape start = ids;
    for (int i = ids.size(); i < ndim(); ++i) {
      start.push_back(0);
    }
-    return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
-                               mShape.subshape(ids.size()));
-  }
-
-  std::string print_vec(TensorView<T> tensor) const {
-    std::ostringstream ss;
-    ss << "[";
-    for (size_t i = 0; i < tensor.dim(0) - 1; ++i) {
-      ss << tensor(i) << ", ";
-    }
-    ss << tensor(tensor.dim(0) - 1) << "]";
-    return ss.str();
+    return TensorView<T, Rank, PtrTraits, Tindex>(
+        ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
  }
-
-  std::string repr() const {
-    std::ostringstream ss;
+  template <typename Os>
+  std::string repr(Os &ss, int limit = 1000, int limit_axis = 6) const {
    if (empty())
      return "";
-    if (mShape.ndim() == 0) {
-      ss << *mPtr;
-      // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
-      // detail::simpleTypeName<T>());
-      ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
+    if (shape_.ndim() == 0) {
+      ss << "Tensor[" << type_s<T> << "]" << std::endl;
+      ss << *ptr_;
      return ss.str();
    }
-    Shape counter = mShape;
-    auto tensor_flat = this->view(-1);
+    bool enable_limit = size() > limit;

-    for (int i = 0; i < counter.ndim() - 1; ++i) {
-      counter[i] = 0;
-      // ss << "[";
+    SimpleVector<int64_t, TV_MAX_DIM> prev(ndim(), -1);
+    SimpleVector<int64_t, TV_MAX_DIM> nd_index(ndim());
+    SimpleVector<int64_t, TV_MAX_DIM> _shape;
+    for (auto s : shape()) {
+      _shape.push_back(s);
    }
-    for (size_t i = 0; i < this->size() / this->dim(this->ndim() - 1); ++i) {
-      for (int i = 0; i < counter.ndim() - 1; ++i) {
-        if (counter[i] == 0) {
-          ss << "[";
+    ss << "Tensor[" << type_s<T> << "]: shape=" << shape()
+       << ", stride=" << stride() << std::endl;
+    auto ndimValue = ndim();
+    for (int64_t i = 0; i < size(); ++i) {
+      rowArrayIdxInv(i, nd_index.data(), _shape.data(), ndimValue);
+      bool newline = false;
+      int end_count = 0;
+      for (int j = 0; j < ndimValue; ++j) {
+        if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
+            prev[j] != -1) {
+          ss << "]";
+          ++end_count;
+          newline = true;
        }
      }
-      std::cout << "counter.ndim() " << counter.ndim() << std::endl;
-      auto counter_ = counter.subshape(0, counter.ndim() - 1);
-      std::cout << counter.subshape(0, counter.ndim() - 1) << std::endl;
-      ss << print_vec(this->subview_ints(counter_)) << "\n";
-      std::cout << "after counter.ndim() " << counter.ndim() << std::endl;
-      for (int i = 0; i < counter.ndim() - 1; ++i) {
-        if (counter[i] == this->dim(i) - 1) {
-          ss << "]";
+      if (prev[0] == -1) {
+        end_count = ndimValue;
+      }
+      if (newline) {
+        ss << "\n";
+      }
+      int starts_count = 0;
+      for (int j = 0; j < ndimValue; ++j) {
+        if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
+          ++starts_count;
        }
      }
-    }
-    // ss << "]";
-    // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
-    // detail::simpleTypeName<T>());
-    ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
-    return ss.str();
-  }
-
-protected:
-  // TODO: make this function public.
-  // currently this function is called unexpectedly when using subview({0, 0}).
-  TV_HOST_DEVICE_INLINE TensorView<T, Rank>
-  _subview(SimpleVector<Slice> slice_vec) {
-    Shape new_shape;
-    for (int i = 0; i < slice_vec.size(); ++i) {
-      new_shape.push_back(slice_vec[i][0]);
-    }
-    Shape start = new_shape;
-    TV_ASSERT(new_shape.ndim() <= mShape.ndim());
-    TV_ASSERT(new_shape.ndim() != 0);
-    size_t idxsize = new_shape.ndim();
-    for (size_t i = idxsize; i < mShape.ndim(); ++i) {
-      new_shape.push_back(0);
-      start.push_back(0);
-    }
-    for (size_t i = 0; i < slice_vec.size(); ++i) {
-      if (slice_vec[i][1] != -1) {
-        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
-        TV_ASSERT(new_shape[i] >= 0);
+      if (starts_count > 0) {
+        for (int j = 0; j < ndimValue - end_count; ++j) {
+          ss << " ";
+        }
+        for (int j = 0; j < starts_count; ++j) {
+          ss << "[";
+        }
+      }
+      if (std::is_same<T, uint8_t>::value ||
+          std::is_same<T, const uint8_t>::value) {
+        ss << unsigned((*this)[i]);
      } else {
-        new_shape[i] = 1; // reduce dim
+        ss << (*this)[i];
      }
-    }
-    auto offset = rowArrayIdx(mShape, start);
-    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
-      new_shape[i] = mShape[i];
-      TV_ASSERT(new_shape[i] >= 0);
-    }
-    Shape reduced_shape;
-    for (size_t i = 0; i < slice_vec.size(); ++i) {
-      if (slice_vec[i][1] != -1) {
-        reduced_shape.push_back(new_shape[i]);
+      if (nd_index[ndimValue - 1] != _shape[ndimValue - 1] - 1) {
+        ss << ",";
+      }
+      for (int j = 0; j < ndimValue; ++j) {
+        prev[j] = nd_index[j];
      }
    }
-    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
-      reduced_shape.push_back(new_shape[i]);
+    for (int j = 0; j < ndimValue; ++j) {
+      ss << "]";
    }
-    return TensorView<T, Rank>(mPtr + offset, reduced_shape);
+    return ss.str();
+  }
+  // Convenience overload: renders the tensor through a local
+  // std::ostringstream using the default limits of repr(Os &, int, int).
+  std::string repr() const {
+    std::ostringstream buffer;
+    return repr(buffer);
  }
+
+protected:
  template <typename T1> TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
    return Slice{int(s), -1, -1};
  }

  TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }

-  T *mPtr = nullptr;
-  Shape mShape;
+  ptr_t ptr_ = nullptr;
+  tv_shape_t shape_;
+  tv_shape_t stride_;
 };

-template <typename Os, typename T, int Rank>
-Os &operator<<(Os &os, const TensorView<T, Rank> &dt) {
+// Wraps a mutable std::vector as a TensorView built from arr.data() with a
+// single extent, arr.size(). The view refers to the vector's storage.
+template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
+  return TensorView<T>(arr.data(), {arr.size()});
+}
+
+// Wraps a mutable std::vector as a TensorView built from arr.data() with the
+// given shape. TV_ASSERT_INVALID_ARG checks that shape.prod() equals
+// arr.size() before the view is created.
+template <typename T>
+TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
+  TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
+  return TensorView<T>(arr.data(), shape);
+}
+
+// Read-only variant: wraps a const std::vector as a TensorView<const T> built
+// from arr.data() with a single extent, arr.size().
+template <typename T> TensorView<const T> vector2tv(const std::vector<T> &arr) {
+  return TensorView<const T>(arr.data(), {arr.size()});
+}
+
+// Stream insertion for TensorView: writes the string returned by dt.repr()
+// into `os` and returns `os`.
+template <typename Os, typename T, int Rank, template <class> class PtrTraits,
+          typename Tindex>
+Os &operator<<(Os &os, const TensorView<T, Rank, PtrTraits, Tindex> &dt) {
  os << dt.repr();
  return os;
 }

-template <typename Os, typename T, int Rank>
-Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
+// Stream insertion overload for views over const elements: writes the string
+// returned by dt.repr() into `os` and returns `os`.
+template <typename Os, typename T, int Rank, template <class> class PtrTraits,
+          typename Tindex>
+Os &operator<<(Os &os, const TensorView<const T, Rank, PtrTraits, Tindex> &dt) {
  os << dt.repr();
  return os;
 }

 namespace detail {
-template <typename T> constexpr const char *printfTypeFormat(T val = T());
-template <> constexpr const char *printfTypeFormat(float val) { return "%.2f"; }
-template <> constexpr const char *printfTypeFormat(double val) {
-  return "%.2f";
-}
-template <> constexpr const char *printfTypeFormat(int val) { return "%d"; }
-template <> constexpr const char *printfTypeFormat(unsigned val) {
-  return "%u";
-}
-template <> constexpr const char *printfTypeFormat(long val) { return "%ld"; }
-template <> constexpr const char *printfTypeFormat(unsigned long val) {
-  return "%lu";
-}
-}; // namespace detail
+// Maps an element type to the printf conversion string used to print one
+// value of that type. The primary template is declared but not defined, so
+// requesting the format of a type without a specialization below does not
+// compile.
+template <typename T> struct TypePrintfFormat;
+// Floating-point values are printed with two decimals.
+template <> struct TypePrintfFormat<float> {
+  static constexpr const char *value = "%.2f";
+};
+template <> struct TypePrintfFormat<double> {
+  static constexpr const char *value = "%.2f";
+};
+template <> struct TypePrintfFormat<int8_t> {
+  static constexpr const char *value = "%d";
+};
+template <> struct TypePrintfFormat<int16_t> {
+  static constexpr const char *value = "%d";
+};
+template <> struct TypePrintfFormat<int32_t> {
+  static constexpr const char *value = "%d";
+};
+template <> struct TypePrintfFormat<uint8_t> {
+  static constexpr const char *value = "%u";
+};
+template <> struct TypePrintfFormat<uint16_t> {
+  static constexpr const char *value = "%u";
+};
+template <> struct TypePrintfFormat<uint32_t> {
+  static constexpr const char *value = "%u";
+};
+// NOTE(review): "%ld" / "%lu" match int64_t / uint64_t only where those are
+// typedefs of long / unsigned long -- confirm for every supported platform.
+template <> struct TypePrintfFormat<int64_t> {
+  static constexpr const char *value = "%ld";
+};
+template <> struct TypePrintfFormat<uint64_t> {
+  static constexpr const char *value = "%lu";
+};
+template <> struct TypePrintfFormat<bool> {
+  static constexpr const char *value = "%d";
+};

 template <typename T>
-TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
-                                    const char *format) {
+constexpr const char *type_printf_format_v = TypePrintfFormat<T>::value;
+
+}; // namespace detail
+
+// Prints `tensor` with printf as nested bracketed rows, applying `format` to
+// every element. Returns without printing when the tensor is empty. For each
+// flat index the N-d index is recovered with rowArrayIdxInv and compared with
+// the previous one: an axis wrapping back to 0 closes a bracket (and starts a
+// new line), and every axis that is at 0 for the first time or after a wrap
+// opens a bracket.
+template <typename T, int Rank, template <class> class PtrTraits,
+          typename Tindex>
+TV_HOST_DEVICE void
+printTensorView(const TensorView<T, Rank, PtrTraits, Tindex> &tensor,
+                const char *format) {
+  // used to print tensor in cuda kernel.
  if (tensor.empty())
    return;
  if (tensor.ndim() == 0) {
@@ -1153,51 +1260,69 @@ TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
    printf("\n");
    return;
  }
-  Shape counter = tensor.shape();
-  auto tensor_flat = tensor.view(-1);
-  for (int i = 0; i < counter.ndim(); ++i) {
-    counter[i] = 0;
-    printf("[");
-  }
-  for (size_t i = 0; i < tensor.size(); ++i) {
-    printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
-    counter[counter.ndim() - 1] += 1;
-    int inc_count = 0;
-    bool print_comma = true;
-    for (int c = counter.ndim() - 1; c >= 0; --c) {
-      if (counter[c] == tensor.dim(c) && c > 0) {
-        ++inc_count;
-        counter[c - 1] += 1;
-        counter[c] = 0;
-        print_comma = false;
+  SimpleVector<int64_t, TV_MAX_DIM> prev(tensor.ndim(), -1);
+  SimpleVector<int64_t, TV_MAX_DIM> nd_index(tensor.ndim());
+  SimpleVector<int64_t, TV_MAX_DIM> shape(tensor.shape());
+
+  auto ndim = tensor.ndim();
+  for (int64_t i = 0; i < tensor.size(); ++i) {
+    rowArrayIdxInv(i, nd_index.data(), shape.data(), ndim);
+    bool newline = false;
+    int end_count = 0;
+    for (int j = 0; j < ndim; ++j) {
+      if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
+          prev[j] != -1) {
+        printf("]");
+        ++end_count;
+        newline = true;
      }
    }
-    if (print_comma && i != tensor.size() - 1)
-      printf(", ");
-    for (int j = 0; j < inc_count; ++j) {
-      printf("]");
+    if (prev[0] == -1) {
+      end_count = ndim;
+    }
+    if (newline) {
+      printf("\n");
+    }
+    int starts_count = 0;
+    for (int j = 0; j < ndim; ++j) {
+      if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
+        ++starts_count;
+      }
    }
-    if (i != tensor.size() - 1) {
-      if (inc_count != 0)
-        printf("\n");
-      for (int j = 0; j < inc_count; ++j) {
-        printf("[");
+    if (starts_count > 0) {
+      for (int j = 0; j < ndim - end_count; ++j) {
+        printf(" ");
+      }
+      // Opening brackets for the axes that start here; these must be "[" so
+      // that they pair with the "]" emitted when an axis wraps and at the end.
+      for (int j = 0; j < starts_count; ++j) {
+        printf("[");
      }
    }
+    printf(format, tensor[i]);
+    if (nd_index[ndim - 1] != shape[ndim - 1] - 1) {
+      printf(",");
+    }
+    for (int j = 0; j < ndim; ++j) {
+      prev[j] = nd_index[j];
+    }
+  }
+  for (int j = 0; j < ndim; ++j) {
+    printf("]");
  }
-  printf("]\n");
+  printf("\n");
 }

-template <typename T>
-TV_HOST_DEVICE void printTensorView(TensorView<T> tensor) {
+template <typename T, int Rank, template <class> class PtrTraits,
+          typename Tindex>
+TV_HOST_DEVICE void
+printTensorView(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
  using Traw = typename std::remove_const<T>::type;
-  return printTensorView(tensor, detail::printfTypeFormat<Traw>());
+  return printTensorView(tensor, detail::type_printf_format_v<Traw>);
 }
 template <typename T>
 TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
  using Traw = typename std::remove_const<T>::type;
  return printTensorView(TensorView<const T>(ptr, shape),
-                         detail::printfTypeFormat<Traw>());
+                         detail::type_printf_format_v<Traw>);
 }
 template <typename T>
 TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
@@ -1205,7 +1330,7 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
  return printTensorView(TensorView<const T>(ptr, shape), format);
 }

-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA

 #ifdef __DRIVER_TYPES_H__
 #ifndef DEVICE_RESET
@@ -1229,20 +1354,25 @@ void check(T result, char const *const func, const char *const file,
  }
 }

-#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+#define checkCudaErrors(val) tv::check((val), #val, __FILE__, __LINE__)

 template <typename T>
 void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
  checkCudaErrors(
      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToDevice, s));
 }
-template <typename T>
-void host2dev(TensorView<T> dst, const TensorView<const T> src,
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+              const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
 }
-template <typename T>
-void host2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+              const TensorView<T, Rank, PtrTraits2, Tindex2> src,
+              cudaStream_t s = 0) {
  host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
 }

@@ -1250,12 +1380,16 @@ template <typename T> void host2dev_sync(T *dst, const T *src, size_t size) {
  checkCudaErrors(
      cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyHostToDevice));
 }
-template <typename T>
-void host2dev_sync(TensorView<T> dst, const TensorView<const T> src) {
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+                   const TensorView<const T, Rank, PtrTraits2, Tindex2> src) {
  host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
 }
-template <typename T>
-void host2dev_sync(TensorView<T> dst, const TensorView<T> src) {
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+                   const TensorView<T, Rank, PtrTraits2, Tindex2> src) {
  host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
 }

@@ -1265,14 +1399,18 @@ void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToHost, s));
 }

-template <typename T>
-void dev2host(TensorView<T> dst, const TensorView<const T> src,
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+              const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
              cudaStream_t s = 0) {
  dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
 }
-
-template <typename T>
-void dev2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+              const TensorView<T, Rank, PtrTraits2, Tindex2> src,
+              cudaStream_t s = 0) {
  dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
 }

@@ -1282,13 +1420,18 @@ void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice, s));
 }

-template <typename T>
-void dev2dev(TensorView<T> dst, const TensorView<const T> src,
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+             const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
             cudaStream_t s = 0) {
  dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
 }
-template <typename T>
-void dev2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+             const TensorView<T, Rank, PtrTraits2, Tindex2> src,
+             cudaStream_t s = 0) {
  dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
 }

@@ -1298,67 +1441,39 @@ void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
      cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToHost, s));
 }

-template <typename T>
-void host2host(TensorView<T> dst, const TensorView<const T> src,
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+               const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
               cudaStream_t s = 0) {
  host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
 }
-template <typename T>
-void host2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
+template <typename T, int Rank, template <class> class PtrTraits1,
+          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
+void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
+               const TensorView<T, Rank, PtrTraits2, Tindex2> src,
+               cudaStream_t s = 0) {
  host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
 }

-template <typename T> void zero_dev(TensorView<T> tensor) {
+template <typename T, int Rank, template <class> class PtrTraits,
+          typename Tindex>
+void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
  checkCudaErrors(cudaMemset(tensor.data(), 0, tensor.size() * sizeof(T)));
 }

-template <typename T> void zero_dev(TensorView<T> tensor, cudaStream_t s) {
+template <typename T, int Rank, template <class> class PtrTraits,
+          typename Tindex>
+void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor, cudaStream_t s) {
  checkCudaErrors(
      cudaMemsetAsync(tensor.data(), 0, tensor.size() * sizeof(T), s));
 }
-template <typename T> void zero_host(TensorView<T> tensor) {
+template <typename T, int Rank, template <class> class PtrTraits,
+          typename Tindex>
+void zero_host(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
  std::fill(tensor.data(), tensor.data() + tensor.size(), 0);
 }

 #endif

-namespace detail {
-template <typename T> struct TypeToString;
-
-template <> struct TypeToString<int32_t> {
-  static constexpr const char *value = "int32";
-};
-template <> struct TypeToString<bool> {
-  static constexpr const char *value = "bool";
-};
-template <> struct TypeToString<float> {
-  static constexpr const char *value = "float";
-};
-template <> struct TypeToString<double> {
-  static constexpr const char *value = "double";
-};
-template <> struct TypeToString<int16_t> {
-  static constexpr const char *value = "int16";
-};
-template <> struct TypeToString<int8_t> {
-  static constexpr const char *value = "int8";
-};
-template <> struct TypeToString<int64_t> {
-  static constexpr const char *value = "int64";
-};
-template <> struct TypeToString<uint8_t> {
-  static constexpr const char *value = "uint8";
-};
-template <> struct TypeToString<uint16_t> {
-  static constexpr const char *value = "uint16";
-};
-template <> struct TypeToString<uint32_t> {
-  static constexpr const char *value = "uint32";
-};
-template <> struct TypeToString<uint64_t> {
-  static constexpr const char *value = "uint64";
-};
-
-} // namespace detail
-
 } // namespace tv
\ No newline at end of file
--- a/include/tensorview/tools.h
+++ b/include/tensorview/tools.h
+// Copyright 2019-2020 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <chrono>
+#ifdef TV_CUDA
+#include <cuda_runtime_api.h>
+#endif
+#include <iostream>
+
+namespace tv {
+
+#ifdef TV_CUDA
+// Stopwatch for CUDA work based on std::chrono::steady_clock. The constructor
+// and report() call cudaDeviceSynchronize() before reading the clock.
+// report() returns the time elapsed since construction or since the previous
+// report(), as a count of TimeT ticks, and restarts the measurement.
+template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
+  CudaContextTimer() {
+    cudaDeviceSynchronize();
+    mCurTime = std::chrono::steady_clock::now();
+  }
+  typename TimeT::rep report() {
+    cudaDeviceSynchronize();
+    // Read the clock once: the instant that ends this interval is also the
+    // start of the next one, so no time is dropped between two clock reads.
+    const auto now = std::chrono::steady_clock::now();
+    const auto elapsed = std::chrono::duration_cast<TimeT>(now - mCurTime);
+    mCurTime = now;
+    return elapsed.count();
+  }
+
+private:
+  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
+};
+#endif
+
+// Host-side stopwatch based on std::chrono::steady_clock.
+// report() returns the time elapsed since construction or since the previous
+// report(), as a count of TimeT ticks, and restarts the measurement.
+template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
+  CPUTimer() : mCurTime(std::chrono::steady_clock::now()) {}
+  typename TimeT::rep report() {
+    // Read the clock once: the instant that ends this interval is also the
+    // start of the next one, so no time is dropped between two clock reads.
+    const auto now = std::chrono::steady_clock::now();
+    const auto elapsed = std::chrono::duration_cast<TimeT>(now - mCurTime);
+    mCurTime = now;
+    return elapsed.count();
+  }
+
+private:
+  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
+};
+
+} // namespace tv
--- a/include/tensorview/torch_utils.h
+++ b/include/tensorview/torch_utils.h
+// Copyright 2019-2020 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "mp_helper.h"
+#include <tensorview/tensorview.h>
+
+#include <ATen/ATen.h>
+#include <torch/script.h>
+#ifdef TV_CUDA
+#include <ATen/cuda/CUDAContext.h>
+#endif
+
+namespace tv {
+
+#ifdef TV_CUDA
+// GPU device object whose getStream() returns the stream reported by
+// at::cuda::getCurrentCUDAStream().
+struct TorchGPU : tv::GPU {
+  cudaStream_t getStream() const override {
+    return at::cuda::getCurrentCUDAStream();
+  }
+};
+#endif
+namespace detail {
+// Compile-time map from a C++ element type to the matching torch dtype
+// constant, exposed as `value`. The primary template is declared only, so an
+// unmapped type fails to compile when its `value` is requested.
+template <typename T> struct TypeToTorchDtypeTraits;
+
+template <> struct TypeToTorchDtypeTraits<int32_t> {
+  static constexpr decltype(torch::kInt32) value = torch::kInt32;
+};
+template <> struct TypeToTorchDtypeTraits<int16_t> {
+  static constexpr decltype(torch::kInt32) value = torch::kInt16;
+};
+template <> struct TypeToTorchDtypeTraits<int8_t> {
+  static constexpr decltype(torch::kInt8) value = torch::kInt8;
+};
+template <> struct TypeToTorchDtypeTraits<int64_t> {
+  static constexpr decltype(torch::kInt32) value = torch::kInt64;
+};
+template <> struct TypeToTorchDtypeTraits<uint8_t> {
+  static constexpr decltype(torch::kInt32) value = torch::kUInt8;
+};
+template <> struct TypeToTorchDtypeTraits<bool> {
+  static constexpr decltype(torch::kInt32) value = torch::kBool;
+};
+template <> struct TypeToTorchDtypeTraits<float> {
+  static constexpr decltype(torch::kInt32) value = torch::kFloat32;
+};
+template <> struct TypeToTorchDtypeTraits<double> {
+  static constexpr decltype(torch::kInt32) value = torch::kFloat64;
+};
+template <> struct TypeToTorchDtypeTraits<at::Half> {
+  static constexpr decltype(torch::kInt32) value = torch::kHalf;
+};
+
+// Type list holding every element type that has a mapping above; used as the
+// candidate list when checking a tensor's dtype.
+using all_torch_types_t = std::tuple<float, double, int8_t, int16_t, int32_t,
+                                     int64_t, uint8_t, bool, at::Half>;
+
+} // namespace detail
+
+// Variable-template shorthand for detail::TypeToTorchDtypeTraits<T>::value.
+template <typename T>
+constexpr decltype(torch::kInt32) torch_type_v =
+    detail::TypeToTorchDtypeTraits<T>::value;
+
+// Runtime-to-compile-time dtype dispatch. Walks the candidate types Ts... and,
+// for each one whose torch dtype equals `t`, calls `f` with a
+// value-initialized object of that type. If no candidate matches, raises
+// through TV_THROW_RT_ERR, listing the names of the available candidates.
+template <class... Ts, typename F>
+void dispatch_torch(at::ScalarType t, F &&f) {
+  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
+  using candidates_t = mp_list<Ts...>;
+  bool matched = false;
+  tv::mp_for_each<candidates_t>([&matched, &f, t](auto tag) {
+    using candidate_t = decltype(tag);
+    if (t != detail::TypeToTorchDtypeTraits<candidate_t>::value) {
+      return;
+    }
+    std::forward<F>(f)(candidate_t());
+    matched = true;
+  });
+  if (matched) {
+    return;
+  }
+  // Nothing matched: collect the candidate names for the error message.
+  std::stringstream names;
+  tv::mp_for_each<candidates_t>([&names](auto tag) {
+    names << tv::detail::TypeToString<decltype(tag)>::value << " ";
+  });
+  TV_THROW_RT_ERR("unknown type", t, ", available:", names.str());
+}
+
+// Functor form of dispatch_torch that takes its candidate types from a type
+// list: DispatchTorch<List<A, B, ...>>()(t, f) forwards to
+// dispatch_torch<A, B, ...>(t, f).
+template <class T> struct DispatchTorch;
+
+template <template <class...> class List, class... Candidates>
+struct DispatchTorch<List<Candidates...>> {
+  template <typename F> void operator()(at::ScalarType t, F &&f) {
+    dispatch_torch<Candidates...>(t, std::forward<F>(f));
+  }
+};
+
+// Checks with TV_ASSERT_RT_ERR that the tensor's scalar type maps to T
+// (cv-qualifiers on T are ignored). The tensor's dtype is dispatched over
+// detail::all_torch_types_t and the matched type is compared with T.
+template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
+  using expected_t = std::remove_cv_t<T>;
+  const auto dtype = tensor.scalar_type();
+  DispatchTorch<detail::all_torch_types_t> dispatcher;
+  dispatcher(dtype, [&](auto tag) {
+    constexpr bool val = std::is_same<expected_t, decltype(tag)>::value;
+    TV_ASSERT_RT_ERR(val, "error");
+  });
+}
+
+// Creates a TensorView over the storage of a torch::Tensor. The dtype is
+// checked against T first; for a static Rank (> 0) the tensor's dim() must
+// equal Rank. The view's shape is copied from tensor.sizes(); the tensor's
+// strides are not consulted (see the TODO below).
+template <typename T, int Rank = -1,
+          template <class> class PtrTraits = DefaultPtrTraits,
+          typename Tindex = int>
+TensorView<T, Rank, PtrTraits, Tindex> torch2tv(const torch::Tensor &tensor) {
+  using view_t = tv::TensorView<T, Rank, PtrTraits, Tindex>;
+  using tv_shape_t = typename view_t::tv_shape_t;
+  check_torch_dtype<T>(tensor);
+  // TODO stride
+  if (Rank > 0) {
+    TV_ASSERT_INVALID_ARG(tensor.dim() == Rank, "error");
+  }
+  tv_shape_t shape;
+  for (auto extent : tensor.sizes()) {
+    shape.push_back(extent);
+  }
+  auto *raw = tensor.data_ptr<std::remove_const_t<T>>();
+  return view_t(raw, shape);
+}
+namespace detail {
+// Printable name for at::Half, used where TypeToString<T>::value is streamed
+// (e.g. the list of available types in dispatch_torch's error message).
+template <> struct TypeToString<at::Half> {
+  static constexpr const char *value = "half";
+};
+} // namespace detail
+} // namespace tv
\ No newline at end of file
--- a/include/torch_utils.h
+++ b/include/torch_utils.h
@@ -13,18 +13,18 @@
 // limitations under the License.

 #pragma once
-#include <spconv/mp_helper.h>
+#include <tensorview/mp_helper.h>
 #include <tensorview/tensorview.h>

 #include <ATen/ATen.h>
 #include <torch/script.h>
-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
 #include <ATen/cuda/CUDAContext.h>
 #endif

 namespace tv {

-#ifdef SPCONV_CUDA
+#ifdef TV_CUDA
 struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
@@ -103,10 +103,10 @@ template <> struct TypeToString<at::Half> {
 };
 } // namespace detail
 template <class... Ts, typename F>
-void torch_dispatch(at::ScalarType t, F &&f) {
+void dispatch_torch(at::ScalarType t, F &&f) {
  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
  bool notFound = true;
-  spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
+  spconv::tv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
    if (torch_type_v<decltype(I)> == t) {
      std::forward<F>(f)(decltype(I)());
      notFound = false;
@@ -114,7 +114,7 @@ void torch_dispatch(at::ScalarType t, F &&f) {
  });
  if (notFound) {
    std::stringstream ss;
-    spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
+    spconv::tv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
      ss << tv::detail::TypeToString<decltype(I)>::value << " ";
    });
    TV_THROW_RT_ERR("unknown type", t, ", available: ", ss.str());