working on tensor core test

01ed382c · yan.yan · 3517290c · 3517290c · 3517290c · 3517290c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
20 changed files
--- a/include/spconv/point2voxel_ops.h
+++ b/include/spconv/point2voxel_ops.h
-// Copyright 2020 xmyqsh
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <spconv/points2voxels.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-namespace spconv {
-int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
-                      torch::Tensor pointIndex, torch::Tensor grids,
-                      torch::Tensor numPointsPerGrid, torch::Tensor voxels,
-                      torch::Tensor coors, std::vector<int64_t> gridShape,
-                      const int64_t ndim);
-} // namespace spconv
--- a/include/spconv/points2voxels.h
+++ b/include/spconv/points2voxels.h
-#pragma once
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-namespace spconv {
-void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
-                                torch::Tensor grids,
-                                torch::Tensor numPointsPerGrid,
-                                torch::Tensor pointIndex,
-                                std::vector<int64_t> gridShape, const int ndim);
-void gather_point_from_grid_cuda(torch::Tensor grids,
-                                 torch::Tensor numPointsPerGrid,
-                                 torch::Tensor pointIndex,
-                                 torch::Tensor pointIndexUnique,
-                                 torch::Tensor voxels, torch::Tensor coors,
-                                 std::vector<int64_t> gridShape,
-                                 const int ndim);
-} // namespace spconv
--- a/include/spconv/pool_ops.h
+++ b/include/spconv/pool_ops.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef SPARSE_POOL_OP_H_
-#define SPARSE_POOL_OP_H_
-#include <spconv/maxpool.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-namespace spconv {
-torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
-                            torch::Tensor indiceNum, int64_t numAct);
-torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
-                                    torch::Tensor outFeatures,
-                                    torch::Tensor outGrad,
-                                    torch::Tensor indicePairs,
-                                    torch::Tensor indiceNum);
-} // namespace spconv
-#endif
\ No newline at end of file
--- a/include/spconv/reordering.cu.h
+++ b/include/spconv/reordering.cu.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef REORDERING_CU_H_
-#define REORDERING_CU_H_
-#include <THC/THCAtomics.cuh>
-#include <THC/THCNumerics.cuh>
-#include <cuda_fp16.h>
-#include <tensorview/kernel_utils.h>
-#if PYTORCH_VERSION < 10500
-#define TH_ATOMIC_ADD atomicAdd
-#else
-#define TH_ATOMIC_ADD gpuAtomicAdd
-#endif
-// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-namespace spconv {
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void gatherGenericKernel(T *buffer, const T *features,
-                                    const Index *indices, int size,
-                                    int numPlanes) {
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size)
-        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size)
-          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-              features[inds[ilp] + iy];
-      }
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void gatherVecKernel(T *buffer, const T *features,
-                                const Index *indices, int size, int numPlanes) {
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size)
-        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size)
-          reinterpret_cast<VecType *>(
-              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
-      }
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP,
-          typename VecType = int4>
-__global__ void gatherVecBlockKernel(T *buffer, const T *features,
-                                     const Index *indices, int size,
-                                     int numPlanes) {
-  int ILPStrideX[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  features += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      reinterpret_cast<VecType *>(
-          buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
-          reinterpret_cast<const VecType *>(
-              features)[indices[ix + ILPStrideX[ilp]] * numPlanes +
-                        threadIdx.y];
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void batchGatherGenericKernel(T *buffer, const T *features,
-                                         const Index *indices, int size,
-                                         int numPlanes, int indice_batch_stride,
-                                         int feature_batch_stride) {
-  // size: max indice num * kernel volume
-  // inds: [volume, num_elems]
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-  Index inds_elem;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size) {
-        inds_elem = ix + ILPStrideX[ilp];
-        inds[ilp] =
-            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                    inds_elem % feature_batch_stride];
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size) {
-          if (inds[ilp] != -1) {
-            buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-                features[inds[ilp] * numPlanes + iy];
-          } else {
-            buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = T(0);
-          }
-        }
-      }
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void
-batchGatherVecKernel(T *buffer, const T *features, const Index *indices,
-                     int size, int feature_offset, int numPlanes,
-                     int indice_batch_stride, int feature_batch_stride) {
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-  Index zero[sizeof(VecType) / sizeof(T)];
-#pragma unroll
-  for (int i = 0; i < sizeof(VecType) / sizeof(T); ++i) {
-    zero[i] = T(0);
-  }
-  Index inds_elem;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size) {
-        inds_elem = ix + ILPStrideX[ilp] + feature_offset;
-        inds[ilp] =
-            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                    inds_elem % feature_batch_stride];
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size) {
-          if (inds[ilp] != -1) {
-            reinterpret_cast<VecType *>(
-                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-                reinterpret_cast<const VecType *>(
-                    features)[inds[ilp] * numPlanes + iy];
-          } else {
-            reinterpret_cast<VecType *>(
-                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-                reinterpret_cast<const VecType *>(&zero)[0];
-          }
-        }
-      }
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP,
-          typename VecType = int4>
-__global__ void
-batchGatherVecBlockKernel(T *buffer, const T *features, const Index *indices,
-                          int size, int numPlanes, int indice_batch_stride,
-                          int feature_batch_stride) {
-  int ILPStrideX[NumILP];
-  Index inds;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  features += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  Index inds_elem;
-  Index zero[sizeof(VecType) / sizeof(T)];
-#pragma unroll
-  for (int i = 0; i < sizeof(VecType) / sizeof(T); ++i) {
-    zero[i] = T(0);
-  }
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      inds_elem = ix + ILPStrideX[ilp];
-      inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                     inds_elem % feature_batch_stride];
-      if (inds != -1) {
-        reinterpret_cast<VecType *>(
-            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
-            reinterpret_cast<const VecType *>(
-                features)[inds * numPlanes + threadIdx.y];
-      } else {
-        reinterpret_cast<VecType *>(
-            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
-            reinterpret_cast<const VecType *>(&zero)[0];
-      }
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
-                                        const Index *indices, int size,
-                                        int numPlanes) {
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size)
-        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size) {
-          outFeatures[inds[ilp] + iy] +=
-              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
-        }
-      }
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP,
-          typename VecType = int4>
-__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
-                                         const Index *indices, int size,
-                                         int numPlanes) {
-  int ILPStrideX[NumILP];
-  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
-  constexpr int vecloadHalf2Factor = sizeof(VecType) / sizeof(__half2);
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  outFeatures += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  T buf[vecloadFactor];
-  T buf2[vecloadFactor];
-  Index idx;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
-      reinterpret_cast<VecType *>(buf)[0] =
-          reinterpret_cast<VecType *>(outFeatures)[idx];
-      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
-          buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
-      if (std::is_same<T, at::Half>::value) {
-#if __CUDA_ARCH__ >= 530
-#pragma unroll
-        for (int i = 0; i < vecloadHalf2Factor; i++) {
-          reinterpret_cast<__half2 *>(buf)[i] =
-              __hadd2(reinterpret_cast<__half2 *>(buf)[i],
-                      reinterpret_cast<__half2 *>(buf2)[i]);
-        }
-#else
-#pragma unroll
-        for (int i = 0; i < vecloadFactor; i++) {
-          buf[i] += buf2[i];
-        }
-#endif
-      } else {
-#pragma unroll
-        for (int i = 0; i < vecloadFactor; i++) {
-          buf[i] += buf2[i];
-        }
-      }
-      reinterpret_cast<VecType *>(outFeatures)[idx] =
-          reinterpret_cast<VecType *>(buf)[0];
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
-                                      const Index *indices, int size,
-                                      int numPlanes) {
-  int ILPStrideX[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  outFeatures += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] +=
-          buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
-    }
-  }
-}
-#if __CUDA_ARCH__ >= 530
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
-                                          const Index *indices, int size,
-                                          int numPlanes) {
-  int ILPStrideX[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  outFeatures += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  Index idx;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
-      reinterpret_cast<__half2 *>(outFeatures)[idx] = __hadd2(
-          reinterpret_cast<__half2 *>(outFeatures)[idx],
-          reinterpret_cast<__half2 *>(
-              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
-    }
-  }
-}
-#endif
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
-                                             const Index *indices, int size,
-                                             int feature_offset, int numPlanes,
-                                             int indice_batch_stride,
-                                             int feature_batch_stride) {
-  // batch scatter add is greatly slower than native scatter when the number of
-  // points is large. this may due to atomicAdd?
-  // batch scatter add is greatly faster than native when the number of points
-  // is small.
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-  Index inds_elem;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size) {
-        inds_elem = ix + ILPStrideX[ilp] + feature_offset;
-        inds[ilp] =
-            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                    inds_elem % feature_batch_stride];
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1) {
-          TH_ATOMIC_ADD(outFeatures + inds[ilp] * numPlanes + iy,
-                        buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]);
-        }
-      }
-    }
-  }
-}
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
-                           const Index *indices, int size, int numPlanes,
-                           int indice_batch_stride, int feature_batch_stride) {
-  int ILPStrideX[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  outFeatures += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  Index inds, inds_elem;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      inds_elem = ix + ILPStrideX[ilp];
-      inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                     inds_elem % feature_batch_stride];
-      if (inds != -1) {
-        TH_ATOMIC_ADD(outFeatures + inds * numPlanes + threadIdx.y,
-                      buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
-      }
-    }
-  }
-}
-} // namespace spconv
-#undef TH_ATOMIC_ADD
-#endif
\ No newline at end of file
--- a/include/spconv/reordering.h
+++ b/include/spconv/reordering.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef SPARSE_REORDERING_FUNCTOR_H_
-#define SPARSE_REORDERING_FUNCTOR_H_
-#include <cuda_runtime_api.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-namespace spconv {
-void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
-                              torch::Tensor indices, int size);
-void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
-                                   torch::Tensor outFeatures,
-                                   torch::Tensor indices, int size);
-void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
-                        torch::Tensor indices, int size);
-void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
-                             torch::Tensor indices, int size);
-void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
-                       torch::Tensor indices, int size);
-void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
-                            torch::Tensor indices, int size);
-void sparse_gather_cuda(cudaStream_t s, torch::Tensor buffer,
-                        torch::Tensor features, torch::Tensor indices,
-                        int size);
-void sparse_scatter_add_cuda(cudaStream_t s, torch::Tensor buffer,
-                             torch::Tensor outFeatures, torch::Tensor indices,
-                             int size);
-} // namespace spconv
-#endif
\ No newline at end of file
--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef SPARSE_CONV_OP_H_
-#define SPARSE_CONV_OP_H_
-#include <spconv/indice.h>
-#include <spconv/reordering.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-namespace spconv {
-enum ConvAlgo {
-  kNative = 0,
-  kBatch,
-  kBatchGemmGather,
-  kSparseConvNet,
-  kMinkowskiEngine
-};
-using all_conv_algos_t = tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather,
-                                       kSparseConvNet, kMinkowskiEngine>;
-// torch.jit's doc says only support int64, so we need to convert to int32.
-std::vector<torch::Tensor>
-getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
-               std::vector<int64_t> outSpatialShape,
-               std::vector<int64_t> spatialShape,
-               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-               std::vector<int64_t> padding, std::vector<int64_t> dilation,
-               std::vector<int64_t> outPadding, int64_t _subM,
-               int64_t _transpose, int64_t _useHash);
-torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM,
-                         int64_t algo);
-std::vector<torch::Tensor>
-indiceConvBackward(torch::Tensor features, torch::Tensor filters,
-                   torch::Tensor outGrad, torch::Tensor indicePairs,
-                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
-                   int64_t algo);
-} // namespace spconv
-#endif
\ No newline at end of file
--- a/include/spgemm/gemm.h
+++ b/include/spgemm/gemm.h
-#pragma once
-#include <cutlass/gemm/device/gemm.h>
-#include <type_traits>
-namespace spconv {
-template <typename T>
-using determine_acc_t =
-    std::conditional_t<std::is_same<T, cutlass::half_t>::value, float, T>;
-template <typename T, bool TransA, bool TransB, bool TransC>
-cudaError_t cutlassGemm(cudaStream_t s, int M, int N, int K, T alpha,
-                        T const *A, int lda, T const *B, int ldb, T beta, T *C,
-                        int ldc) {
-  // Define type definition for single-precision CUTLASS GEMM with column-major
-  // input matrices and 128x128x8 threadblock tile size (chosen by default).
-  //
-  // To keep the interface manageable, several helpers are defined for plausible
-  // compositions including the following example for single-precision GEMM.
-  // Typical values are used as default template arguments. See
-  // `cutlass/gemm/device/default_gemm_configuration.h` for more details.
-  //
-  // To view the full gemm device API interface, see
-  // `cutlass/gemm/device/gemm.h`
-  using TAcc = determine_acc_t<T>;
-  using ColumnMajor = cutlass::layout::ColumnMajor;
-  using RowMajor = cutlass::layout::RowMajor;
-  using LayoutA = std::conditional_t<TransA, ColumnMajor, RowMajor>;
-  using LayoutB = std::conditional_t<TransB, ColumnMajor, RowMajor>;
-  using LayoutC = std::conditional_t<TransC, ColumnMajor, RowMajor>;
-  using CutlassGemm = cutlass::gemm::device::Gemm<T, // Data-type of A matrix
-                                                  LayoutA, // Layout of A matrix
-                                                  T, // Data-type of B matrix
-                                                  LayoutB, // Layout of B matrix
-                                                  T, // Data-type of C matrix
-                                                  LayoutC,
-                                                  TAcc>; // Layout of C matrix
-  // Define a CUTLASS GEMM type
-  CutlassGemm gemm_operator;
-  // Construct the CUTLASS GEMM arguments object.
-  //
-  // One of CUTLASS's design patterns is to define gemm argument objects that
-  // are constructible in host code and passed to kernels by value. These may
-  // include pointers, strides, scalars, and other arguments needed by Gemm and
-  // its components.
-  //
-  // The benefits of this pattern are (1.) a structured, composable strategy for
-  // passing host-constructible arguments to kernels and (2.) minimized
-  // initialization overhead on kernel entry.
-  //
-  typename CutlassGemm::Arguments args(
-      {M, N, K}, // Gemm Problem dimensions
-      {A, lda},  // Tensor-ref for source matrix A
-      {B, ldb},  // Tensor-ref for source matrix B
-      {C, ldc},  // Tensor-ref for source matrix C
-      {C, ldc},  // Tensor-ref for destination matrix D (may be different memory
-                 // than source C matrix)
-      {alpha, beta}); // Scalars used in the Epilogue
-  //
-  // Launch the CUTLASS GEMM kernel.
-  //
-  cutlass::Status status = gemm_operator(args, nullptr, s);
-  //
-  // Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
-  //
-  if (status != cutlass::Status::kSuccess) {
-    return cudaErrorUnknown;
-  }
-  // Return success, if no errors were encountered.
-  return cudaSuccess;
-}
-} // namespace spconv
--- a/include/spgemm/gemm_th.h
+++ b/include/spgemm/gemm_th.h
-#pragma once
-#include <cuda_runtime_api.h>
-#include <tensorview/torch_utils.h>
-#include <torch/script.h>
-namespace spconv {
-void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b);
-void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
-                    torch::Tensor b);
-} // namespace spconv
\ No newline at end of file
--- a/include/sphash/hashmap.h
+++ b/include/sphash/hashmap.h
-#include <tensorview/tensor.h>
-namespace spconv {
-enum HashTypes { kDenseMap = 0, kCUDPPHash = 1 };
-template <int Impl> struct HashMap;
-template <> struct HashMap<kDenseMap> {};
-} // namespace spconv
\ No newline at end of file
--- a/include/tensorrt/inference.h
+++ b/include/tensorrt/inference.h
-#include "NvInfer.h"
-#include <memory>
-#include <tensorview/tensor.h>
-#include <unordered_map>
-#include <vector>
-namespace trt {
-template <typename T> tv::DType trt_dtype_to_tv(T trt_dtype) {
-  switch (trt_dtype) {
-  case nvinfer1::DataType::kFLOAT:
-    return tv::float32;
-  case nvinfer1::DataType::kHALF:
-    return tv::float16;
-  case nvinfer1::DataType::kINT32:
-    return tv::int32;
-  case nvinfer1::DataType::kINT8:
-    return tv::int8;
-  default:;
-  }
-  TV_THROW_INVALID_ARG("unknown trt dtype");
-}
-struct InferDeleter {
-  template <typename T> void operator()(T *obj) const {
-    if (obj) {
-      obj->destroy();
-    }
-  }
-};
-template <typename T> using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
-class Logger : public nvinfer1::ILogger {
-public:
-  Logger(Severity severity = Severity::kWARNING)
-      : reportableSeverity(severity) {}
-  void log(Severity severity, const char *msg) override {
-    // suppress messages with severity enum value greater than the reportable
-    if (severity > reportableSeverity)
-      return;
-    switch (severity) {
-    case Severity::kINTERNAL_ERROR:
-      std::cerr << "INTERNAL_ERROR: ";
-      break;
-    case Severity::kERROR:
-      std::cerr << "ERROR: ";
-      break;
-    case Severity::kWARNING:
-      std::cerr << "WARNING: ";
-      break;
-    case Severity::kINFO:
-      std::cerr << "INFO: ";
-      break;
-    default:
-      std::cerr << "UNKNOWN: ";
-      break;
-    }
-    std::cerr << msg << std::endl;
-  }
-  Severity reportableSeverity;
-};
-class InferenceContext {
-public:
-  explicit InferenceContext(const std::string &engine_bin, int device)
-      : logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
-    TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
-    int deviceCount;
-    cudaGetDeviceCount(&deviceCount);
-    if (device >= deviceCount) {
-      TV_THROW_INVALID_ARG("you provide device ", device, " but you only have ",
-                           deviceCount, " device.");
-    }
-    cudaSetDevice(device);
-    auto runtime = trt_unique_ptr_t<nvinfer1::IRuntime>(
-        nvinfer1::createInferRuntime(logger_));
-    engine_ =
-        trt_unique_ptr_t<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(
-            engine_bin.c_str(), engine_bin.size(), nullptr));
-    ctx_ = trt_unique_ptr_t<nvinfer1::IExecutionContext>(
-        engine_->createExecutionContext());
-    max_batch_size_ = engine_->getMaxBatchSize();
-    for (int i = 0; i < engine_->getNbBindings(); ++i) {
-      auto dims = engine_->getBindingDimensions(i);
-      std::vector<int> shape_vec(dims.d, dims.d + dims.nbDims);
-      shape_vec.insert(shape_vec.begin(), {max_batch_size_});
-      tv::TensorShape shape(shape_vec);
-      std::string name = engine_->getBindingName(i);
-      auto trt_dtype = engine_->getBindingDataType(i);
-      auto tv_dtype = trt_dtype_to_tv(trt_dtype);
-      bool isInput = engine_->bindingIsInput(i);
-      name_to_idx_[name] = i;
-      idx_to_name_[i] = name;
-      name_to_host_mem_.insert({name, tv::Tensor(shape, tv_dtype, -1)});
-      name_to_dev_mem_.insert({name, tv::Tensor(shape, tv_dtype, 0)});
-      if (isInput)
-        inp_idxes_.push_back(i);
-      else
-        out_idxes_.push_back(i);
-      bindings_.push_back(name_to_dev_mem_[name].raw_data());
-    }
-    checkCudaErrors(cudaStreamCreate(&stream_));
-  }
-  // Runs one inference pass.
-  //
-  // `inputs` holds one tensor per input binding, in the order the input
-  // bindings were recorded in inp_idxes_. All tensors must share the same
-  // first (batch) dimension, which must not exceed max_batch_size_, and their
-  // remaining dimensions must match the pre-allocated device buffers.
-  //
-  // Inputs are copied into the device buffers on stream_, the engine is
-  // enqueued, every output buffer is copied back to its host buffer and the
-  // stream is synchronized before returning.
-  //
-  // Returns a map from output binding name to slice_first_axis(0, bs) of the
-  // matching host buffer.
-  // Throws std::invalid_argument for a wrong input count, batch size or shape
-  // and std::runtime_error when the engine refuses to enqueue the work.
-  std::unordered_map<std::string, tv::Tensor>
-  operator()(std::vector<tv::Tensor> inputs) {
-    TV_ASSERT_INVALID_ARG(inputs.size() == inp_idxes_.size(), "must provide",
-                          inp_idxes_.size(), "inputs, but got", inputs.size());
-    // inputs[0] is read below, so reject an empty input list up front.
-    TV_ASSERT_INVALID_ARG(!inputs.empty(), "at least one input is required");
-    // inference batch size
-    int bs = inputs[0].dim(0);
-    for (auto &inp : inputs) {
-      TV_ASSERT_INVALID_ARG(inp.dim(0) == bs,
-                            "batch sizes of all input must same");
-    }
-    TV_ASSERT_INVALID_ARG(bs <= max_batch_size_, "your batchsize too large", bs,
-                          max_batch_size_);
-    for (std::size_t i = 0; i < inputs.size(); ++i) {
-      // The i-th input belongs to the i-th *input* binding. Going through
-      // inp_idxes_ keeps this correct even when input bindings are not the
-      // first bindings of the engine.
-      auto &dev_mem = name_to_dev_mem_[idx_to_name_[inp_idxes_[i]]];
-      auto shape_inp = inputs[i].shape().subshape(1);
-      auto shape_dev = dev_mem.shape().subshape(1);
-      TV_ASSERT_INVALID_ARG(shape_inp == shape_dev,
-                            "shape except batch must same", shape_inp,
-                            shape_dev);
-      dev_mem.slice_first_axis(0, bs).copy_(inputs[i].slice_first_axis(0, bs),
-                                            stream_);
-    }
-    // enqueue reports failure through its return value; do not hand back
-    // stale output buffers when it fails.
-    bool enqueued = ctx_->enqueue(bs, bindings_.data(), stream_, nullptr);
-    TV_ASSERT_RT_ERR(enqueued, "TensorRT enqueue failed");
-    for (int i : out_idxes_) {
-      name_to_host_mem_[idx_to_name_[i]].slice_first_axis(0, bs).copy_(
-          name_to_dev_mem_[idx_to_name_[i]].slice_first_axis(0, bs), stream_);
-    }
-    // The copies above were issued on stream_; wait for them before the host
-    // buffers are handed to the caller.
-    checkCudaErrors(cudaStreamSynchronize(stream_));
-    std::unordered_map<std::string, tv::Tensor> output_map;
-    for (int out_idx : out_idxes_) {
-      const std::string &name = idx_to_name_[out_idx];
-      output_map[name] = name_to_host_mem_[name].slice_first_axis(0, bs);
-    }
-    return output_map;
-  }
-  // Runs inference with inputs addressed by binding name.
-  //
-  // Every key of `inputs` must name an input binding of the engine and every
-  // input binding must be supplied. The tensors are reordered to match
-  // inp_idxes_ and forwarded to the vector overload.
-  //
-  // Throws std::invalid_argument when a name is unknown, names an output
-  // binding, or when not all inputs were provided.
-  std::unordered_map<std::string, tv::Tensor>
-  operator()(std::unordered_map<std::string, tv::Tensor> inputs) {
-    std::vector<tv::Tensor> inputs_vec(inp_idxes_.size());
-    std::size_t count = 0;
-    for (auto &p : inputs) {
-      auto iter = name_to_idx_.find(p.first);
-      TV_ASSERT_INVALID_ARG(iter != name_to_idx_.end(), "cant find your name",
-                            p.first);
-      // Translate the binding index into a position inside inp_idxes_, which
-      // is the order the vector overload expects. Indexing inputs_vec with the
-      // raw binding index would run out of bounds whenever an output binding
-      // precedes an input binding.
-      std::size_t slot = inp_idxes_.size();
-      for (std::size_t k = 0; k < inp_idxes_.size(); ++k) {
-        if (inp_idxes_[k] == iter->second) {
-          slot = k;
-          break;
-        }
-      }
-      TV_ASSERT_INVALID_ARG(slot < inp_idxes_.size(), p.first,
-                            "is not an input binding");
-      inputs_vec[slot] = p.second;
-      // Map keys are unique, so this counts distinct input bindings.
-      ++count;
-    }
-    TV_ASSERT_INVALID_ARG(count == inp_idxes_.size(), "your inp not enough");
-    return (*this)(inputs_vec);
-  }
-  // Returns the host buffer registered under binding `name`.
-  // Throws std::invalid_argument when no binding has that name.
-  tv::Tensor operator[](std::string name) {
-    const auto found = name_to_host_mem_.find(name);
-    const bool missing = (found == name_to_host_mem_.end());
-    if (missing) {
-      TV_THROW_INVALID_ARG(name, "not found.");
-    }
-    return found->second;
-  }
-  // Builds a multi-line description: a header with device_, followed by one
-  // "name[dtype]: shape" line per input binding and per output binding. The
-  // dtype and shape are read from the host buffers.
-  std::string repr() {
-    std::stringstream out;
-    out << "InferenceContext[gpu=" << device_ << "]";
-    // Writes `title`, then one line for every binding index in `idxes`.
-    auto describe = [&](const char *title, const std::vector<int> &idxes) {
-      out << title;
-      for (int idx : idxes) {
-        const std::string binding = idx_to_name_[idx];
-        auto &mem = name_to_host_mem_[binding];
-        out << "\n    " << binding << "["
-            << tv::detail::typeString(mem.dtype()) << "]: " << mem.shape();
-      }
-    };
-    describe("\n  Inputs:", inp_idxes_);
-    describe("\n  Outputs:", out_idxes_);
-    return out.str();
-  }
-private:
-  Logger logger_;
-  trt_unique_ptr_t<nvinfer1::ICudaEngine> engine_;
-  trt_unique_ptr_t<nvinfer1::IExecutionContext> ctx_;
-  std::unordered_map<std::string, tv::Tensor> name_to_dev_mem_;
-  std::unordered_map<std::string, tv::Tensor> name_to_host_mem_;
-  std::unordered_map<std::string, int> name_to_idx_;
-  std::unordered_map<int, std::string> idx_to_name_;
-  std::vector<int> inp_idxes_;
-  std::vector<int> out_idxes_;
-  std::vector<void *> bindings_;
-  cudaStream_t stream_;
-  int max_batch_size_;
-  int device_;
-};
-} // namespace trt
--- a/include/tensorview/cc17.h
+++ b/include/tensorview/cc17.h
-/*
-From PyTorch:
-Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
-Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
-Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
-Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
-Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
-Copyright (c) 2011-2013 NYU                      (Clement Farabet)
-Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
-Iain Melvin, Jason Weston) Copyright (c) 2006      Idiap Research Institute
-(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
-Samy Bengio, Johnny Mariethoz)
-From Caffe2:
-Copyright (c) 2016-present, Facebook Inc. All rights reserved.
-All contributions by Facebook:
-Copyright (c) 2016 Facebook Inc.
-All contributions by Google:
-Copyright (c) 2015 Google Inc.
-All rights reserved.
-All contributions by Yangqing Jia:
-Copyright (c) 2015 Yangqing Jia
-All rights reserved.
-All contributions from Caffe:
-Copyright(c) 2013, 2014, 2015, the respective contributors
-All rights reserved.
-All other contributions:
-Copyright(c) 2015, 2016 the respective contributors
-All rights reserved.
-Caffe2 uses a copyright model similar to Caffe: each contributor holds
-copyright over their contributions to Caffe2. The project versioning records
-all such contribution and copyright details. If a contributor wants to further
-mark their specific copyright on a particular contribution, they should
-indicate their copyright solely in the commit message of the change when it is
-committed.
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-1. Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the distribution.
-3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
-America and IDIAP Research Institute nor the names of its contributors may be
-   used to endorse or promote products derived from this software without
-   specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-*/
-#pragma once
-#include <type_traits>
-#include <utility>
-namespace tv {
-// tv::void_t maps any list of well-formed types to void; it is used for
-// SFINAE-based detection. Both branches accept a parameter pack so that code
-// written against one branch also compiles against the other.
-#ifdef __cpp_lib_void_t
-template <class... Ts> using void_t = std::void_t<Ts...>;
-#else
-// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t
-// (it takes CWG1558 into account and also works for older compilers)
-template <typename... Ts> struct make_void { typedef void type; };
-template <typename... Ts> using void_t = typename make_void<Ts...>::type;
-#endif
-namespace detail {
-// Callable that returns its argument unchanged (perfectly forwarded). It is
-// handed to callbacks so that expressions wrapped in it become dependent on a
-// template parameter and are only type-checked when the callback is
-// instantiated.
-struct _identity final {
-  template <class T> using type_identity = T;
-  template <class T> decltype(auto) operator()(T &&arg) {
-    return std::forward<T>(arg);
-  }
-};
-// Trait: true_type when `Func` can be called with a single _identity
-// argument, false_type otherwise (primary template below is the fallback).
-template <class Func, class Enable = void>
-struct function_takes_identity_argument : std::false_type {};
-#if defined(_MSC_VER)
-// For some weird reason, MSVC shows a compiler error when using tv::void_t
-// instead of std::void_t. But we're only building on MSVC versions that have
-// std::void_t, so let's just use that one.
-template <class Func>
-struct function_takes_identity_argument<
-    Func, std::void_t<decltype(std::declval<Func>()(_identity()))>>
-    : std::true_type {};
-#else
-template <class Func>
-struct function_takes_identity_argument<
-    Func, void_t<decltype(std::declval<Func>()(_identity()))>>
-    : std::true_type {};
-#endif
-// C++14 dispatcher behind tv::if_constexpr. The bool selects which callback
-// is invoked; each specialization has two SFINAE-selected overloads, one that
-// passes _identity to the callback and one that calls it without arguments.
-template <bool Condition> struct _if_constexpr;
-// Condition == true: only thenCallback is invoked; elseCallback is ignored.
-template <> struct _if_constexpr<true> final {
-  template <
-      class ThenCallback, class ElseCallback,
-      std::enable_if_t<function_takes_identity_argument<ThenCallback>::value,
-                       void *> = nullptr>
-  static decltype(auto) call(ThenCallback &&thenCallback,
-                             ElseCallback && /* elseCallback */) {
-    // The _identity instance passed in can be used to delay evaluation of an
-    // expression, because the compiler can't know that it's just the identity
-    // we're passing in.
-    return thenCallback(_identity());
-  }
-  template <
-      class ThenCallback, class ElseCallback,
-      std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value,
-                       void *> = nullptr>
-  static decltype(auto) call(ThenCallback &&thenCallback,
-                             ElseCallback && /* elseCallback */) {
-    return thenCallback();
-  }
-};
-// Condition == false: only elseCallback is invoked; thenCallback is ignored.
-template <> struct _if_constexpr<false> final {
-  template <
-      class ThenCallback, class ElseCallback,
-      std::enable_if_t<function_takes_identity_argument<ElseCallback>::value,
-                       void *> = nullptr>
-  static decltype(auto) call(ThenCallback && /* thenCallback */,
-                             ElseCallback &&elseCallback) {
-    // The _identity instance passed in can be used to delay evaluation of an
-    // expression, because the compiler can't know that it's just the identity
-    // we're passing in.
-    return elseCallback(_identity());
-  }
-  template <
-      class ThenCallback, class ElseCallback,
-      std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value,
-                       void *> = nullptr>
-  static decltype(auto) call(ThenCallback && /* thenCallback */,
-                             ElseCallback &&elseCallback) {
-    return elseCallback();
-  }
-};
-} // namespace detail
-/*
- * Get something like C++17 if constexpr in C++14.
- *
- * Example 1: simple constexpr if/then/else
- *   template<int arg> int increment_absolute_value() {
- *     int result = arg;
- *     if_constexpr<(arg > 0)>(
- *       [&] { ++result; },  // then-case
- *       [&] { --result; }  // else-case
- *     );
- *     return result;
- *   }
- *
- * Example 2: without else case (i.e. conditionally prune code from assembly)
- *   template<int arg> int decrement_if_positive() {
- *     int result = arg;
- *     if_constexpr<(arg > 0)>(
- *       // This decrement operation is only present in the assembly for
- *       // template instances with arg > 0.
- *       [&] { --result; }
- *     );
- *     return result;
- *   }
- *
- * Example 3: branch based on type (i.e. replacement for SFINAE)
- *   struct MyClass1 {int value;};
- *   struct MyClass2 {int val;};
- *   template <class T>
- *   int func(T t) {
- *     return if_constexpr<std::is_same<T, MyClass1>::value>(
- *       [&](auto _) { return _(t).value; }, // this code is invalid for T ==
- * MyClass2, so a regular non-constexpr if statement wouldn't compile
- *       [&](auto _) { return _(t).val; }    // this code is invalid for T ==
- * MyClass1
- *     );
- *   }
- *
- * Note: The _ argument passed in Example 3 is the identity function, i.e. it
- * does nothing. It is used to force the compiler to delay type checking,
- * because the compiler doesn't know what kind of _ is passed in. Without it,
- * the compiler would fail when you try to access t.value but the member doesn't
- * exist.
- *
- * Note: In Example 3, both branches return int, so func() returns int. This is
- * not necessary. If func() had a return type of "auto", then both branches
- * could return different types, say func<MyClass1>() could return int and
- * func<MyClass2>() could return string.
- */
-// Two-branch form: invokes thenCallback when Condition is true and
-// elseCallback otherwise, and returns whatever the invoked callback returns.
-// A callback that accepts a detail::_identity argument receives one; any other
-// callback is called without arguments.
-template <bool Condition, class ThenCallback, class ElseCallback>
-decltype(auto) if_constexpr(ThenCallback &&thenCallback,
-                            ElseCallback &&elseCallback) {
-#if defined(__cpp_if_constexpr)
-  // If we have C++17, just use its "if constexpr" feature instead of wrapping
-  // it. This will give us better error messages.
-  if constexpr (Condition) {
-    if constexpr (detail::function_takes_identity_argument<
-                      ThenCallback>::value) {
-      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
-    } else {
-      return std::forward<ThenCallback>(thenCallback)();
-    }
-  } else {
-    if constexpr (detail::function_takes_identity_argument<
-                      ElseCallback>::value) {
-      return std::forward<ElseCallback>(elseCallback)(detail::_identity());
-    } else {
-      return std::forward<ElseCallback>(elseCallback)();
-    }
-  }
-#else
-  // C++14 implementation of if constexpr
-  return detail::_if_constexpr<Condition>::call(
-      std::forward<ThenCallback>(thenCallback),
-      std::forward<ElseCallback>(elseCallback));
-#endif
-}
-// One-branch form: invokes thenCallback only when Condition is true. In the
-// C++14 path the missing else branch is filled with an empty generic lambda.
-template <bool Condition, class ThenCallback>
-decltype(auto) if_constexpr(ThenCallback &&thenCallback) {
-#if defined(__cpp_if_constexpr)
-  // If we have C++17, just use its "if constexpr" feature instead of wrapping
-  // it. This will give us better error messages.
-  if constexpr (Condition) {
-    if constexpr (detail::function_takes_identity_argument<
-                      ThenCallback>::value) {
-      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
-    } else {
-      return std::forward<ThenCallback>(thenCallback)();
-    }
-  }
-#else
-  // C++14 implementation of if constexpr
-  return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback),
-                                 [](auto) {});
-#endif
-}
-} // namespace tv
--- a/include/tensorview/common.h
+++ b/include/tensorview/common.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <iostream>
-#include <sstream>
-#ifdef TV_USE_STACKTRACE
-#if defined(WIN32) || defined(_WIN32) ||                                       \
-    defined(__WIN32) && !defined(__CYGWIN__)
-#define BOOST_STACKTRACE_USE_WINDBG
-#else
-// require linking with -ldl and -lbacktrace in linux
-#define BOOST_STACKTRACE_USE_BACKTRACE
-#endif
-#include <boost/stacktrace.hpp>
-#endif
-namespace tv {
-// Base case: streams a single value into `ss` with operator<<.
-template <class SStream, class T> void sstream_print(SStream &ss, T val) {
-  ss << val;
-}
-// Recursive case: streams `val`, one space, then the remaining arguments, so
-// values end up separated by single spaces with no trailing space.
-template <class SStream, class T, class... TArgs>
-void sstream_print(SStream &ss, T val, TArgs... args) {
-  ss << val;
-  ss << " ";
-  sstream_print(ss, args...);
-}
-// Formats all arguments (space separated, via sstream_print) into a temporary
-// buffer and writes the result to std::cout followed by std::endl.
-template <class... TArgs> void ssprint(TArgs... args) {
-  std::stringstream buffer;
-  sstream_print(buffer, args...);
-  const std::string line = buffer.str();
-  std::cout << line << std::endl;
-}
-// TV_BACKTRACE_PRINT(ss): appends a boost stacktrace to stream `ss` when
-// TV_USE_STACKTRACE is defined; expands to nothing otherwise.
-#ifdef TV_USE_STACKTRACE
-#define TV_BACKTRACE_PRINT(ss)                                                 \
-  ss << std::endl << boost::stacktrace::stacktrace();
-#else
-#define TV_BACKTRACE_PRINT(ss)
-#endif
-// TV_THROW_RT_ERR(...): throws std::runtime_error. The message starts with
-// "<file> <line>\n" followed by the arguments joined by single spaces.
-// NOTE(review): the expansion is a bare { } block, so `if (c) MACRO(...); else`
-// does not compile; wrap the call in braces at such call sites.
-#define TV_THROW_RT_ERR(...)                                                   \
-  {                                                                            \
-    std::stringstream __macro_s;                                               \
-    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
-    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
-    TV_BACKTRACE_PRINT(__macro_s);                                             \
-    throw std::runtime_error(__macro_s.str());                                 \
-  }
-// TV_THROW_INVALID_ARG(...): same message layout as TV_THROW_RT_ERR, but
-// throws std::invalid_argument.
-#define TV_THROW_INVALID_ARG(...)                                              \
-  {                                                                            \
-    std::stringstream __macro_s;                                               \
-    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
-    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
-    TV_BACKTRACE_PRINT(__macro_s);                                             \
-    throw std::invalid_argument(__macro_s.str());                              \
-  }
-// TV_ASSERT_RT_ERR(expr, ...): when `expr` evaluates to false, throws
-// std::runtime_error. The message holds "<file> <line>\n", the stringified
-// expression, and the remaining arguments joined by single spaces. At least
-// one message argument is needed, since they are forwarded to
-// tv::sstream_print.
-#define TV_ASSERT_RT_ERR(expr, ...)                                            \
-  {                                                                            \
-    if (!(expr)) {                                                             \
-      std::stringstream __macro_s;                                             \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << #expr << " assert faild. ";                                 \
-      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
-      TV_BACKTRACE_PRINT(__macro_s);                                           \
-      throw std::runtime_error(__macro_s.str());                               \
-    }                                                                          \
-  }
-// TV_ASSERT_INVALID_ARG(expr, ...): same check and message layout as
-// TV_ASSERT_RT_ERR, but throws std::invalid_argument.
-#define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
-  {                                                                            \
-    if (!(expr)) {                                                             \
-      std::stringstream __macro_s;                                             \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << #expr << " assert faild. ";                                 \
-      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
-      TV_BACKTRACE_PRINT(__macro_s);                                           \
-      throw std::invalid_argument(__macro_s.str());                            \
-    }                                                                          \
-  }
-} // namespace tv
\ No newline at end of file
--- a/include/tensorview/cuda_utils.h
+++ b/include/tensorview/cuda_utils.h
-#pragma once
-// from pytorch.aten
-#include "tensorview.h"
-#include <type_traits>
-namespace tv {
-namespace cuda {
-// Integer division of `a` by `b`, rounded up for positive operands. The
-// result is converted to int.
-template <typename T1, typename T2> inline int DivUp(const T1 a, const T2 b) {
-  const auto padded = a + b - 1;
-  return padded / b;
-}
-// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
-// CUDA: number of threads per block for N work items. N is rounded up to a
-// multiple of 32 and capped at CUDA_NUM_THREADS.
-inline int getNumThreads(const int N) {
-  constexpr int kRoundTo = 32;
-  return N > CUDA_NUM_THREADS ? CUDA_NUM_THREADS
-                              : DivUp(N, kRoundTo) * kRoundTo;
-}
-// CUDA: number of blocks needed to cover N work items when each block runs
-// getNumThreads(N) threads. Throws std::runtime_error unless N is positive.
-inline int getBlocks(const int N) {
-  TV_ASSERT_RT_ERR(N > 0,
-                   "CUDA kernel launch blocks must be positive, but got N=", N);
-  const int threads_per_block = getNumThreads(N);
-  return DivUp(N, threads_per_block);
-}
-} // namespace cuda
-} // namespace tv
\ No newline at end of file
--- a/include/tensorview/eigen_utils.h
+++ b/include/tensorview/eigen_utils.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include "tensor.h"
-#include "tensorview.h"
-#include <eigen3/Eigen/Dense>
-namespace tv {
-// Wraps a 1-D or 2-D TensorView in a row-major Eigen::Map without copying
-// (the map is constructed from view.data()).
-//
-// A 2-D view of shape (r, c) maps to an r x c matrix; a 1-D view of length n
-// maps to a single row of n columns. When Row / Col are fixed at compile
-// time, the mapped extents must match them.
-//
-// Throws std::invalid_argument when the view is not 1-D or 2-D, or when a
-// fixed Row / Col disagrees with the mapped shape.
-template <typename T, int Row = Eigen::Dynamic, int Col = Eigen::Dynamic>
-Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>>
-tv2eigen(TensorView<T> view) {
-  TV_ASSERT_INVALID_ARG(view.ndim() <= 2 && view.ndim() > 0, "error");
-  // Derive both extents from the rank first: a 1-D view has no dim(1), so it
-  // must never be queried for one.
-  const bool is_matrix = view.ndim() == 2;
-  const int row = is_matrix ? view.dim(0) : 1;
-  const int col = is_matrix ? view.dim(1) : view.dim(0);
-  if (Row != Eigen::Dynamic) {
-    TV_ASSERT_INVALID_ARG(row == Row, "error");
-  }
-  if (Col != Eigen::Dynamic) {
-    TV_ASSERT_INVALID_ARG(col == Col, "error");
-  }
-  Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>> eigen_map(
-      view.data(), row, col);
-  return eigen_map;
-}
-} // namespace tv
--- a/include/tensorview/kernel_utils.h
+++ b/include/tensorview/kernel_utils.h
-#pragma once
-// from tensorflow
-namespace tv {
-namespace detail {
-// Device-side range over the indices begin, begin + delta, begin + 2 * delta,
-// ... that stops once the index is no longer below `end`. Meant for
-// range-based for loops inside kernels.
-template <typename T> class KernelLoop {
-  // Iterator over the range. An iterator built with delta == 0 acts as the
-  // end sentinel.
-  struct Iterator {
-    __forceinline__ __device__ Iterator(T index, T delta)
-        : index_(index), delta_(delta) {}
-    __forceinline__ __device__ T operator*() const { return index_; }
-    __forceinline__ __device__ Iterator &operator++() {
-      index_ += delta_;
-      return *this;
-    }
-    // Compared against an end sentinel, this is true only while the index is
-    // still below the sentinel's index, so stepping past `end` terminates the
-    // loop even when `end` is never hit exactly.
-    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
-      bool greater = index_ > other.index_;
-      bool less = index_ < other.index_;
-      // Anything past an end iterator (delta_ == 0) is equal.
-      // In range-based for loops, this optimizes to 'return less'.
-      if (!other.delta_) {
-        return less;
-      }
-      if (!delta_) {
-        return greater;
-      }
-      return less || greater;
-    }
-  private:
-    T index_;
-    const T delta_;
-  };
-public:
-  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
-      : begin_(begin), delta_(delta), end_(end) {}
-  __forceinline__ __device__ Iterator begin() const {
-    return Iterator{begin_, delta_};
-  }
-  // End sentinel: carries the exclusive upper bound and a zero delta.
-  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
-private:
-  T begin_;
-  T delta_;
-  T end_;
-};
-} // namespace detail
-// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
-// Usage: for(int i : KernelLoopX(count)) { visit(i); }
-// The first index is this thread's global x index and the step is
-// gridDim.x * blockDim.x * NumILP.
-// NOTE(review): with NumILP > 1 the loop itself skips indices; presumably the
-// loop body is expected to cover them - confirm at call sites.
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
-  return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
-                               gridDim.x * blockDim.x * NumILP, count);
-}
-// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
-// Usage: for(int i : KernelLoopY(count)) { visit(i); }
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
-  return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
-                               gridDim.y * blockDim.y * NumILP, count);
-}
-// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
-// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
-  return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
-                               gridDim.z * blockDim.z * NumILP, count);
-}
-} // namespace tv
\ No newline at end of file
--- a/include/tensorview/mp_helper.h
+++ b/include/tensorview/mp_helper.h
-#ifndef MP_HELPER_H_
-#define MP_HELPER_H_
-#include <type_traits>
-#include <utility>
-namespace tv {
-// Compile-time list of types.
-template <class... T> struct mp_list {};
-// List of std::integral_constant<T, I> for each value I.
-template <class T, T... I>
-using mp_list_c = mp_list<std::integral_constant<T, I>...>;
-// Shorthand for mp_list_c with T = int.
-template <int... I>
-using mp_list_int_c = mp_list<std::integral_constant<int, I>...>;
-namespace detail {
-// Calls f(Ts()) once per list element. The calls sit inside a braced
-// initializer list, which fixes their evaluation order to the list order.
-// Returns the forwarded functor.
-template <class... Ts, class F>
-constexpr F mp_for_each_impl(mp_list<Ts...>, F &&f) {
-  return (void)(std::initializer_list<int>{(f(Ts()), 0)...}),
-         std::forward<F>(f);
-}
-// Empty list: nothing to call, just hand the functor back.
-template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
-  return std::forward<F>(f);
-}
-} // namespace detail
-// Number of types in the pack, as a std::integral_constant<std::size_t, N>.
-template <class... T>
-using mp_length = std::integral_constant<std::size_t, sizeof...(T)>;
-namespace detail {
-template <class A, template <class...> class B> struct mp_rename_impl {
-  // An error "no type named 'type'" here means that the first argument to
-  // mp_rename is not a list
-};
-// For A<T...>, exposes B<T...> as `type`.
-template <template <class...> class A, class... T, template <class...> class B>
-struct mp_rename_impl<A<T...>, B> {
-  using type = B<T...>;
-};
-} // namespace detail
-// Replaces the outer template of list A with B: A<T...> becomes B<T...>.
-template <class A, template <class...> class B>
-using mp_rename = typename detail::mp_rename_impl<A, B>::type;
-// Number of elements of list L.
-template <class L> using mp_size = mp_rename<L, mp_length>;
-// Invokes f with a default-constructed instance of every element type of L
-// and returns f.
-template <class L, class F> constexpr F mp_for_each(F &&f) {
-  return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
-}
-} // namespace tv
-#endif
\ No newline at end of file
--- a/include/tensorview/prettyprint.h
+++ b/include/tensorview/prettyprint.h
-//          Copyright Louis Delacroix 2010 - 2014.
-// Distributed under the Boost Software License, Version 1.0.
-//    (See accompanying file LICENSE_1_0.txt or copy at
-//          http://www.boost.org/LICENSE_1_0.txt)
-//
-// A pretty printing library for C++
-//
-// Usage:
-// Include this header, and operator<< will "just work".
-#ifndef H_PRETTY_PRINT
-#define H_PRETTY_PRINT
-#include <cstddef>
-#include <iterator>
-#include <memory>
-#include <ostream>
-#include <set>
-#include <tuple>
-#include <type_traits>
-#include <unordered_set>
-#include <utility>
-#include <valarray>
-namespace pretty_print {
-namespace detail {
-// SFINAE type trait to detect whether T::const_iterator exists.
-// `yes` and `no` have different sizes, so sizeof on the selected overload's
-// return type reveals which overload was chosen.
-struct sfinae_base {
-  using yes = char;
-  using no = yes[2];
-};
-template <typename T> struct has_const_iterator : private sfinae_base {
-private:
-  // Viable only when C::const_iterator names a type; otherwise the variadic
-  // fallback is picked.
-  template <typename C> static yes &test(typename C::const_iterator *);
-  template <typename C> static no &test(...);
-public:
-  static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
-  using type = T;
-};
-// Detects const member functions begin() / end() that return
-// T::const_iterator; the results are reported separately as beg_value and
-// end_value.
-template <typename T> struct has_begin_end : private sfinae_base {
-private:
-  // Viable only when &C::begin converts to
-  // `C::const_iterator (C::*)() const`.
-  template <typename C>
-  static yes &
-  f(typename std::enable_if<
-      std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
-                                            const>(&C::begin)),
-                   typename C::const_iterator (C::*)() const>::value>::type *);
-  template <typename C> static no &f(...);
-  // Same check for &C::end.
-  template <typename C>
-  static yes &
-  g(typename std::enable_if<
-      std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
-                                            const>(&C::end)),
-                   typename C::const_iterator (C::*)() const>::value,
-      void>::type *);
-  template <typename C> static no &g(...);
-public:
-  static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
-  static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
-};
-} // namespace detail
-// Holds the delimiter values for a specific character type
-// (text written before, between and after the elements). The printing code
-// skips any of the three that is a null pointer.
-template <typename TChar> struct delimiters_values {
-  using char_type = TChar;
-  const char_type *prefix;
-  const char_type *delimiter;
-  const char_type *postfix;
-};
-// Defines the delimiter values for a specific container and character type
-// This primary template only declares `values`; the definitions come from the
-// partial specializations for char and wchar_t.
-template <typename T, typename TChar> struct delimiters {
-  using type = delimiters_values<TChar>;
-  static const type values;
-};
-// Functor to print containers. You can use this directly if you want
-// to specify a non-default delimiters type. The printing logic can
-// be customized by specializing the nested template.
-template <typename T, typename TChar = char,
-          typename TCharTraits = ::std::char_traits<TChar>,
-          typename TDelimiters = delimiters<T, TChar>>
-struct print_container_helper {
-  using delimiters_type = TDelimiters;
-  using ostream_type = std::basic_ostream<TChar, TCharTraits>;
-  template <typename U> struct printer {
-    // Streams every element of `c`, writing the delimiter (when it is not
-    // null) between neighbouring elements.
-    static void print_body(const U &c, ostream_type &stream) {
-      using std::begin;
-      using std::end;
-      auto cur = begin(c);
-      const auto last = end(c);
-      if (cur == last) {
-        return;
-      }
-      stream << *cur;
-      for (++cur; cur != last; ++cur) {
-        if (delimiters_type::values.delimiter != nullptr)
-          stream << delimiters_type::values.delimiter;
-        stream << *cur;
-      }
-    }
-  };
-  // Keeps a reference only: the container must outlive this helper.
-  print_container_helper(const T &container) : container_(container) {}
-  // Writes prefix, body and postfix; a null prefix or postfix is skipped.
-  inline void operator()(ostream_type &stream) const {
-    const auto &marks = delimiters_type::values;
-    if (marks.prefix != nullptr)
-      stream << marks.prefix;
-    printer<T>::print_body(container_, stream);
-    if (marks.postfix != nullptr)
-      stream << marks.postfix;
-  }
-private:
-  const T &container_;
-};
-// Specialization for pairs
-// Prints `first`, the delimiter (when it is not null), then `second`.
-template <typename T, typename TChar, typename TCharTraits,
-          typename TDelimiters>
-template <typename T1, typename T2>
-struct print_container_helper<T, TChar, TCharTraits,
-                              TDelimiters>::printer<std::pair<T1, T2>> {
-  using ostream_type =
-      typename print_container_helper<T, TChar, TCharTraits,
-                                      TDelimiters>::ostream_type;
-  static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
-    stream << c.first;
-    if (print_container_helper<T, TChar, TCharTraits,
-                               TDelimiters>::delimiters_type::values
-            .delimiter != NULL)
-      stream << print_container_helper<T, TChar, TCharTraits,
-                                       TDelimiters>::delimiters_type::values
-                    .delimiter;
-    stream << c.second;
-  }
-};
-// Specialization for tuples
-// Elements are printed by overload recursion on the tag type Int<I>: element
-// 0 is written without a leading delimiter, every later element is preceded
-// by the delimiter, and Int<sizeof...(Args)> ends the recursion.
-template <typename T, typename TChar, typename TCharTraits,
-          typename TDelimiters>
-template <typename... Args>
-struct print_container_helper<T, TChar, TCharTraits,
-                              TDelimiters>::printer<std::tuple<Args...>> {
-  using ostream_type =
-      typename print_container_helper<T, TChar, TCharTraits,
-                                      TDelimiters>::ostream_type;
-  using element_type = std::tuple<Args...>;
-  // Tag type carrying the index of the next element to print.
-  template <std::size_t I> struct Int {};
-  static void print_body(const element_type &c, ostream_type &stream) {
-    tuple_print(c, stream, Int<0>());
-  }
-  // Terminator: the index is one past the last element.
-  static void tuple_print(const element_type &, ostream_type &,
-                          Int<sizeof...(Args)>) {}
-  // First element. For an empty tuple the tag parameter becomes
-  // std::nullptr_t, so this overload does not collide with the terminator
-  // above (which is then also Int<0>).
-  static void
-  tuple_print(const element_type &c, ostream_type &stream,
-              typename std::conditional<sizeof...(Args) != 0, Int<0>,
-                                        std::nullptr_t>::type) {
-    stream << std::get<0>(c);
-    tuple_print(c, stream, Int<1>());
-  }
-  // Remaining elements: delimiter (when it is not null), then element N.
-  template <std::size_t N>
-  static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
-    if (print_container_helper<T, TChar, TCharTraits,
-                               TDelimiters>::delimiters_type::values
-            .delimiter != NULL)
-      stream << print_container_helper<T, TChar, TCharTraits,
-                                       TDelimiters>::delimiters_type::values
-                    .delimiter;
-    stream << std::get<N>(c);
-    tuple_print(c, stream, Int<N + 1>());
-  }
-};
-// Prints a print_container_helper to the specified stream.
-// Delegates to the helper's call operator and returns the stream so that
-// insertions can be chained.
-template <typename T, typename TChar, typename TCharTraits,
-          typename TDelimiters>
-inline std::basic_ostream<TChar, TCharTraits> &operator<<(
-    std::basic_ostream<TChar, TCharTraits> &stream,
-    const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
-  helper(stream);
-  return stream;
-}
-// Basic is_container template; specialize to derive from std::true_type for all
-// desired container types
-// The primary template is true when T has a const_iterator type and const
-// begin() / end() members returning it.
-template <typename T>
-struct is_container
-    : public std::integral_constant<bool,
-                                    detail::has_const_iterator<T>::value &&
-                                        detail::has_begin_end<T>::beg_value &&
-                                        detail::has_begin_end<T>::end_value> {};
-// Built-in arrays count as containers ...
-template <typename T, std::size_t N>
-struct is_container<T[N]> : std::true_type {};
-// ... except char arrays, which are excluded.
-template <std::size_t N> struct is_container<char[N]> : std::false_type {};
-template <typename T> struct is_container<std::valarray<T>> : std::true_type {};
-// Pairs and tuples are treated as containers as well.
-template <typename T1, typename T2>
-struct is_container<std::pair<T1, T2>> : std::true_type {};
-template <typename... Args>
-struct is_container<std::tuple<Args...>> : std::true_type {};
-// Default delimiters for any type without a more specific specialization:
-// "[" prefix, ", " separator, "]" postfix, for narrow and wide characters.
-template <typename Container> struct delimiters<Container, char> {
-  static const delimiters_values<char> values;
-};
-template <typename Container>
-const delimiters_values<char> delimiters<Container, char>::values = {
-    "[", ", ", "]"};
-template <typename Container> struct delimiters<Container, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename Container>
-const delimiters_values<wchar_t> delimiters<Container, wchar_t>::values = {
-    L"[", L", ", L"]"};
-// Delimiters for the set family (set, multiset, unordered_set,
-// unordered_multiset): "{" prefix, ", " separator, "}" postfix.
-// --- std::set ---
-template <typename Key, typename Compare, typename Alloc>
-struct delimiters<::std::set<Key, Compare, Alloc>, char> {
-  static const delimiters_values<char> values;
-};
-template <typename Key, typename Compare, typename Alloc>
-const delimiters_values<char>
-    delimiters<::std::set<Key, Compare, Alloc>, char>::values = {"{", ", ",
-                                                                 "}"};
-template <typename Key, typename Compare, typename Alloc>
-struct delimiters<::std::set<Key, Compare, Alloc>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename Key, typename Compare, typename Alloc>
-const delimiters_values<wchar_t>
-    delimiters<::std::set<Key, Compare, Alloc>, wchar_t>::values = {
-        L"{", L", ", L"}"};
-// --- std::multiset ---
-template <typename Key, typename Compare, typename Alloc>
-struct delimiters<::std::multiset<Key, Compare, Alloc>, char> {
-  static const delimiters_values<char> values;
-};
-template <typename Key, typename Compare, typename Alloc>
-const delimiters_values<char>
-    delimiters<::std::multiset<Key, Compare, Alloc>, char>::values = {
-        "{", ", ", "}"};
-template <typename Key, typename Compare, typename Alloc>
-struct delimiters<::std::multiset<Key, Compare, Alloc>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename Key, typename Compare, typename Alloc>
-const delimiters_values<wchar_t>
-    delimiters<::std::multiset<Key, Compare, Alloc>, wchar_t>::values = {
-        L"{", L", ", L"}"};
-// --- std::unordered_set ---
-template <typename Key, typename Hash, typename KeyEq, typename Alloc>
-struct delimiters<::std::unordered_set<Key, Hash, KeyEq, Alloc>, char> {
-  static const delimiters_values<char> values;
-};
-template <typename Key, typename Hash, typename KeyEq, typename Alloc>
-const delimiters_values<char>
-    delimiters<::std::unordered_set<Key, Hash, KeyEq, Alloc>, char>::values = {
-        "{", ", ", "}"};
-template <typename Key, typename Hash, typename KeyEq, typename Alloc>
-struct delimiters<::std::unordered_set<Key, Hash, KeyEq, Alloc>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename Key, typename Hash, typename KeyEq, typename Alloc>
-const delimiters_values<wchar_t>
-    delimiters<::std::unordered_set<Key, Hash, KeyEq, Alloc>,
-               wchar_t>::values = {L"{", L", ", L"}"};
-// --- std::unordered_multiset ---
-template <typename Key, typename Hash, typename KeyEq, typename Alloc>
-struct delimiters<::std::unordered_multiset<Key, Hash, KeyEq, Alloc>, char> {
-  static const delimiters_values<char> values;
-};
-template <typename Key, typename Hash, typename KeyEq, typename Alloc>
-const delimiters_values<char>
-    delimiters<::std::unordered_multiset<Key, Hash, KeyEq, Alloc>,
-               char>::values = {"{", ", ", "}"};
-template <typename Key, typename Hash, typename KeyEq, typename Alloc>
-struct delimiters<::std::unordered_multiset<Key, Hash, KeyEq, Alloc>,
-                  wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename Key, typename Hash, typename KeyEq, typename Alloc>
-const delimiters_values<wchar_t>
-    delimiters<::std::unordered_multiset<Key, Hash, KeyEq, Alloc>,
-               wchar_t>::values = {L"{", L", ", L"}"};
-// Delimiters for std::pair and std::tuple: "(" prefix, ", " separator,
-// ")" postfix, for narrow and wide characters.
-template <typename First, typename Second>
-struct delimiters<::std::pair<First, Second>, char> {
-  static const delimiters_values<char> values;
-};
-template <typename First, typename Second>
-const delimiters_values<char>
-    delimiters<::std::pair<First, Second>, char>::values = {"(", ", ", ")"};
-template <typename First, typename Second>
-struct delimiters<::std::pair<First, Second>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename First, typename Second>
-const delimiters_values<wchar_t>
-    delimiters<::std::pair<First, Second>, wchar_t>::values = {L"(", L", ",
-                                                               L")"};
-template <typename... Elems>
-struct delimiters<::std::tuple<Elems...>, char> {
-  static const delimiters_values<char> values;
-};
-template <typename... Elems>
-const delimiters_values<char>
-    delimiters<::std::tuple<Elems...>, char>::values = {"(", ", ", ")"};
-template <typename... Elems>
-struct delimiters<::std::tuple<Elems...>, wchar_t> {
-  static const delimiters_values<wchar_t> values;
-};
-template <typename... Elems>
-const delimiters_values<wchar_t>
-    delimiters<::std::tuple<Elems...>, wchar_t>::values = {L"(", L", ", L")"};
-// Type-erased interface behind custom delimiters.
-// It only offers std::ostream and std::wostream overloads, so it requires
-// TChar = char or wchar_t with TCharTraits = std::char_traits<TChar>, and
-// MyDelims must be defined for that TChar.
-// Usage: "cout << pretty_print::custom_delims<MyDelims>(x)".
-struct custom_delims_base {
-  virtual ~custom_delims_base() = default;
-  // Write the wrapped value to a narrow stream and return that stream.
-  virtual std::ostream &stream(::std::ostream &) = 0;
-  // Write the wrapped value to a wide stream and return that stream.
-  virtual std::wostream &stream(::std::wostream &) = 0;
-};
-// Concrete custom_delims_base that keeps a reference to a value of type T and
-// prints it through print_container_helper with the Delims delimiter set.
-// Only a reference is stored, so the value must outlive this wrapper.
-template <typename T, typename Delims>
-struct custom_delims_wrapper : custom_delims_base {
-  custom_delims_wrapper(const T &value) : t(value) {}
-  // Narrow-character output.
-  std::ostream &stream(std::ostream &os) override {
-    using helper_t =
-        print_container_helper<T, char, std::char_traits<char>, Delims>;
-    return os << helper_t(t);
-  }
-  // Wide-character output.
-  std::wostream &stream(std::wostream &os) override {
-    using helper_t =
-        print_container_helper<T, wchar_t, std::char_traits<wchar_t>, Delims>;
-    return os << helper_t(t);
-  }
-private:
-  const T &t;
-};
-// Front end for custom delimiters: wraps any container in a heap-allocated
-// custom_delims_wrapper and owns it through the type-erased base pointer.
-template <typename Delims> struct custom_delims {
-  template <typename Container>
-  custom_delims(const Container &container)
-      : base(new custom_delims_wrapper<Container, Delims>(container)) {}
-  // Owning pointer to the wrapper; dereferenced by the matching operator<<.
-  std::unique_ptr<custom_delims_base> base;
-};
-// Streams a custom_delims object by delegating to its type-erased wrapper.
-template <typename CharT, typename Traits, typename Delims>
-inline auto operator<<(std::basic_ostream<CharT, Traits> &os,
-                       const custom_delims<Delims> &wrapped)
-    -> std::basic_ostream<CharT, Traits> & {
-  return wrapped.base->stream(os);
-}
-// Non-owning view over a C-style array given as pointer plus element count,
-// exposing begin()/end() so it can be iterated like a container.
-// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
-template <typename T> struct array_wrapper_n {
-  using const_iterator = const T *;
-  using value_type = T;
-  array_wrapper_n(const T *const a, size_t n) : first_(a), count_(n) {}
-  // Pointer to the first element.
-  inline const_iterator begin() const { return first_; }
-  // Pointer one past the last element.
-  inline const_iterator end() const { return first_ + count_; }
-private:
-  const T *const first_;
-  size_t count_;
-};
-// View over a single bucket of a hash-table based container, built on the
-// container's local (per-bucket) iterators.
-// Usage: std::cout << bucket_print(m, 4) << std::endl;  (prints the bucket
-// with index 4, i.e. the fifth bucket, of container m.)
-template <typename T> struct bucket_print_wrapper {
-  using const_iterator = typename T::const_local_iterator;
-  using size_type = typename T::size_type;
-  bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
-  // First element of the selected bucket.
-  const_iterator begin() const { return m_map.cbegin(n); }
-  // One past the last element of the selected bucket.
-  const_iterator end() const { return m_map.cend(n); }
-private:
-  const T &m_map;    // referenced container; must outlive this view
-  const size_type n; // index of the bucket being viewed
-};
-} // namespace pretty_print
-// Global convenience accessors for the wrappers in namespace pretty_print.
-// pretty_print_array: wrap a pointer-plus-length array for printing.
-template <typename T>
-inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
-                                                           size_t n) {
-  using wrapper_t = pretty_print::array_wrapper_n<T>;
-  return wrapper_t(a, n);
-}
-// bucket_print: wrap bucket number n of hash container m for printing.
-template <typename T>
-pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
-                                                   typename T::size_type n) {
-  using wrapper_t = pretty_print::bucket_print_wrapper<T>;
-  return wrapper_t(m, n);
-}
-// Main entry point: an operator<< overload placed into namespace std.
-// Can we do better?
-namespace std {
-// Enabled only for types where ::pretty_print::is_container is true; prints
-// the container to the stream with the default delimiters for that type.
-template <typename T, typename TChar, typename TCharTraits>
-inline typename enable_if<::pretty_print::is_container<T>::value,
-                          basic_ostream<TChar, TCharTraits> &>::type
-operator<<(basic_ostream<TChar, TCharTraits> &os, const T &value) {
-  using helper_t =
-      ::pretty_print::print_container_helper<T, TChar, TCharTraits>;
-  return os << helper_t(value);
-}
-} // namespace std
-#endif // H_PRETTY_PRINT
--- a/include/tensorview/pybind_utils.h
+++ b/include/tensorview/pybind_utils.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include "tensor.h"
-#include "tensorview.h"
-#include <algorithm>
-#include <array>
-#include <iostream>
-#include <pybind11/functional.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-namespace py = pybind11;
-namespace tv {
-// True when the array's flags include py::array::c_style.
-template <typename Tarr> bool is_c_style(const Tarr &arr) {
-  const auto c_style_bits = arr.flags() & py::array::c_style;
-  return c_style_bits != 0;
-}
-// Build a mutable TensorView over the buffer of a c-contiguous py::array_t.
-// No data is copied. When Rank >= 0 the array must have exactly Rank
-// dimensions. Violations are reported through TV_ASSERT_INVALID_ARG.
-template <typename T, int Rank = -1>
-TensorView<T, Rank> arrayt2tv(py::array_t<T> arr) {
-  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
-  Shape dims;
-  for (int axis = 0; axis < arr.ndim(); ++axis) {
-    dims.push_back(arr.shape(axis));
-  }
-  if (Rank >= 0) {
-    TV_ASSERT_INVALID_ARG(dims.ndim() == Rank, "error");
-  }
-  T *const buffer = arr.mutable_data();
-  return TensorView<T, Rank>(buffer, dims);
-}
-// Build a read-only TensorView over the buffer of a c-contiguous py::array_t.
-// No data is copied. When Rank >= 0 the array must have exactly Rank
-// dimensions. Violations are reported through TV_ASSERT_INVALID_ARG.
-// The return type now carries Rank, matching the view that is constructed
-// (the previous declaration dropped it; for the default Rank of -1 the type is
-// unchanged).
-template <typename T, int Rank = -1>
-TensorView<const T, Rank> carrayt2tv(py::array_t<T> arr) {
-  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
-  Shape shape;
-  for (int i = 0; i < arr.ndim(); ++i) {
-    shape.push_back(arr.shape(i));
-  }
-  if (Rank >= 0) {
-    TV_ASSERT_INVALID_ARG(shape.ndim() == Rank, "error");
-  }
-  return TensorView<const T, Rank>(arr.data(), shape);
-}
-// Map a numpy array's dtype to the matching tv::DType, using the dtype kind
-// character ('b' bool, 'i' signed, 'u' unsigned, 'f' float) and the item size
-// in bytes. Any unrecognised kind/size pair ends in TV_THROW_RT_ERR.
-template <typename Tarr> tv::DType get_array_tv_dtype(const Tarr &arr) {
-  const auto kind = arr.dtype().kind();
-  const auto itemsize = arr.itemsize();
-  switch (kind) {
-  case 'b':
-    return tv::bool_;
-  case 'i':
-    switch (itemsize) {
-    case 1:
-      return tv::int8;
-    case 2:
-      return tv::int16;
-    case 4:
-      return tv::int32;
-    case 8:
-      return tv::int64;
-    default:
-      break;
-    }
-    // Unsupported signed size: leave the outer switch instead of falling
-    // through into the unsigned cases.
-    break;
-  case 'u':
-    switch (itemsize) {
-    case 1:
-      return tv::uint8;
-    case 2:
-      return tv::uint16;
-    case 4:
-      return tv::uint32;
-    case 8:
-      return tv::uint64;
-    default:
-      break;
-    }
-    // Unsupported unsigned size: do not fall through into the float cases.
-    break;
-  case 'f':
-    switch (itemsize) {
-    case 2:
-      return tv::float16;
-    case 4:
-      return tv::float32;
-    case 8:
-      return tv::float64;
-    default:
-      break;
-    }
-    break;
-  default:
-    break;
-  }
-  TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
-}
-// Wrap a c-contiguous numpy array as a tv::Tensor via tv::from_blob (device
-// -1). The dtype is derived at run time with get_array_tv_dtype.
-template <typename Tarr> Tensor array2tensor(Tarr &arr) {
-  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
-  TensorShape dims;
-  for (int axis = 0; axis < arr.ndim(); ++axis) {
-    dims.push_back(arr.shape(axis));
-  }
-  const auto dtype = get_array_tv_dtype(arr);
-  return tv::from_blob(arr.mutable_data(), dims, dtype, -1);
-}
-// Wrap a c-contiguous py::array_t<T> as a tv::Tensor via tv::from_blob (device
-// -1). The dtype is taken from T at compile time through tv::type_v.
-template <typename T> Tensor arrayt2tensor(py::array_t<T> &arr) {
-  TV_ASSERT_INVALID_ARG(is_c_style(arr), "array must be c-contiguous array");
-  TensorShape dims;
-  for (int axis = 0; axis < arr.ndim(); ++axis) {
-    dims.push_back(arr.shape(axis));
-  }
-  T *const buffer = arr.mutable_data();
-  return tv::from_blob(buffer, dims, tv::type_v<T>, -1);
-}
-// Translate a tv dtype tag into the py::dtype built from the matching numpy
-// type name. Unknown tags are reported through TV_THROW_INVALID_ARG.
-template <typename TDType> py::dtype tv_dtype_to_py(TDType d) {
-  // Pick the numpy type name first; nullptr means "no mapping".
-  const char *numpy_name = nullptr;
-  switch (d) {
-  case float16:
-    numpy_name = "float16";
-    break;
-  case float32:
-    numpy_name = "float32";
-    break;
-  case float64:
-    numpy_name = "float64";
-    break;
-  case int8:
-    numpy_name = "int8";
-    break;
-  case int16:
-    numpy_name = "int16";
-    break;
-  case int32:
-    numpy_name = "int32";
-    break;
-  case int64:
-    numpy_name = "int64";
-    break;
-  case uint8:
-    numpy_name = "uint8";
-    break;
-  case uint16:
-    numpy_name = "uint16";
-    break;
-  case uint32:
-    numpy_name = "uint32";
-    break;
-  case uint64:
-    numpy_name = "uint64";
-    break;
-  case bool_:
-    numpy_name = "bool_";
-    break;
-  default:
-    break;
-  }
-  if (numpy_name != nullptr) {
-    return py::dtype(numpy_name);
-  }
-  TV_THROW_INVALID_ARG("unknown dtype", d);
-}
-// Convert a CPU tensor into a new numpy array.
-// This is a template only so that it can be defined in a header.
-// Must not be called while the GIL is released. Non-CPU tensors (device other
-// than -1) are rejected through TV_ASSERT_INVALID_ARG.
-template <typename Ttensor> py::array tensor2array(Ttensor &tensor) {
-  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
-  auto shape = tensor.shape();
-  // Keep the extents 64-bit wide: storing them as int would truncate any
-  // dimension of 2^31 elements or more.
-  std::vector<int64_t> shape_vec(shape.begin(), shape.end());
-  auto dtype = tv_dtype_to_py(tensor.dtype());
-  // Constructing the py::array copies the content from the pointer. That is
-  // intended: ownership cannot be transferred from the C++ tv::Tensor to the
-  // numpy array, since the C++ object may be deleted first.
-  return py::array(dtype, shape_vec, {}, tensor.raw_data());
-}
-} // namespace tv
--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/*
-tv::Tensor is a lightweight, header-only tensor container without template
-parameters or annoying dependencies. No algorithms are implemented.
-It should only be used when you want a simple, non-template container but
-don't want to link with libtorch.
-If you can use libtorch, don't use tv::Tensor.
-*/
-#pragma once
-#include "cc17.h"
-#include "mp_helper.h"
-#include "tensorview.h"
-#include <cstring>
-#include <iomanip>
-#include <memory>
-#include <type_traits>
-#ifdef TV_CUDA
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#include <cuda_runtime_api.h>
-#endif
-namespace tv {
-// Element type tags. The numeric values are spelled out explicitly; they equal
-// the values the compiler would assign implicitly for this declaration order,
-// so the order must not be changed.
-enum DType {
-  float32 = 0,
-  int32 = 1,
-  int16 = 2,
-  int8 = 3,
-  float64 = 4,
-  bool_ = 5,
-  uint8 = 6,
-  float16 = 7,
-  int64 = 8,
-  uint16 = 9,
-  uint32 = 10,
-  uint64 = 11
-};
-namespace detail {
-// Compile-time list of every DType tag, in declaration order.
-using dtype_collection_t =
-    tv::mp_list_c<int, float32, int32, int16, int8, float64, bool_, uint8,
-                  float16, int64, uint16, uint32, uint64>;
-// Tuple of the C++ element types handled by the tensor code.
-// NOTE(review): both preprocessor branches currently list the same types.
-#ifdef TV_CUDA
-using all_tensor_types_t =
-    std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
-               uint16_t, uint32_t, uint64_t, bool>;
-#else
-using all_tensor_types_t =
-    std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
-               uint16_t, uint32_t, uint64_t, bool>;
-#endif
-// Owning (or, for from-blob storage, borrowing) buffer of `size` elements of
-// type T.
-//   device == -1 : host memory, allocated with new[] or, when pinned, with
-//                  cudaMallocHost.
-//   otherwise    : CUDA memory, allocated with cudaMalloc or, when managed,
-//                  with cudaMallocManaged. The caller is expected to have
-//                  selected the CUDA device beforehand.
-// Storage built from an external pointer is never freed by this class.
-template <typename T> class TensorStorage {
-public:
-  // Allocate `size` elements. A size of zero leaves the storage empty.
-  // Throws (TV_THROW_INVALID_ARG) when pinned or device memory is requested
-  // in a build without TV_CUDA.
-  TensorStorage(size_t size, int device = -1, bool managed = false,
-                bool pinned = false)
-      : mSize(size), device_(device), managed_(managed), pinned_(pinned) {
-    if (size == 0) {
-      mPtr = nullptr;
-    } else {
-      if (device == -1) {
-        if (pinned_) {
-#ifdef TV_CUDA
-          checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
-#else
-          TV_THROW_INVALID_ARG("you need to define TV_CUDA to use pinned");
-#endif
-        } else {
-          mPtr = new T[size];
-        }
-      } else {
-#ifdef TV_CUDA
-        // we should select device in external
-        /*
-        int deviceCount;
-        cudaGetDeviceCount(&deviceCount);
-        if (device >= deviceCount) {
-          TV_THROW_INVALID_ARG("you provide device ", device,
-                               " but you only have ", deviceCount, " device.");
-        }
-        cudaSetDevice(device);
-        */
-        if (managed) {
-          checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
-        } else {
-          checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
-        }
-#else
-        TV_THROW_INVALID_ARG("don't compiled with cuda");
-#endif
-      }
-    }
-  }
-  // Wrap an existing buffer without taking ownership (from-blob storage).
-  TensorStorage(T *ptr, size_t size, int device)
-      : mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}
-  // Release the buffer with the deallocator matching its allocation. Empty
-  // and from-blob storage is left untouched.
-  virtual ~TensorStorage() {
-    if (empty()) {
-      return;
-    }
-    if (from_blob_) {
-      return;
-    }
-    if (device_ == -1) {
-      if (pinned_) {
-#ifdef TV_CUDA
-        cudaFreeHost(mPtr);
-#endif
-      } else {
-        delete[] mPtr;
-      }
-    } else {
-#ifdef TV_CUDA
-      cudaFree(mPtr);
-#endif
-    }
-  };
-  // Number of elements (not bytes).
-  inline size_t size() const { return mSize; }
-  T *data() { return mPtr; }
-  const T *data() const { return mPtr; }
-  bool empty() const { return mPtr == nullptr || mSize == 0; }
-  bool managed() const { return managed_; }
-  bool pinned() const { return pinned_; }
-  int device() const { return device_; }
-  // Set every byte of the buffer to zero.
-  // mSize counts elements, so the byte count is mSize * sizeof(T). Earlier
-  // code passed mSize (host) and mSize / sizeof(T) (device), which cleared
-  // only part of the buffer for any T wider than one byte.
-  void zero_() {
-    const size_t num_bytes = mSize * sizeof(T);
-    if (device_ == -1) {
-      // Skip empty storage: memset must not be handed a null pointer.
-      if (!empty()) {
-        std::memset(data(), 0, num_bytes);
-      }
-    } else {
-#ifdef TV_CUDA
-      checkCudaErrors(cudaMemset(data(), 0, num_bytes));
-#else
-      TV_THROW_INVALID_ARG("don't compiled with cuda");
-#endif
-    }
-  }
-private:
-  size_t mSize = 0;        // element count
-  T *mPtr = nullptr;       // buffer start; nullptr when empty
-  bool from_blob_ = false; // true when the buffer is owned by someone else
-  int device_ = -1;        // -1 for host memory, otherwise a CUDA device
-  bool managed_ = false;   // allocated with cudaMallocManaged
-  bool pinned_ = false;    // host memory allocated with cudaMallocHost
-};
-// Size in bytes of one element of the given dtype. float16 is hard-coded to 2
-// bytes. Unsupported tags are reported through TV_THROW_RT_ERR.
-template <typename T> size_t sizeof_dtype(T dtype) {
-  switch (dtype) {
-  // floating point
-  case float16:
-    return 2;
-  case float32:
-    return sizeof(float);
-  case float64:
-    return sizeof(double);
-  // signed integers
-  case int8:
-    return sizeof(int8_t);
-  case int16:
-    return sizeof(int16_t);
-  case int32:
-    return sizeof(int32_t);
-  case int64:
-    return sizeof(int64_t);
-  // unsigned integers
-  case uint8:
-    return sizeof(uint8_t);
-  case uint16:
-    return sizeof(uint16_t);
-  case uint32:
-    return sizeof(uint32_t);
-  case uint64:
-    return sizeof(uint64_t);
-  // boolean
-  case bool_:
-    return sizeof(bool);
-  default:
-    TV_THROW_RT_ERR("unsupported dtype");
-  }
-  return 0;
-}
-// Human-readable name of a dtype tag. float16 is reported as "half"; unknown
-// tags yield an empty string.
-template <typename T> std::string typeString(T t) {
-  switch (t) {
-  // floating point
-  case DType::float16:
-    return "half";
-  case DType::float32:
-    return "float32";
-  case DType::float64:
-    return "float64";
-  // signed integers
-  case DType::int8:
-    return "int8";
-  case DType::int16:
-    return "int16";
-  case DType::int32:
-    return "int32";
-  case DType::int64:
-    return "int64";
-  // unsigned integers
-  case DType::uint8:
-    return "uint8";
-  case DType::uint16:
-    return "uint16";
-  case DType::uint32:
-    return "uint32";
-  case DType::uint64:
-    return "uint64";
-  // boolean
-  case DType::bool_:
-    return "bool";
-  default:
-    return "";
-  }
-}
-// Compile-time map from a C++ element type to its DType tag, exposed as the
-// static member `dtype`. Only the types listed below are mapped; any other
-// type leaves the primary template undefined and fails to compile.
-template <typename T> struct TypeToDtypeTraits;
-// A const-qualified type maps to the same tag as the unqualified type. This
-// single partial specialization replaces the hand-written `const` copy of
-// every mapping, so the two lists can no longer drift apart.
-template <typename T>
-struct TypeToDtypeTraits<const T> : TypeToDtypeTraits<T> {};
-template <> struct TypeToDtypeTraits<int32_t> {
-  static constexpr DType dtype = int32;
-};
-#ifdef TV_CUDA
-// __half is only declared when the CUDA headers are included.
-template <> struct TypeToDtypeTraits<__half> {
-  static constexpr DType dtype = float16;
-};
-#endif
-template <> struct TypeToDtypeTraits<float> {
-  static constexpr DType dtype = float32;
-};
-template <> struct TypeToDtypeTraits<double> {
-  static constexpr DType dtype = float64;
-};
-template <> struct TypeToDtypeTraits<int16_t> {
-  static constexpr DType dtype = int16;
-};
-template <> struct TypeToDtypeTraits<int8_t> {
-  static constexpr DType dtype = int8;
-};
-template <> struct TypeToDtypeTraits<int64_t> {
-  static constexpr DType dtype = int64;
-};
-template <> struct TypeToDtypeTraits<uint8_t> {
-  static constexpr DType dtype = uint8;
-};
-template <> struct TypeToDtypeTraits<uint16_t> {
-  static constexpr DType dtype = uint16;
-};
-template <> struct TypeToDtypeTraits<uint32_t> {
-  static constexpr DType dtype = uint32;
-};
-template <> struct TypeToDtypeTraits<uint64_t> {
-  static constexpr DType dtype = uint64;
-};
-template <> struct TypeToDtypeTraits<bool> {
-  static constexpr DType dtype = bool_;
-};
-} // namespace detail
-// Variable-template shorthand: type_v<T> is the DType tag mapped to T by
-// detail::TypeToDtypeTraits.
-template <class T>
-constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
-// Call f with a default-constructed value of the first type in Ts whose dtype
-// tag equals t. Returns true when a type matched (f was called), false
-// otherwise.
-template <class... Ts, typename F> bool dispatch_noexcept(DType t, F &&f) {
-  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
-  bool matched = false;
-  mp_for_each<mp_list<Ts...>>([t, &matched, &f](auto I) {
-    using candidate_t = decltype(I);
-    // Only the first matching candidate is dispatched.
-    if (!matched && type_v<candidate_t> == t) {
-      std::forward<F>(f)(candidate_t());
-      matched = true;
-    }
-  });
-  return matched;
-}
-// Like dispatch_noexcept, but reports a failed match through TV_THROW_RT_ERR
-// with a message naming the requested dtype and every available type.
-template <class... Ts, typename F> void dispatch(DType t, F &&f) {
-  if (dispatch_noexcept<Ts...>(t, std::forward<F>(f))) {
-    return;
-  }
-  // No candidate matched: collect the candidate names for the error message.
-  std::stringstream ss;
-  mp_for_each<mp_list<Ts...>>([&ss](auto I) {
-    ss << detail::TypeToString<decltype(I)>::value << " ";
-  });
-  TV_THROW_RT_ERR("unknown type", detail::typeString(t),
-                  ", available:", ss.str());
-}
-// Call f with the integral-constant of the first candidate in Is that equals
-// idx. When none matches, TV_THROW_RT_ERR reports idx and all candidates.
-template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
-  static_assert(sizeof...(Is) > 0,
-                "you need to provide at least one candidate");
-  bool matched = false;
-  mp_for_each<mp_list_c<T, Is...>>([idx, &matched, &f](auto I) {
-    if (!matched && T(I) == idx) {
-      std::forward<F>(f)(I);
-      matched = true;
-    }
-  });
-  if (matched) {
-    return;
-  }
-  std::stringstream ss;
-  mp_for_each<mp_list_c<T, Is...>>([&ss](auto I) { ss << T(I) << " "; });
-  TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
-}
-// Call f with the integral-constant of the first candidate in Is equal to
-// idx. Returns true when a candidate matched, false otherwise.
-template <int... Is, typename F> bool dispatch_int_noexcept(int idx, F &&f) {
-  static_assert(sizeof...(Is) > 0,
-                "you need to provide at least one candidate");
-  bool matched = false;
-  mp_for_each<mp_list_c<int, Is...>>([idx, &matched, &f](auto I) {
-    if (!matched && decltype(I)::value == idx) {
-      std::forward<F>(f)(I);
-      matched = true;
-    }
-  });
-  return matched;
-}
-// Predicate variant: call f with the integral-constant of the first candidate
-// for which p(idx, candidate) is true. Returns true when one was found.
-template <int... Is, typename F, class BinaryPredicate>
-bool dispatch_int_noexcept(int idx, BinaryPredicate p, F &&f) {
-  static_assert(sizeof...(Is) > 0,
-                "you need to provide at least one candidate");
-  bool matched = false;
-  // p is captured by value, as before, so it is invoked on a const copy.
-  mp_for_each<mp_list_c<int, Is...>>([idx, p, &matched, &f](auto I) {
-    if (p(idx, decltype(I)::value) && !matched) {
-      std::forward<F>(f)(I);
-      matched = true;
-    }
-  });
-  return matched;
-}
-// Like dispatch_int_noexcept, but a failed match is reported through
-// TV_THROW_RT_ERR together with the list of available candidates.
-template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
-  if (dispatch_int_noexcept<Is...>(idx, std::forward<F>(f))) {
-    return;
-  }
-  std::stringstream ss;
-  mp_for_each<mp_list_c<int, Is...>>(
-      [&ss](auto I) { ss << decltype(I)::value << " "; });
-  TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
-}
-// Predicate variant of dispatch_int. The predicate is called as
-// p(idx, candidate); a failed match is reported through TV_THROW_RT_ERR
-// together with the list of available candidates.
-template <int... Is, typename F, class BinaryPredicate>
-void dispatch_int(int idx, BinaryPredicate p, F &&f) {
-  if (dispatch_int_noexcept<Is...>(idx, p, std::forward<F>(f))) {
-    return;
-  }
-  std::stringstream ss;
-  mp_for_each<mp_list_c<int, Is...>>(
-      [&ss](auto I) { ss << decltype(I)::value << " "; });
-  TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
-}
-// Ts is a pack of mp_list_c value lists. Call f with the first list in Ts
-// whose values equal the run-time sequence [begin, end) element by element and
-// in length. Returns true when such a list was found.
-template <class... Ts, typename Iterator, typename F>
-bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
-  static_assert(sizeof...(Ts) > 0,
-                "you need to provide at least one candidate");
-  bool matched = false;
-  mp_for_each<mp_list<Ts...>>([begin, end, &matched, &f](auto I) {
-    using candidate_t = decltype(I);
-    auto candidate_len = mp_size<candidate_t>::value;
-    bool same = true;
-    std::size_t visited = 0;
-    auto cursor = begin;
-    // Walk the candidate's values and the run-time sequence in lock step.
-    mp_for_each<candidate_t>([&](auto E) {
-      if (cursor == end || !same) {
-        return;
-      }
-      if (visited >= candidate_len) {
-        same = false;
-        return;
-      }
-      constexpr auto expected = decltype(E)::value;
-      if (expected != *cursor) {
-        same = false;
-      }
-      ++visited;
-      std::advance(cursor, 1);
-    });
-    // Both sequences must have been consumed completely.
-    if (visited != candidate_len || cursor != end) {
-      same = false;
-    }
-    if (same && !matched) {
-      std::forward<F>(f)(I);
-      matched = true;
-    }
-  });
-  return matched;
-}
-// Like dispatch_container_noexcept, but a failed match is reported through
-// TV_THROW_RT_ERR with a message listing the requested values and every
-// candidate list.
-template <class... Ts, typename Iterator, typename F>
-void dispatch_container(Iterator begin, Iterator end, F &&f) {
-  if (dispatch_container_noexcept<Ts...>(begin, end, std::forward<F>(f))) {
-    return;
-  }
-  std::stringstream ss;
-  ss << "unknown value [";
-  for (auto cursor = begin; cursor != end; ++cursor) {
-    ss << *cursor << ",";
-  }
-  ss << "], available: ";
-  mp_for_each<mp_list<Ts...>>([&ss](auto I) {
-    ss << "[";
-    mp_for_each<decltype(I)>(
-        [&ss](auto E) { ss << decltype(E)::value << ","; });
-    ss << "]";
-  });
-  TV_THROW_RT_ERR(ss.str());
-}
-/*
-template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
-  return dispatch_scalar<int, Is...>(idx, f);
-}
-*/
-// Functor form of dispatch: Dispatch<List<Ts...>>()(t, f) forwards to
-// dispatch<Ts...>(t, f), for any class template List holding the types.
-template <class T> struct Dispatch;
-template <template <class...> class List, class... Types>
-struct Dispatch<List<Types...>> {
-  template <typename F> inline void operator()(DType t, F &&f) {
-    dispatch<Types...>(t, std::forward<F>(f));
-  }
-};
-// Functor form of dispatch_container: the candidate lists are taken from the
-// template arguments of the wrapped list type.
-template <class T> struct DispatchContainer;
-template <template <class...> class List, class... Candidates>
-struct DispatchContainer<List<Candidates...>> {
-  template <typename Iterator, typename F>
-  inline void operator()(Iterator begin, Iterator end, F &&f) {
-    dispatch_container<Candidates...>(begin, end, std::forward<F>(f));
-  }
-};
-// Functor form of dispatch_container_noexcept: returns whether one of the
-// candidate lists matched the sequence [begin, end).
-template <class T> struct DispatchContainerNoexcept;
-template <template <class...> class List, class... Candidates>
-struct DispatchContainerNoexcept<List<Candidates...>> {
-  template <typename Iterator, typename F>
-  inline bool operator()(Iterator begin, Iterator end, F &&f) {
-    const bool matched = dispatch_container_noexcept<Candidates...>(
-        begin, end, std::forward<F>(f));
-    return matched;
-  }
-};
-// Functor form of dispatch_int.
-// The template argument must be a type list whose elements each expose an
-// integer `value`, e.g. type_container<std::integral_constant<int, value>...>;
-// tv::mp_list_c works as well.
-template <class T> struct DispatchInt;
-template <template <class...> class List, class... Constants>
-struct DispatchInt<List<Constants...>> {
-  // Exact-match dispatch.
-  template <typename F> inline void operator()(int t, F &&f) {
-    dispatch_int<Constants::value...>(t, std::forward<F>(f));
-  }
-  // Dispatch through a user-supplied predicate p(t, candidate).
-  template <typename F, typename BinaryPredicate>
-  inline void operator()(int t, BinaryPredicate p, F &&f) {
-    dispatch_int<Constants::value...>(t, p, std::forward<F>(f));
-  }
-};
-// Functor form of dispatch_int_noexcept: returns whether a candidate matched
-// instead of raising an error.
-template <class T> struct DispatchIntNoexcept;
-template <template <class...> class List, class... Constants>
-struct DispatchIntNoexcept<List<Constants...>> {
-  // Exact-match dispatch.
-  template <typename F> inline bool operator()(int t, F &&f) {
-    const bool matched =
-        dispatch_int_noexcept<Constants::value...>(t, std::forward<F>(f));
-    return matched;
-  }
-  // Dispatch through a user-supplied predicate p(t, candidate).
-  template <typename F, typename BinaryPredicate>
-  inline bool operator()(int t, BinaryPredicate p, F &&f) {
-    const bool matched =
-        dispatch_int_noexcept<Constants::value...>(t, p, std::forward<F>(f));
-    return matched;
-  }
-};
-// Capacity passed to ShapeBase for tensor shapes.
-constexpr size_t kTensorMaxDim = 10;
-// Shape/stride type used by tv::Tensor: 64-bit extents, at most
-// kTensorMaxDim entries.
-using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
-struct Tensor {
-  Tensor() {}
-  Tensor(TensorShape shape, TensorShape stride, DType dtype, int device = -1,
-         bool pinned = false, bool managed = false)
-      : dtype_(dtype) {
-    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
-    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
-        shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
-    shape_ = shape;
-    stride_ = stride;
-  }
-  Tensor(TensorShape shape, DType dtype, int device = -1, bool pinned = false,
-         bool managed = false)
-      : dtype_(dtype) {
-    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
-    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
-        shape.size() * detail::sizeof_dtype(dtype), device, managed, pinned);
-    shape_ = shape;
-    stride_ = shape.stride_rowmajor();
-  }
-  Tensor(void *ptr, TensorShape shape, TensorShape stride, DType dtype,
-         int device = -1)
-      : dtype_(dtype) {
-    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
-    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
-        reinterpret_cast<uint8_t *>(ptr),
-        shape.size() * detail::sizeof_dtype(dtype), device);
-    shape_ = shape;
-    stride_ = stride;
-  }
-  Tensor(void *ptr, TensorShape shape, DType dtype, int device = -1)
-      : dtype_(dtype) {
-    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
-    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
-        reinterpret_cast<uint8_t *>(ptr),
-        shape.size() * detail::sizeof_dtype(dtype), device);
-    shape_ = shape;
-    stride_ = shape.stride_rowmajor();
-  }
-  Tensor(const void *ptr, TensorShape shape, TensorShape stride, DType dtype,
-         int device = -1)
-      : dtype_(dtype), writeable_(false) {
-    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
-    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
-        reinterpret_cast<uint8_t *>(const_cast<void *>(ptr)),
-        shape.size() * detail::sizeof_dtype(dtype), device);
-    shape_ = shape;
-    stride_ = stride;
-  }
-  Tensor(const void *ptr, TensorShape shape, DType dtype, int device = -1)
-      : dtype_(dtype), writeable_(false) {
-    TV_ASSERT_INVALID_ARG(!shape.empty(), "dont support empty shape");
-    storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
-        reinterpret_cast<uint8_t *>(const_cast<void *>(ptr)),
-        shape.size() * detail::sizeof_dtype(dtype), device);
-    shape_ = shape;
-    stride_ = shape.stride_rowmajor();
-  }
-  /// 1-D int32 tensor holding the listed values.
-  Tensor(std::initializer_list<int32_t> init)
-      : Tensor({int(init.size())}, tv::int32) {
-    auto *dst = data<int32_t>();
-    for (auto it = init.begin(); it != init.end(); ++it, ++dst) {
-      *dst = *it;
-    }
-  }
-  /// 1-D int64 tensor holding the listed values.
-  Tensor(std::initializer_list<int64_t> init)
-      : Tensor({int(init.size())}, tv::int64) {
-    auto *dst = data<int64_t>();
-    for (auto it = init.begin(); it != init.end(); ++it, ++dst) {
-      *dst = *it;
-    }
-  }
-  /// 1-D float32 tensor holding the listed values.
-  Tensor(std::initializer_list<float> init)
-      : Tensor({int(init.size())}, tv::float32) {
-    auto *dst = data<float>();
-    for (auto it = init.begin(); it != init.end(); ++it, ++dst) {
-      *dst = *it;
-    }
-  }
-  /// 1-D float64 tensor holding the listed values.
-  Tensor(std::initializer_list<double> init)
-      : Tensor({int(init.size())}, tv::float64) {
-    auto *dst = data<double>();
-    for (auto it = init.begin(); it != init.end(); ++it, ++dst) {
-      *dst = *it;
-    }
-  }
-  /// Returns a mutable, statically-ranked TensorView over this tensor.
-  ///
-  /// Only participates in overload resolution when Rank > 0. Fails a runtime
-  /// assertion when the tensor is not writable, when T does not match the
-  /// stored dtype, or when Rank differs from the runtime rank. The rank check
-  /// mirrors the const overload; without it shape_/stride_ would be read past
-  /// their valid entries.
-  template <typename T, int Rank = -1,
-            template <class> class PtrTraits = DefaultPtrTraits,
-            typename Tindex = int,
-            typename std::enable_if<(Rank > 0), int>::type = 0>
-  TensorView<T, Rank, PtrTraits, Tindex> tview() {
-    using tv_shape_t =
-        typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
-    writable_check();
-    static_assert(Rank == -1 || Rank > 0, "error");
-    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-    TV_ASSERT_RT_ERR(Rank == ndim(), "error");
-    tv_shape_t shape(Rank), stride(Rank);
-    for (int i = 0; i < Rank; ++i) {
-      shape[i] = shape_[i];
-      stride[i] = stride_[i];
-    }
-    return TensorView<T, Rank, PtrTraits, Tindex>(
-        reinterpret_cast<T *>(data<T>()), shape, stride);
-  }
-  template <typename T, int Rank = -1, // const overload: yields a TensorView over const elements
-            template <class> class PtrTraits = DefaultPtrTraits,
-            typename Tindex = int>
-  TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
-  tview() const {
-    static_assert(Rank == -1 || Rank > 0, "error");
-    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error"); // T must match the stored dtype
-    return if_constexpr<(Rank > 0)>( // presumably runs the first lambda when Rank > 0, otherwise the second; verify against if_constexpr
-        [&](auto _) {
-          TV_ASSERT_RT_ERR(Rank == ndim(), "error"); // static rank must equal the runtime rank
-          ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank),
-              stride(Rank);
-          for (int i = 0; i < Rank; ++i) {
-            shape[i] = shape_[i];
-            stride[i] = stride_[i];
-          }
-          return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
-                            Tindex>(
-              reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
-              shape, stride);
-        },
-        [&](auto _) {
-          ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim()); // dynamic-rank branch: shape/stride sized from ndim()
-          for (int i = 0; i < int(ndim()); ++i) {
-            shape[i] = shape_[i];
-            stride[i] = stride_[i];
-          }
-          return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
-                            Tindex>(
-              reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
-              shape, stride);
-        });
-  }
-  /// Returns a tensor sharing this storage but with a new row-major shape.
-  ///
-  /// At most one entry of `newShapes` may be -1; it is inferred from the
-  /// total element count. Every other entry must be > 0. All entries are
-  /// validated before the inferred dimension is computed, so the division
-  /// below can never see a zero or negative divisor. Fails an
-  /// invalid-argument assertion on bad entries and a runtime assertion when
-  /// the new shape does not cover exactly size() elements.
-  template <class... Inds> Tensor view(Inds... newShapes) const {
-    static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
-    TensorShape shape{int(newShapes)...};
-    int infer_axis = -1;
-    for (size_t i = 0; i < shape.ndim(); ++i) {
-      if (shape[i] == -1) {
-        TV_ASSERT_INVALID_ARG(infer_axis == -1,
-                              "multiple -1 in your argument.");
-        infer_axis = int(i);
-      } else {
-        TV_ASSERT_INVALID_ARG(shape[i] > 0,
-                              "shape except -1 must larger than 0");
-      }
-    }
-    if (infer_axis >= 0) {
-      // Temporarily treat the unknown extent as 1 so shape.size() is the
-      // product of the known extents only.
-      shape[infer_axis] = 1;
-      shape[infer_axis] = size() / shape.size();
-    }
-    TV_ASSERT_RT_ERR(shape.size() == size(), "error");
-    Tensor res(*this);
-    res.shape_ = shape;
-    res.stride_ = shape.stride_rowmajor();
-    return res;
-  }
-  /// Returns a copy of this tensor handle re-labelled with `shape` and its
-  /// row-major strides. The element count must be unchanged.
-  Tensor view(TensorShape shape) const {
-    TV_ASSERT_RT_ERR(shape.size() == size(), "error");
-    Tensor reshaped(*this);
-    reshaped.stride_ = shape.stride_rowmajor();
-    reshaped.shape_ = shape;
-    return reshaped;
-  }
-  /// Selects sub-tensor `index` along the first axis (negative indices count
-  /// from the end). Requires ndim() > 1.
-  ///
-  /// The result shares storage with this tensor and keeps its dtype and
-  /// writability. offset_ is a byte offset (see raw_data()), while stride_
-  /// counts elements, so the step is scaled by itemsize().
-  Tensor operator[](int64_t index) {
-    TV_ASSERT_INVALID_ARG(ndim() > 1, "error");
-    if (index < 0) {
-      index += dim(0);
-    }
-    TV_ASSERT_INVALID_ARG(index >= 0 && index < dim(0), "error");
-    // Copy first so dtype_, storage_ and writeable_ are all carried over.
-    Tensor res(*this);
-    res.shape_ = shape_.subshape(1);
-    res.stride_ = stride_.subshape(1);
-    res.offset_ = offset_ + size_t(index) * stride_[0] * itemsize();
-    return res;
-  }
-  /// View with the shape produced by shape_.squeeze().
-  Tensor squeeze() const {
-    const auto squeezed = shape_.squeeze();
-    return view(squeezed);
-  }
-  /// View with the shape produced by shape_.squeeze(axis); a negative
-  /// `axis` is offset by ndim() first.
-  Tensor squeeze(int axis) const {
-    const int resolved = axis < 0 ? int(ndim() + axis) : axis;
-    return view(shape_.squeeze(resolved));
-  }
-  /// View with the shape produced by shape_.unsqueeze(axis); a negative
-  /// `axis` is offset by ndim() first.
-  Tensor unsqueeze(int axis) const {
-    const int resolved = axis < 0 ? int(ndim() + axis) : axis;
-    return view(shape_.unsqueeze(resolved));
-  }
-  /// Forwards to the storage's pinned() flag.
-  bool pinned() const {
-    return storage_->pinned();
-  }
-  /// Returns the rows [start, end) along the first axis as a tensor sharing
-  /// this storage. Negative bounds count from the end of the axis.
-  ///
-  /// The byte offset of the slice is added to this tensor's existing
-  /// offset_, so slicing an already offset tensor stays inside the parent
-  /// region. Bounds outside [0, dim 0] fail an invalid-argument assertion.
-  Tensor slice_first_axis(int start, int end) const {
-    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
-    if (start < 0) {
-      start = shape_[0] + start;
-    }
-    if (end < 0) {
-      end = shape_[0] + end;
-    }
-    TV_ASSERT_INVALID_ARG(start >= 0, "start must not be negative");
-    TV_ASSERT_INVALID_ARG(start < shape_[0], "start must small than dim 0");
-    TV_ASSERT_INVALID_ARG(start < end, "start must small than end");
-    TV_ASSERT_INVALID_ARG(end <= shape_[0], "end must not exceed dim 0");
-    size_t new_offset = offset_ + size_t(start) * shape_.prod(1) * itemsize();
-    Tensor res(*this);
-    TensorShape newshape(shape_);
-    newshape[0] = end - start;
-    res.shape_ = newshape;
-    res.stride_ = stride_;
-    res.offset_ = new_offset;
-    return res;
-  }
-  /// Forwards to the storage's empty() flag.
-  bool empty() const {
-    return storage_->empty();
-  }
-  /// Element type tag of this tensor.
-  DType dtype() const {
-    return dtype_;
-  }
-  /// Device id reported by the storage (-1 is used for host elsewhere in
-  /// this class).
-  int device() const {
-    return storage_->device();
-  }
-  /// Number of axes.
-  size_t ndim() const {
-    return shape_.ndim();
-  }
-  /// Extents per axis.
-  const TensorShape &shape() const {
-    return shape_;
-  }
-  /// Alias of shape().
-  const TensorShape &sizes() const {
-    return shape_;
-  }
-  /// Strides per axis.
-  const TensorShape &stride() const {
-    return stride_;
-  }
-  /// Extent of axis `idx`; negative values index from the last axis.
-  /// Out-of-range indices fail a runtime assertion.
-  int dim(int idx) const {
-    if (idx >= 0) {
-      TV_ASSERT_RT_ERR(idx < int(shape_.ndim()), idx, shape_);
-      return shape_[idx];
-    }
-    // Unsigned wrap-around makes the sum >= ndim() when idx < -ndim().
-    TV_ASSERT_RT_ERR(shape_.ndim() + idx < shape_.ndim(), idx, shape_);
-    return shape_[shape_.ndim() + idx];
-  }
-  /// Read-only pointer to the first byte of this tensor (storage base plus
-  /// the byte offset).
-  const uint8_t *raw_data() const {
-    return storage_->data() + offset_;
-  }
-  /// Total size in bytes: element count times element size.
-  size_t raw_size() const {
-    return size() * itemsize();
-  }
-  /// Total number of elements.
-  size_t size() const {
-    return shape_.size();
-  }
-  /// Extent of axis `idx`, via dim().
-  size_t size(int64_t idx) const {
-    return dim(idx);
-  }
-  /// Size in bytes of one element of the stored dtype.
-  size_t itemsize() const {
-    return detail::sizeof_dtype(dtype_);
-  }
-  /// Calls zero_() on the underlying storage object and returns *this.
-  /// Requires a writable tensor.
-  Tensor &zero_() {
-    writable_check();
-    auto &backing = *storage_;
-    backing.zero_();
-    return *this;
-  }
-  /// Mutable pointer to the first byte of this tensor (storage base plus
-  /// the byte offset). Requires a writable tensor.
-  uint8_t *raw_data() {
-    writable_check();
-    uint8_t *base = storage_->data();
-    return base + offset_;
-  }
-  /// Sets every element to `value`, converted to the stored dtype.
-  /// Host tensors only (device() == -1); requires a writable tensor.
-  /// Throws an invalid-argument error when T is not convertible to the
-  /// stored element type.
-  template <typename T> Tensor &fill_(T value) {
-    writable_check();
-    TV_ASSERT_RT_ERR(device() == -1, "error");
-    auto fill_as = [&](auto I) {
-      using Treal = decltype(I);
-      if (!std::is_convertible<T, Treal>::value) {
-        TV_THROW_INVALID_ARG("not convertable from", type_s<T>, "to",
-                             type_s<Treal>);
-      } else {
-        auto first = reinterpret_cast<Treal *>(raw_data());
-        auto last = first + size();
-        std::fill(first, last, Treal(value));
-      }
-    };
-    Dispatch<detail::all_tensor_types_t>()(dtype_, fill_as);
-    return *this;
-  }
-  /// Typed mutable pointer to the first element. T must match the stored
-  /// dtype and the tensor must be writable.
-  template <typename T> T *data() {
-    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-    writable_check();
-    uint8_t *bytes = raw_data();
-    return reinterpret_cast<T *>(bytes);
-  }
-  /// Typed read-only pointer to the first element. T must match the stored
-  /// dtype.
-  template <typename T> const T *data() const {
-    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-    const uint8_t *bytes = raw_data();
-    return reinterpret_cast<const T *>(bytes);
-  }
-  /// Same as data<T>().
-  template <typename T> T *data_ptr() {
-    return data<T>();
-  }
-  /// Same as data<T>() const.
-  template <typename T> const T *data_ptr() const {
-    return data<T>();
-  }
-  /// Untyped mutable pointer to the first byte; requires a writable tensor.
-  void *data_ptr() {
-    uint8_t *bytes = raw_data();
-    return reinterpret_cast<void *>(bytes);
-  }
-  /// Untyped read-only pointer to the first byte.
-  const void *data_ptr() const {
-    const uint8_t *bytes = raw_data();
-    return reinterpret_cast<const void *>(bytes);
-  }
-  /// Copies the bytes of `tensor` into this tensor.
-  ///
-  /// Both tensors must be non-empty and agree in element count and dtype;
-  /// this tensor must be writable and contiguous. The copy routine is chosen
-  /// from the host/device combination of the two tensors. The destination
-  /// pointer is raw_data(), i.e. it honours offset_, so copying into a
-  /// sliced or indexed tensor writes to that tensor's own region rather
-  /// than the start of the shared storage.
-  void copy_(const Tensor &tensor) {
-    writable_check();
-    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
-    TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
-    TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
-    TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
-                     detail::typeString(dtype()),
-                     detail::typeString(tensor.dtype()));
-    const auto nbytes = size() * detail::sizeof_dtype(dtype_);
-    uint8_t *dst = raw_data();
-    const uint8_t *src = tensor.raw_data();
-    if (device() == -1 && tensor.device() == -1) {
-#ifdef TV_CUDA
-      host2host(dst, src, nbytes);
-#else
-      std::copy(src, src + nbytes, dst);
-#endif
-    }
-#ifdef TV_CUDA
-    else if (device() >= 0 && tensor.device() == -1) {
-      host2dev(dst, src, nbytes);
-    } else if (device() == -1 && tensor.device() >= 0) {
-      dev2host(dst, src, nbytes);
-    } else if (device() >= 0 && tensor.device() >= 0) {
-      dev2dev(dst, src, nbytes);
-    }
-#endif
-    else {
-      TV_THROW_RT_ERR("only support cpu tensor");
-    }
-  }
-#ifdef TV_CUDA
-  /// Stream variant of copy_: copies the bytes of `tensor` into this tensor,
-  /// passing `stream` to the selected copy routine.
-  ///
-  /// Same preconditions as copy_(const Tensor &). The destination pointer is
-  /// raw_data(), i.e. it honours offset_, so a sliced or indexed destination
-  /// is written at its own region of the shared storage.
-  void copy_(const Tensor &tensor, cudaStream_t stream) {
-    writable_check();
-    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
-    TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
-    TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
-    TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype",
-                     detail::typeString(dtype()),
-                     detail::typeString(tensor.dtype()));
-    const auto nbytes = size() * detail::sizeof_dtype(dtype_);
-    uint8_t *dst = raw_data();
-    const uint8_t *src = tensor.raw_data();
-    if (device() == -1 && tensor.device() == -1) {
-      host2host(dst, src, nbytes, stream);
-    } else if (device() >= 0 && tensor.device() == -1) {
-      host2dev(dst, src, nbytes, stream);
-    } else if (device() == -1 && tensor.device() >= 0) {
-      dev2host(dst, src, nbytes, stream);
-    } else if (device() >= 0 && tensor.device() >= 0) {
-      dev2dev(dst, src, nbytes, stream);
-    } else {
-      TV_THROW_RT_ERR("only support cpu tensor");
-    }
-  }
-#endif
-  Tensor cpu() const { // returns a copy of this tensor with device == -1
-    if (storage_->device() == -1) {
-      // cpu() should always copy tensor.
-      return clone();
-    }
-    Tensor res(shape_, stride_, dtype_, -1, storage_->managed()); // NOTE(review): clone()/astype() pass (shape, stride, dtype, device, pinned, managed); here managed() is the 5th argument - confirm it is not landing in the pinned slot
-    res.copy_(*this); // copy_ picks the copy routine from the two device ids
-    return res;
-  }
-  /// Copies from a TensorView by wrapping it with from_blob(tensor, device)
-  /// and delegating to copy_(const Tensor &). Requires a writable,
-  /// contiguous destination.
-  template <typename T> void copy_(const TensorView<T> &tensor, int device) {
-    writable_check();
-    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
-    Tensor wrapped = from_blob(tensor, device);
-    copy_(wrapped);
-  }
-  /// Copy assignment: shares the storage of `tensor` (no data copy) and
-  /// takes over all of its metadata, including the contiguity flag.
-  Tensor &operator=(const Tensor &tensor) {
-    dtype_ = tensor.dtype_;
-    storage_ = tensor.storage_;
-    shape_ = tensor.shape_;
-    writeable_ = tensor.writeable_;
-    offset_ = tensor.offset_;
-    stride_ = tensor.stride_;
-    contiguous_ = tensor.contiguous_;
-    return *this;
-  }
-  /// Copy constructor: shares the storage of `tensor` (no data copy) and
-  /// takes over all of its metadata, including the contiguity flag.
-  Tensor(const Tensor &tensor) {
-    dtype_ = tensor.dtype_;
-    storage_ = tensor.storage_;
-    shape_ = tensor.shape_;
-    writeable_ = tensor.writeable_;
-    offset_ = tensor.offset_;
-    stride_ = tensor.stride_;
-    contiguous_ = tensor.contiguous_;
-  }
-  /// Allocates a new tensor with the same shape, stride, dtype, device and
-  /// managed flag (and the requested `pinned` flag) and copies this
-  /// tensor's data into it. Requires a non-empty, contiguous tensor.
-  Tensor clone(bool pinned = false) const {
-    TV_ASSERT_RT_ERR(!empty(), "clone a empty tensor");
-    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
-    const bool managed = storage_->managed();
-    Tensor duplicate(shape_, stride_, dtype_, device(), pinned, managed);
-    duplicate.copy_(*this);
-    return duplicate;
-  }
-  /// Returns a new tensor with elements converted to `dtype`.
-  ///
-  /// When `dtype` equals the current dtype this is clone(). Otherwise the
-  /// tensor must be a non-empty, contiguous host tensor; a double dispatch
-  /// over (destination, source) element types performs the element-wise
-  /// conversion. Throws an invalid-argument error for non-convertible pairs.
-  Tensor astype(DType dtype) {
-    if (dtype == dtype_) {
-      return clone();
-    }
-    TV_ASSERT_INVALID_ARG(device() == -1, "only support cpu tensor");
-    TV_ASSERT_INVALID_ARG(!empty(), "can't be used in empty tensor");
-    TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
-    Tensor converted = Tensor();
-    Dispatch<detail::all_tensor_types_t>()(dtype, [&](auto dst_tag) {
-      using Tdst = decltype(dst_tag);
-      Dispatch<detail::all_tensor_types_t>()(this->dtype_, [&](auto src_tag) {
-        using Tcur = decltype(src_tag);
-        if (!std::is_convertible<Tcur, Tdst>::value) {
-          TV_THROW_INVALID_ARG("not convertable from", type_s<Tcur>, "to",
-                               type_s<Tdst>);
-        } else {
-          auto src = this->data<Tcur>();
-          converted = Tensor(this->shape_, this->stride_, dtype,
-                             this->device(), this->pinned(),
-                             this->storage_->managed());
-          std::copy(src, src + this->size(), converted.data<Tdst>());
-        }
-      });
-    });
-    return converted;
-  }
-  /// Forwards `f` to tv::dispatch<Ts...> keyed on this tensor's dtype.
-  template <class... Ts, typename F> inline void dispatch(F &&f) {
-    tv::dispatch<Ts...>(dtype_, std::forward<F>(f));
-  }
-protected:
-  /// Fails a runtime assertion when the tensor was created read-only.
-  inline void writable_check() {
-    const bool can_write = writeable_;
-    TV_ASSERT_RT_ERR(can_write,
-                     "you cant do non-const operation when not writable");
-  }
-  DType dtype_;
-  std::shared_ptr<detail::TensorStorage<uint8_t>> storage_;
-  TensorShape shape_;
-  size_t offset_ = 0;
-  TensorShape stride_;
-private:
-  bool writeable_ = true;
-  bool contiguous_ = true;
-};
-/// Streams a textual representation of a host tensor to `os`.
-/// Floating-point tensors are formatted with a precision of 4.
-/// Fails an invalid-argument assertion for device tensors.
-template <typename Os> Os &operator<<(Os &os, const Tensor &tensor) {
-  TV_ASSERT_INVALID_ARG(tensor.device() == -1, "must be cpu tensor");
-  auto print_as = [&](auto I) {
-    using T = decltype(I);
-    constexpr bool is_floating =
-        std::is_same<T, float>::value || std::is_same<T, double>::value;
-    std::stringstream ss;
-    if (is_floating) {
-      ss << std::setprecision(4);
-    }
-    os << tensor.tview<T, -1, DefaultPtrTraits, int64_t>().repr(ss);
-  };
-  Dispatch<detail::all_tensor_types_t>()(tensor.dtype(), print_as);
-  return os;
-}
-/// Wraps the mutable buffer `ptr` via the matching Tensor constructor.
-inline Tensor from_blob(void *ptr, TensorShape shape, DType dtype, int device) {
-  return Tensor{ptr, shape, dtype, device};
-}
-/// Wraps the const buffer `ptr` via the matching (read-only) Tensor
-/// constructor.
-inline Tensor from_blob(const void *ptr, TensorShape shape, DType dtype,
-                        int device) {
-  return Tensor{ptr, shape, dtype, device};
-}
-} // namespace tv
\ No newline at end of file
--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include "common.h"
-#include "prettyprint.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdlib>
-#include <iostream>
-#include <iterator>
-#include <memory>
-#include <sstream>
-#include <type_traits>
-#include <vector>
-#ifdef TV_CUDA
-#include <cuda_runtime_api.h>
-#endif
-namespace tv {
-// Function-qualifier macros. Three configurations are distinguished:
-// clang-CUDA / nvcc, NVRTC (__CUDACC_RTC__), and everything else.
-// TV_DEVICE_INLINE is defined in every branch so that headers using it
-// still compile in the last (host-only) configuration.
-#if (defined(__clang__) && defined(__CUDA__)) || defined(__NVCC__)
-#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
-#define TV_DEVICE_INLINE __forceinline__ __device__
-#define TV_HOST_DEVICE __device__ __host__
-#define TV_ASSERT(expr) assert(expr)
-#elif defined(__CUDACC_RTC__)
-#define TV_ASSERT(expr) assert(expr)
-#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
-#define TV_DEVICE_INLINE __forceinline__ __device__
-#define TV_HOST_DEVICE __device__ __host__
-#else
-#define TV_ASSERT(x) assert(x)
-#define TV_HOST_DEVICE_INLINE inline
-#define TV_DEVICE_INLINE inline
-#define TV_HOST_DEVICE
-#endif
-// TV_REQUIRE: when `expr` is false, printf the given format/arguments and
-// then assert(expr).
-#define TV_REQUIRE(expr, ...)                                                  \
-  {                                                                            \
-    if (!(expr)) {                                                             \
-      printf(__VA_ARGS__);                                                     \
-      assert(expr);                                                            \
-    }                                                                          \
-  }
-// TV_CHECK_CUDA_ERR: reads cudaGetLastError() and throws std::runtime_error
-// carrying file, line, numeric code and the cudaGetErrorString text (same
-// information as the V2 variant, minus the user message).
-#define TV_CHECK_CUDA_ERR()                                                    \
-  {                                                                            \
-    auto __macro_err = cudaGetLastError();                                     \
-    if (__macro_err != cudaSuccess) {                                          \
-      std::stringstream __macro_s;                                             \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << "cuda execution failed with error " << __macro_err;         \
-      __macro_s << " " << cudaGetErrorString(__macro_err) << "\n";             \
-      TV_BACKTRACE_PRINT(__macro_s);                                           \
-      throw std::runtime_error(__macro_s.str());                               \
-    }                                                                          \
-  }
-// TV_CHECK_CUDA_ERR_V2: like TV_CHECK_CUDA_ERR, additionally appending the
-// caller-supplied arguments to the message via tv::sstream_print.
-#define TV_CHECK_CUDA_ERR_V2(...)                                              \
-  {                                                                            \
-    auto __macro_err = cudaGetLastError();                                     \
-    if (__macro_err != cudaSuccess) {                                          \
-      std::stringstream __macro_s;                                             \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << "cuda execution failed with error " << __macro_err;         \
-      __macro_s << " " << cudaGetErrorString(__macro_err) << "\n";             \
-      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
-      TV_BACKTRACE_PRINT(__macro_s);                                           \
-      throw std::runtime_error(__macro_s.str());                               \
-    }                                                                          \
-  }
-#ifdef TV_CUDA
-/// Tag/context type carrying the CUDA stream to use.
-/// getStream() is virtual, so the destructor is virtual too: deleting a
-/// derived object through a GPU pointer is then well-defined.
-struct GPU {
-  GPU(cudaStream_t s = 0) : mStream(s) {}
-  virtual ~GPU() = default;
-  virtual cudaStream_t getStream() const { return mStream; }
-  cudaStream_t mStream = 0;
-};
-#endif
-/// Empty tag type for the host side.
-struct CPU {};
-// Capacity of the fixed-size shape/vector types unless overridden.
-#ifndef TV_MAX_DIM
-#define TV_MAX_DIM 6
-#endif
-/// Pointer trait yielding a plain `T *`.
-template <typename T> struct DefaultPtrTraits {
-  using type = T *;
-};
-#if defined(__CUDACC__) || defined(__HIPCC__)
-/// Pointer trait yielding a `__restrict__`-qualified `T *`.
-template <typename T> struct RestrictPtrTraits {
-  using type = T *__restrict__;
-};
-#endif
-/*
-template <typename T>
-constexpr size_t calc_align(size_t ndim)
-{
-  if (ndim * sizeof(T) == 1)
-    return 1;
-  else if (ndim * sizeof(T) == 2)
-    return 2;
-  else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
-    return 4;
-  else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
-    return 8;
-  else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
-    return 16;
-  else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
-    return 32;
-  else
-    return 64;
-}
-*/
-namespace detail {
-/// SFINAE helper: names `void` only when `_InIter`'s iterator category is
-/// convertible to std::input_iterator_tag.
-template <typename _InIter>
-using _RequireInputIter = typename std::enable_if<
-    std::is_convertible<
-        typename std::iterator_traits<_InIter>::iterator_category,
-        std::input_iterator_tag>::value,
-    void>::type;
-} // namespace detail
-/// Fixed-capacity vector of at most MaxDim elements stored inline.
-///
-/// Element access and push/pop are only bounds-checked when TV_DEBUG is
-/// defined; the constructors check capacity with TV_ASSERT (or throw, for
-/// the iterator-range constructor).
-template <typename T, size_t MaxDim = TV_MAX_DIM>
-struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
-public:
-  /// Empty vector.
-  TV_HOST_DEVICE_INLINE SimpleVector(){};
-  /// `count` copies of `init`; `count` must not exceed MaxDim.
-  TV_HOST_DEVICE_INLINE SimpleVector(size_t count, T init = T())
-      : size_(count) {
-    TV_ASSERT(count <= MaxDim);
-    for (size_t i = 0; i < count; ++i) {
-      array_[i] = init;
-    }
-  };
-  /// Copies [first, last); throws an invalid-argument error when the range
-  /// holds more than MaxDim elements.
-  template <typename Iterator, typename = detail::_RequireInputIter<Iterator>>
-  SimpleVector(Iterator first, Iterator last) {
-    size_ = 0;
-    for (; first != last; ++first) {
-      if (size_ >= MaxDim) {
-        TV_THROW_INVALID_ARG("iterator too long");
-      }
-      array_[size_++] = *first;
-    }
-  };
-  /// Copies the listed elements; the list must not exceed MaxDim.
-  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
-    TV_ASSERT(q.size() <= MaxDim);
-    size_ = 0;
-    for (T s : q) {
-      array_[size_++] = s;
-    }
-  }
-  /// Copies a std::vector; it must not exceed MaxDim elements.
-  SimpleVector(const std::vector<T> &arr) {
-    TV_ASSERT(arr.size() <= MaxDim);
-    for (size_t i = 0; i < arr.size(); ++i) {
-      array_[i] = arr[i];
-    }
-    size_ = arr.size();
-  }
-  /// Copy constructor: copies only the first size() elements.
-  TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
-    TV_ASSERT(arr.size() <= MaxDim);
-    for (size_t i = 0; i < arr.size(); ++i) {
-      array_[i] = arr[i];
-    }
-    size_ = arr.size();
-  }
-  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
-#ifdef TV_DEBUG
-    TV_ASSERT(idx >= 0 && idx < size_);
-#endif
-    return array_[idx];
-  }
-  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
-#ifdef TV_DEBUG
-    TV_ASSERT(idx >= 0 && idx < size_);
-#endif
-    return array_[idx];
-  }
-  /// Appends `s`; capacity is checked only under TV_DEBUG.
-  TV_HOST_DEVICE_INLINE void push_back(T s) {
-#ifdef TV_DEBUG
-    TV_ASSERT(size_ < MaxDim);
-#endif
-    array_[size_] = s;
-    size_++;
-  }
-  /// Drops the last element; emptiness is checked only under TV_DEBUG.
-  TV_HOST_DEVICE_INLINE void pop_back() {
-#ifdef TV_DEBUG
-    TV_ASSERT(size_ > 0);
-#endif
-    size_--;
-  }
-  TV_HOST_DEVICE_INLINE size_t size() const { return size_; }
-  TV_HOST_DEVICE_INLINE const T *data() const { return array_; }
-  TV_HOST_DEVICE_INLINE T *data() { return array_; }
-  TV_HOST_DEVICE_INLINE size_t empty() const { return size_ == 0; }
-  typedef size_t size_type;
-  /// Forward iterator over mutable elements (thin pointer wrapper).
-  class iterator {
-  public:
-    typedef iterator self_type;
-    typedef T value_type;
-    typedef T &reference;
-    typedef T *pointer;
-    typedef std::forward_iterator_tag iterator_category;
-    typedef std::ptrdiff_t difference_type;
-    TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
-    // Post-increment: returns the value before advancing.
-    TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
-      self_type i = *this;
-      ptr_++;
-      return i;
-    }
-    // Pre-increment: returns a reference to the advanced iterator.
-    TV_HOST_DEVICE_INLINE self_type &operator++() {
-      ptr_++;
-      return *this;
-    }
-    TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
-    TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
-    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
-      return ptr_ == rhs.ptr_;
-    }
-    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
-      return ptr_ != rhs.ptr_;
-    }
-  private:
-    pointer ptr_;
-  };
-  /// Forward iterator over const elements (thin pointer wrapper).
-  class const_iterator {
-  public:
-    typedef const_iterator self_type;
-    typedef T value_type;
-    typedef const T &reference;
-    typedef const T *pointer;
-    typedef std::ptrdiff_t difference_type;
-    typedef std::forward_iterator_tag iterator_category;
-    TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
-    // Post-increment: returns the value before advancing.
-    TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
-      self_type i = *this;
-      ptr_++;
-      return i;
-    }
-    // Pre-increment: returns a reference to the advanced iterator.
-    TV_HOST_DEVICE_INLINE self_type &operator++() {
-      ptr_++;
-      return *this;
-    }
-    TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
-    TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
-    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
-      return ptr_ == rhs.ptr_;
-    }
-    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
-      return ptr_ != rhs.ptr_;
-    }
-  private:
-    pointer ptr_;
-  };
-  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(array_); }
-  TV_HOST_DEVICE_INLINE iterator end() { return iterator(array_ + size_); }
-  TV_HOST_DEVICE_INLINE const_iterator begin() const {
-    return const_iterator(array_);
-  }
-  TV_HOST_DEVICE_INLINE const_iterator end() const {
-    return const_iterator(array_ + size_);
-  }
-  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
-    return const_iterator(array_);
-  }
-  TV_HOST_DEVICE_INLINE const_iterator cend() const {
-    return const_iterator(array_ + size_);
-  }
-protected:
-  T array_[MaxDim];
-  size_t size_ = 0;
-};
-/// Two SimpleVectors are equal when they have the same length and equal
-/// elements at every position.
-template <typename T, size_t MaxDim>
-bool operator==(const SimpleVector<T, MaxDim> &lfs,
-                const SimpleVector<T, MaxDim> &rfs) {
-  const size_t count = lfs.size();
-  if (count != rfs.size()) {
-    return false;
-  }
-  size_t pos = 0;
-  while (pos < count) {
-    if (lfs[pos] != rfs[pos]) {
-      return false;
-    }
-    ++pos;
-  }
-  return true;
-}
-/// Negation of operator==.
-template <typename T, size_t MaxDim>
-bool operator!=(const SimpleVector<T, MaxDim> &lfs,
-                const SimpleVector<T, MaxDim> &rfs) {
-  const bool same = (lfs == rfs);
-  return !same;
-}
-/// Holds up to three ints; entries that are not supplied are -1.
-struct Slice {
-  /// Variadic form: Slice(a), Slice(a, b) or Slice(a, b, c).
-  template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
-    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
-    SimpleVector<int, 3> given{int(ints)...};
-    for (int k = 0; k < 3; ++k) {
-      slices_[k] = -1;
-    }
-    for (size_t k = 0; k < given.size(); ++k) {
-      slices_[k] = given[k];
-    }
-  }
-  /// All three entries are -1.
-  TV_HOST_DEVICE_INLINE Slice() {
-    for (int k = 0; k < 3; ++k) {
-      slices_[k] = -1;
-    }
-  }
-  /// Initializer-list form; the list must hold at most three values.
-  template <typename T>
-  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
-    for (int k = 0; k < 3; ++k) {
-      slices_[k] = -1;
-    }
-    TV_ASSERT(slice.size() <= 3);
-    int *dst = slices_;
-    for (T s : slice) {
-      *dst = int(s);
-      ++dst;
-    }
-  }
-  TV_HOST_DEVICE_INLINE int &operator[](int idx) {
-#ifdef TV_DEBUG
-    TV_ASSERT(idx >= 0 && idx < 3);
-#endif
-    return slices_[idx];
-  }
-  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
-#ifdef TV_DEBUG
-    TV_ASSERT(idx >= 0 && idx < 3);
-#endif
-    return slices_[idx];
-  }
-protected:
-  int slices_[3];
-};
-/// Fixed-capacity list of extents (or strides) with shape helpers.
-template <size_t MaxDim = TV_MAX_DIM, typename Tindex = int>
-struct ShapeBase : public SimpleVector<Tindex, MaxDim> {
-  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<Tindex, MaxDim>(){};
-  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<Tindex> shape)
-      : SimpleVector<Tindex, MaxDim>(shape) {}
-  TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<Tindex, MaxDim> vec)
-      : SimpleVector<Tindex, MaxDim>(vec) {}
-  template <typename T, template <class...> class Container>
-  ShapeBase(Container<T> shape) : SimpleVector<Tindex, MaxDim>(shape) {}
-  // Copy constructor. The parameter names Tindex explicitly so that it is a
-  // real copy constructor for every index type, not only for int.
-  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim, Tindex> &shape)
-      : SimpleVector<Tindex, MaxDim>(shape) {}
-  ShapeBase(const std::vector<Tindex> &arr)
-      : SimpleVector<Tindex, MaxDim>(arr) {}
-  ShapeBase<MaxDim, Tindex> &
-  operator=(const ShapeBase<MaxDim, Tindex> &shape) = default;
-  /// Entries [start, end) as a new shape.
-  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start,
-                                                    Tindex end) const {
-#ifdef TV_DEBUG
-    TV_ASSERT(start >= 0 && end <= this->size_ && end > start);
-#endif
-    ShapeBase<MaxDim, Tindex> shape;
-    for (Tindex i = start; i < end; ++i) {
-      shape.push_back(this->array_[i]);
-    }
-    return shape;
-  }
-  /// Entries [start, ndim()) as a new shape.
-  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> subshape(Tindex start) const {
-#ifdef TV_DEBUG
-    TV_ASSERT(start >= 0 && start <= this->size_);
-#endif
-    ShapeBase<MaxDim, Tindex> shape;
-    for (size_t i = start; i < this->size_; ++i) {
-      shape.push_back(this->array_[i]);
-    }
-    return shape;
-  }
-  /// Product of all entries; 0 for a shape with no entries. Note that this
-  /// hides SimpleVector::size() - use ndim() for the entry count.
-  TV_HOST_DEVICE size_t size() const {
-    if (this->size_ == 0)
-      return 0;
-    size_t s = 1;
-    for (int i = 0; i < int(this->size_); ++i) {
-      s *= this->array_[i];
-    }
-    return s;
-  }
-  /// Number of entries.
-  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->size_; }
-  /// Drops every entry equal to 1; yields {1} if nothing would remain.
-  TV_HOST_DEVICE ShapeBase<MaxDim, Tindex> squeeze() const {
-    ShapeBase<MaxDim, Tindex> shape;
-    for (size_t i = 0; i < this->size_; ++i) {
-      if (this->array_[i] != 1)
-        shape.push_back(this->array_[i]);
-    }
-    if (shape.empty()) {
-      // dont support empty shape for now
-      shape.push_back(1);
-    }
-    return shape;
-  }
-  /// Drops entry `dim` if it equals 1; otherwise returns the same entries.
-  template <size_t MaxDim2 = MaxDim>
-  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> squeeze(int dim) const {
-    static_assert(MaxDim2 >= MaxDim - 1, "error");
-    ShapeBase<MaxDim2, Tindex> shape;
-    for (size_t i = 0; i < this->size_; ++i) {
-      if (i != size_t(dim) || this->array_[i] != 1)
-        shape.push_back(this->array_[i]);
-    }
-    return shape;
-  }
-  /// Inserts an entry of 1 before position `dim`; `dim == ndim()` appends
-  /// the new entry at the end.
-  template <size_t MaxDim2 = MaxDim>
-  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> unsqueeze(int dim) const {
-    static_assert(MaxDim2 >= MaxDim - 1, "error");
-    ShapeBase<MaxDim2, Tindex> shape;
-    for (size_t i = 0; i < this->size_; ++i) {
-      if (i == size_t(dim))
-        shape.push_back(1);
-      shape.push_back(this->array_[i]);
-    }
-    // The loop never reaches i == ndim(), so handle the append case here.
-    if (size_t(dim) == this->size_)
-      shape.push_back(1);
-    return shape;
-  }
-  /// Product of entries [start, ndim()); 1 when that range is empty.
-  TV_HOST_DEVICE size_t prod(Tindex start = 0) const {
-    size_t res = 1;
-    for (size_t i = start; i < this->size_; ++i) {
-      res *= this->array_[i];
-    }
-    return res;
-  }
-  /// Row-major strides for this shape: the last stride is 1 and each
-  /// earlier stride is the product of all later extents.
-  template <size_t MaxDim2 = MaxDim>
-  TV_HOST_DEVICE ShapeBase<MaxDim2, Tindex> stride_rowmajor() const {
-    static_assert(MaxDim2 >= MaxDim, "error");
-    Tindex p = Tindex(1);
-    ShapeBase<MaxDim2, Tindex> res(this->size_);
-    for (Tindex i = this->size_ - 1; i >= 0; --i) {
-      res[i] = p;
-      p *= this->array_[i];
-    }
-    return res;
-  }
-};
-using Shape = ShapeBase<TV_MAX_DIM, int>;
-/// Flattens the given per-axis indexes into a row-major linear offset for
-/// `shape` (last index varies fastest).
-template <class... Inds>
-TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
-                                           Inds... indexes) {
-  int indexes_vec[sizeof...(indexes)] = {indexes...};
-#ifdef TV_DEBUG
-  TV_ASSERT(sizeof...(indexes) == shape.size());
-#endif
-  unsigned flat = 0;
-  unsigned scale = 1;
-#if defined(__CUDA_ARCH__)
-#pragma unroll
-#endif
-  for (int axis = sizeof...(indexes) - 1; axis >= 0; --axis) {
-    flat += scale * indexes_vec[axis];
-    scale *= shape[axis];
-  }
-  return flat;
-}
-/// Flattens `indexes_vec` into a row-major linear offset for `shape`
-/// (last index varies fastest). Iterates over shape.size() axes.
-TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
-                                           std::vector<int> &indexes_vec) {
-  unsigned flat = 0;
-  unsigned scale = 1;
-  for (int axis = shape.size() - 1; axis >= 0; --axis) {
-    flat += scale * indexes_vec[axis];
-    scale *= shape[axis];
-  }
-  return flat;
-}
-/// Flattens the given per-axis indexes into a row-major linear offset for
-/// `shape` (last index varies fastest).
-template <class... Inds>
-TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
-                                           Inds... indexes) {
-  int indexes_vec[sizeof...(indexes)] = {indexes...};
-  unsigned flat = 0;
-  unsigned scale = 1;
-#if defined(__CUDA_ARCH__)
-#pragma unroll
-#endif
-  for (int axis = sizeof...(indexes) - 1; axis >= 0; --axis) {
-    flat += scale * indexes_vec[axis];
-    scale *= shape[axis];
-  }
-  return flat;
-}
-/// Flattens `indexes_vec` into a row-major linear offset for `shape`
-/// (last index varies fastest). Iterates over indexes_vec.ndim() axes.
-TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
-                                           const Shape &indexes_vec) {
-  unsigned flat = 0;
-  unsigned scale = 1;
-  for (int axis = indexes_vec.ndim() - 1; axis >= 0; --axis) {
-    flat += scale * indexes_vec[axis];
-    scale *= shape[axis];
-  }
-  return flat;
-}
-/// Flattens NDim indexes into a row-major linear offset for `shape`
-/// (last index varies fastest). Both arrays must hold NDim entries.
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
-                                           const Index *shape) {
-  unsigned flat = 0;
-  unsigned scale = 1;
-#if defined(__CUDA_ARCH__)
-#pragma unroll
-#endif
-  for (int axis = NDim - 1; axis >= 0; --axis) {
-    flat += scale * indexes[axis];
-    scale *= shape[axis];
-  }
-  return flat;
-}
-/// Inverse of the row-major flattening: splits the linear `index` into
-/// NDim per-axis indexes written to `output`, last axis first.
-/// Returns what is left of `index` after all NDim axes were divided out.
-/// The unroll pragma is guarded like in the sibling rowArrayIdx functions
-/// so host compilers do not see an unknown pragma.
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
-                                           const Index *shape) {
-#if defined(__CUDA_ARCH__)
-#pragma unroll
-#endif
-  for (int i = NDim - 1; i >= 0; --i) {
-    output[i] = index % shape[i];
-    index -= output[i];
-    index /= shape[i];
-  }
-  return index;
-}
-/// Inverse of the row-major flattening for a runtime number of axes:
-/// splits the linear `index` into `ndim` per-axis indexes written to
-/// `output`, last axis first. Returns what is left of `index` afterwards.
-template <typename Index>
-TV_HOST_DEVICE Index rowArrayIdxInv(Index index, Index *output,
-                                    const Index *shape, int ndim) {
-  int axis = ndim - 1;
-  while (axis >= 0) {
-    output[axis] = index % shape[axis];
-    index -= output[axis];
-    index /= shape[axis];
-    --axis;
-  }
-  return index;
-}
-/// Recursive linear-offset helper taking the indexes in reverse order:
-/// result = index + shape[N - 1] * (offset of the remaining indexes).
-template <int N> struct ArrayIndexRowMajorReverse {
-  /// Pointer-shape variant.
-  template <typename TShape, typename T, class... Ts>
-  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T index,
-                                            Ts... inds) {
-    return index +
-           shape[N - 1] * ArrayIndexRowMajorReverse<N - 1>::run(shape, inds...);
-  }
-  /// Shape-object variant. Recurses into runShape (not run): run expects a
-  /// pointer and cannot be called with a Shape reference.
-  template <typename T, class... Ts>
-  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T index,
-                                                 Ts... inds) {
-    return index +
-           shape[N - 1] *
-               ArrayIndexRowMajorReverse<N - 1>::runShape(shape, inds...);
-  }
-};
-/// Recursion terminator: with one index left, the offset is that index.
-template <> struct ArrayIndexRowMajorReverse<1> {
-  template <typename TShape, typename T>
-  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, T idx) {
-    const unsigned last = idx;
-    return last;
-  }
-  template <typename T>
-  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape, T idx) {
-    const unsigned last = idx;
-    return last;
-  }
-};
-/// Recursive row-major offset helper. Each step adds the current index to
-/// the running value `start`, multiplies by the next extent
-/// shape[Ndim - N + 1], and recurses with N - 1.
-template <int N, int Ndim> struct ArrayIndexRowMajor {
-  // this array index provide almost same compiled code. compile it in
-  // https://godbolt.org/ for more details.
-  /// Pointer-shape variant with the indexes passed as arguments.
-  template <typename TShape, typename Tinit, typename T, class... Ts>
-  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
-                                            T index, Ts... inds) {
-    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
-    return Next::run(shape, (index + start) * shape[Ndim - N + 1], inds...);
-  }
-  /// Shape-object variant with the indexes passed as arguments.
-  template <typename Tinit, typename T, class... Ts>
-  TV_HOST_DEVICE_INLINE static unsigned
-  runShape(const Shape &shape, Tinit start, T index, Ts... inds) {
-    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
-    return Next::runShape(shape, (index + start) * shape[Ndim - N + 1],
-                          inds...);
-  }
-  /// Variant reading the indexes from the array `indexes`.
-  template <typename TShape, typename Tinit>
-  TV_HOST_DEVICE_INLINE static unsigned
-  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
-    using Next = ArrayIndexRowMajor<N - 1, Ndim>;
-    return Next::runPtrs(indexes, shape,
-                         (indexes[Ndim - N] + start) * shape[Ndim - N + 1]);
-  }
-};
-// Last recursion level: the final index is added to the accumulated value
-// without any further scaling.
-template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
-  template <typename TShape, typename Tinit, typename T>
-  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start,
-                                            T idx) {
-    const unsigned offset = start + idx;
-    return offset;
-  }
-  template <typename Tinit, typename T>
-  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
-                                                 Tinit start, T idx) {
-    const unsigned offset = start + idx;
-    return offset;
-  }
-  template <typename TShape, typename Tinit>
-  TV_HOST_DEVICE_INLINE static unsigned
-  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
-    const unsigned offset = start + indexes[Ndim - 1];
-    return offset;
-  }
-};
-// Zero-dimensional case: no indices are supplied, so every entry point
-// yields offset 0 and ignores its arguments.
-template <> struct ArrayIndexRowMajor<0, 0> {
-  template <typename TShape, typename Tinit>
-  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *shape, Tinit start) {
-    return 0u;
-  }
-  template <typename Tinit>
-  TV_HOST_DEVICE_INLINE static unsigned runShape(const Shape &shape,
-                                                 Tinit start) {
-    return 0u;
-  }
-  template <typename TShape, typename Tinit>
-  TV_HOST_DEVICE_INLINE static unsigned
-  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
-    return 0u;
-  }
-};
-// Strided flat index: accumulates index_k * stride[k] over all axes.
-// Level N handles axis (Ndim - N); the <1, Ndim> specialization handles the
-// last axis (Ndim - 1).
-template <int N, int Ndim> struct ArrayIndexStride {
-  // stride: pointer to at least Ndim stride values.
-  // start:  offset accumulated by the previous levels.
-  // index:  index for axis (Ndim - N); inds: indices for the later axes.
-  template <typename TShape, typename Tinit, typename T, class... Ts>
-  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
-                                            T index, Ts... inds) {
-    // The stride of this level's own axis is stride[Ndim - N]. Using
-    // stride[Ndim - N + 1] would scale every index by the stride of the
-    // following axis and never read stride[0].
-    return ArrayIndexStride<N - 1, Ndim>::run(
-        stride, start + index * stride[Ndim - N], inds...);
-  }
-};
-// Last recursion level: adds the final index scaled by the stride of the
-// last axis to the accumulated offset.
-template <int Ndim> struct ArrayIndexStride<1, Ndim> {
-  template <typename TShape, typename Tinit, typename T>
-  TV_HOST_DEVICE_INLINE static unsigned run(const TShape *stride, Tinit start,
-                                            T idx) {
-    const unsigned offset = start + idx * stride[Ndim - 1];
-    return offset;
-  }
-};
-#if __cplusplus >= 201703L
-// Sums stride[N] * (N-th argument of ids) over every N in the explicitly
-// specified index pack. Needs C++17 because it uses a fold expression.
-template <size_t... N, class T, class... Ts>
-TV_HOST_DEVICE_INLINE T array_index_stride(const T *stride, Ts... ids) {
-  auto packed = std::forward_as_tuple(ids...);
-  return ((stride[N] * std::get<N>(packed)) + ...);
-}
-#endif
-namespace detail {
-// Maps an element type to a short name string via a static `value` member.
-// Only the types specialized below are supported; any other type leaves the
-// primary template undefined.
-template <typename T> struct TypeToString;
-// A const-qualified type reports the same name as its unqualified type.
-template <typename T> struct TypeToString<const T> : TypeToString<T> {};
-template <> struct TypeToString<bool> {
-  static constexpr const char *value = "bool";
-};
-// Floating-point types.
-template <> struct TypeToString<float> {
-  static constexpr const char *value = "float";
-};
-template <> struct TypeToString<double> {
-  static constexpr const char *value = "double";
-};
-// Signed integers.
-template <> struct TypeToString<int8_t> {
-  static constexpr const char *value = "int8";
-};
-template <> struct TypeToString<int16_t> {
-  static constexpr const char *value = "int16";
-};
-template <> struct TypeToString<int32_t> {
-  static constexpr const char *value = "int32";
-};
-template <> struct TypeToString<int64_t> {
-  static constexpr const char *value = "int64";
-};
-// Unsigned integers.
-template <> struct TypeToString<uint8_t> {
-  static constexpr const char *value = "uint8";
-};
-template <> struct TypeToString<uint16_t> {
-  static constexpr const char *value = "uint16";
-};
-template <> struct TypeToString<uint32_t> {
-  static constexpr const char *value = "uint32";
-};
-template <> struct TypeToString<uint64_t> {
-  static constexpr const char *value = "uint64";
-};
-} // namespace detail
-// Shorthand for the name string that detail::TypeToString associates with T.
-template <class T>
-constexpr const char *type_s = detail::TypeToString<T>::value;
-namespace detail {
-// Common state of the tensor accessors: a data pointer plus a pointer to
-// Rank stride values. Neither pointer is owned.
-template <typename T, int Rank,
-          template <class> class PtrTraits = DefaultPtrTraits,
-          typename Tindex = int>
-struct TensorAccesserBase {
-  static constexpr int rank_value = Rank;
-  using ptr_t = typename PtrTraits<T>::type;
-  static_assert(Rank > 0, "error");
-  explicit TV_HOST_DEVICE_INLINE TensorAccesserBase(ptr_t ptr,
-                                                    const Tindex *stride_ptr)
-      : ptr_(ptr), stride_ptr_(stride_ptr) {}
-  TV_HOST_DEVICE_INLINE ptr_t data() { return ptr_; }
-  TV_HOST_DEVICE_INLINE const ptr_t data() const { return ptr_; }
-  // Element access with exactly Rank indices; the offset is computed by
-  // ArrayIndexStride from the stored strides.
-  template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
-    static_assert(sizeof...(inds) == Rank, "error");
-    const unsigned offset =
-        ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...);
-    return ptr_[offset];
-  }
-  template <class... Inds>
-  TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
-    static_assert(sizeof...(inds) == Rank, "error");
-    const unsigned offset =
-        ArrayIndexStride<Rank, Rank>::run(stride_ptr_, 0, inds...);
-    return ptr_[offset];
-  }
-protected:
-  ptr_t ptr_;
-  const Tindex *stride_ptr_;
-};
-} // namespace detail
-// Accessor for Rank > 1: operator[] fixes the leading axis and returns an
-// accessor of rank Rank - 1 over the remaining axes.
-template <typename T, int Rank,
-          template <class> class PtrTraits = DefaultPtrTraits,
-          typename Tindex = int>
-struct TensorAccesser
-    : public detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex> {
-  using ptr_t = typename PtrTraits<T>::type;
-  static_assert(Rank > 0, "error");
-  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
-                                                const Tindex *stride_ptr)
-      : detail::TensorAccesserBase<T, Rank, PtrTraits, Tindex>(ptr,
-                                                               stride_ptr) {}
-  // Advances the data pointer by i leading strides and drops that stride.
-  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
-  operator[](int i) {
-    using Sub = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
-    return Sub(this->ptr_ + this->stride_ptr_[0] * i, this->stride_ptr_ + 1);
-  }
-  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
-  operator[](int i) const {
-    using Sub = TensorAccesser<T, Rank - 1, PtrTraits, Tindex>;
-    return Sub(this->ptr_ + this->stride_ptr_[0] * i, this->stride_ptr_ + 1);
-  }
-};
-// Rank-1 accessor: operator[] yields a reference to the element located
-// i strides from the data pointer.
-template <typename T, template <class> class PtrTraits, typename Tindex>
-struct TensorAccesser<T, 1, PtrTraits, Tindex>
-    : public detail::TensorAccesserBase<T, 1, PtrTraits, Tindex> {
-  using ptr_t = typename PtrTraits<T>::type;
-  explicit TV_HOST_DEVICE_INLINE TensorAccesser(ptr_t ptr,
-                                                const Tindex *stride_ptr)
-      : detail::TensorAccesserBase<T, 1, PtrTraits, Tindex>(ptr, stride_ptr) {}
-  TV_HOST_DEVICE_INLINE T &operator[](int i) {
-    T &element = this->ptr_[this->stride_ptr_[0] * i];
-    return element;
-  }
-  TV_HOST_DEVICE_INLINE T &operator[](int i) const {
-    T &element = this->ptr_[this->stride_ptr_[0] * i];
-    return element;
-  }
-};
-// Non-owning view of a buffer described by a shape and a stride.
-// Rank == -1 gives a runtime-rank view whose shape type has capacity
-// TV_MAX_DIM; Rank > 0 fixes the number of axes at compile time.
-// operator() computes row-major offsets from the shape, operator[] indexes
-// the flat buffer, and accessor() returns TensorAccesser objects that use the
-// stored stride.
-template <typename T, int Rank = -1,
-          template <class> class PtrTraits = DefaultPtrTraits,
-          typename Tindex = int>
-struct TensorView {
-  static constexpr int rank_value = Rank;
-  using ptr_t = typename PtrTraits<T>::type;
-  using tv_shape_t = ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex>;
-  using no_cv_type = typename std::remove_cv<T>::type;
-  static_assert(Rank == -1 || Rank > 0, "error");
-  // Empty view: the data pointer stays null.
-  TV_HOST_DEVICE_INLINE TensorView() {}
-  // View whose stride is the row-major stride derived from `shape`.
-  explicit TV_HOST_DEVICE_INLINE TensorView(ptr_t ptr, tv_shape_t shape)
-      : ptr_(ptr), shape_(shape), stride_(shape.stride_rowmajor()) {}
-  // View with an explicitly supplied stride.
-  explicit TV_HOST_DEVICE_INLINE TensorView(ptr_t ptr, tv_shape_t shape,
-                                            tv_shape_t stride)
-      : ptr_(ptr), shape_(shape), stride_(stride) {}
-  // Conversion to a view of const elements. The stored stride is forwarded so
-  // that a view built with a custom stride keeps it; the operator is const
-  // and host/device so it is usable wherever the view itself is.
-  TV_HOST_DEVICE_INLINE
-  operator TensorView<const no_cv_type, Rank, PtrTraits, Tindex>() const {
-    return TensorView<const no_cv_type, Rank, PtrTraits, Tindex>(ptr_, shape_,
-                                                                 stride_);
-  }
-  // Generic element access for any number of indices (row-major offset).
-  // With TV_DEBUG defined, the index count and every index are range-checked.
-  template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
-    static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
-#if defined TV_DEBUG
-    int idxes[sizeof...(Inds)]{int(inds)...};
-    TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
-               "you provide %d indexes, but dim is %d\n", sizeof...(inds),
-               shape_.ndim());
-    for (int i = 0; i < sizeof...(inds); ++i) {
-      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
-                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
-                 shape_[i]);
-    }
-#endif
-    constexpr int Ndim = sizeof...(Inds);
-    return ptr_[ArrayIndexRowMajor<Ndim, Ndim>::runShape(shape_, 0, inds...)];
-  }
-  template <class... Inds>
-  TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
-    static_assert(Rank == -1 || sizeof...(inds) == Rank, "error");
-#if defined TV_DEBUG
-    int idxes[sizeof...(Inds)]{int(inds)...};
-    TV_REQUIRE(sizeof...(inds) == shape_.ndim(),
-               "you provide %d indexes, but dim is %d\n", sizeof...(inds),
-               shape_.ndim());
-    for (int i = 0; i < sizeof...(inds); ++i) {
-      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < shape_[i],
-                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
-                 shape_[i]);
-    }
-#endif
-    constexpr int Ndim = sizeof...(Inds);
-    return ptr_[ArrayIndexRowMajor<Ndim, Ndim>::runShape(shape_, 0, inds...)];
-  }
-  // Scalar access: returns the first element of the buffer.
-  TV_HOST_DEVICE_INLINE T &operator()() {
-    static_assert(Rank == -1 || 0 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
-               "\n");
-    TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
-               shape_.ndim());
-#endif
-    return ptr_[0];
-  }
-  TV_HOST_DEVICE_INLINE const T &operator()() const {
-    static_assert(Rank == -1 || 0 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(ptr_ != nullptr, "you want get value but the view is empty.%s",
-               "\n");
-    TV_REQUIRE(shape_.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
-               shape_.ndim());
-#endif
-    return ptr_[0];
-  }
-  // Hand-written overloads for one to four indices (mutable versions).
-  template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
-    static_assert(Rank == -1 || 1 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
-               shape_.ndim());
-    // i1 is cast to int so that the argument matches the %d conversion, as in
-    // every other overload.
-    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
-#endif
-    return ptr_[i1];
-  }
-  template <class T1, class T2>
-  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
-    static_assert(Rank == -1 || 2 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
-               shape_.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
-#endif
-    return ptr_[i1 * shape_[1] + i2];
-  }
-  template <class T1, class T2, class T3>
-  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
-    static_assert(Rank == -1 || 3 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
-               shape_.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
-    TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
-               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
-#endif
-    return ptr_[(i1 * shape_[1] + i2) * shape_[2] + i3];
-  }
-  template <class T1, class T2, class T3, class T4>
-  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
-    static_assert(Rank == -1 || 4 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
-               shape_.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
-    TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
-               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
-    TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
-               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
-#endif
-    return ptr_[((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4];
-  }
-  // Hand-written overloads for one to four indices (const versions).
-  template <class T1> TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
-    static_assert(Rank == -1 || 1 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(shape_.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
-               shape_.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
-#endif
-    return ptr_[i1];
-  }
-  template <class T1, class T2>
-  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
-    static_assert(Rank == -1 || 2 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(shape_.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
-               shape_.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
-#endif
-    return ptr_[i1 * shape_[1] + i2];
-  }
-  template <class T1, class T2, class T3>
-  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
-    static_assert(Rank == -1 || 3 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(shape_.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
-               shape_.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
-    TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
-               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
-#endif
-    return ptr_[(i1 * shape_[1] + i2) * shape_[2] + i3];
-  }
-  template <class T1, class T2, class T3, class T4>
-  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
-    static_assert(Rank == -1 || 4 == Rank, "error");
-#if defined TV_DEBUG
-    TV_REQUIRE(shape_.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
-               shape_.ndim());
-    TV_REQUIRE(i1 >= 0 && i1 < shape_[0],
-               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), shape_[0]);
-    TV_REQUIRE(i2 >= 0 && i2 < shape_[1],
-               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), shape_[1]);
-    TV_REQUIRE(i3 >= 0 && i3 < shape_[2],
-               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), shape_[2]);
-    TV_REQUIRE(i4 >= 0 && i4 < shape_[3],
-               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), shape_[3]);
-#endif
-    return ptr_[((i1 * shape_[1] + i2) * shape_[2] + i3) * shape_[3] + i4];
-  }
-  // Flat access into the underlying buffer, ignoring the shape.
-  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
-#ifdef TV_DEBUG
-    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
-               int(idx), size());
-#endif
-    return ptr_[idx];
-  }
-  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
-#ifdef TV_DEBUG
-    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
-               int(idx), size());
-#endif
-    return ptr_[idx];
-  }
-  // Accessor of rank Rank - 1 positioned at entry `idx` of the leading axis.
-  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank - 1, PtrTraits, Tindex>
-  accessor(Tindex idx) {
-    static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
-    return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(
-        ptr_ + stride_[0] * idx, stride_.data() + 1);
-  }
-  // Accessor of full rank over the whole view.
-  TV_HOST_DEVICE_INLINE TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() {
-    static_assert(Rank > 0, "rank must higher than zero");
-    return TensorAccesser<T, Rank, PtrTraits, Tindex>(ptr_, stride_.data());
-  }
-  TV_HOST_DEVICE_INLINE
-  TensorAccesser<T, Rank - 1, PtrTraits, Tindex> accessor(Tindex idx) const {
-    static_assert(Rank > 1, "for Rank == 1, use accessor() or just use []");
-    return TensorAccesser<T, Rank - 1, PtrTraits, Tindex>(
-        ptr_ + stride_[0] * idx, stride_.data() + 1);
-  }
-  TV_HOST_DEVICE_INLINE
-  TensorAccesser<T, Rank, PtrTraits, Tindex> accessor() const {
-    // The message belongs to the static_assert; TensorAccesser's constructor
-    // takes only the data pointer and the stride pointer.
-    static_assert(Rank > 0, "rank must higher than zero");
-    return TensorAccesser<T, Rank, PtrTraits, Tindex>(ptr_, stride_.data());
-  }
-  TV_HOST_DEVICE_INLINE bool empty() const { return ptr_ == nullptr; }
-  TV_HOST_DEVICE_INLINE ptr_t data() { return ptr_; }
-  TV_HOST_DEVICE_INLINE const ptr_t data() const { return ptr_; }
-  TV_HOST_DEVICE_INLINE const tv_shape_t &shape() const { return shape_; }
-  TV_HOST_DEVICE_INLINE const tv_shape_t &stride() const { return stride_; }
-  TV_HOST_DEVICE_INLINE int dim(int idx) const { return shape_[idx]; }
-  TV_HOST_DEVICE_INLINE int ndim() const { return shape_.ndim(); }
-  // Reinterprets the buffer with a new shape. A single -1 entry is replaced
-  // by the extent that makes the element count match size().
-  // The rank argument is spelled with int(...) so both branches of the
-  // conditional are int; mixing -1 with size_t would produce a value that
-  // cannot be narrowed to the int template parameter when Rank == -1.
-  template <class... Inds>
-  TV_HOST_DEVICE_INLINE
-      TensorView<T, (Rank == -1 ? -1 : int(sizeof...(Inds))), PtrTraits, Tindex>
-      view(Inds... newShapes) const {
-    ShapeBase<Rank == -1 ? TV_MAX_DIM : sizeof...(Inds), Tindex> shapes{
-        int(newShapes)...};
-    for (size_t i = 0; i < sizeof...(newShapes); ++i) {
-      if (shapes[i] == -1) {
-        // Neutralize the placeholder first so that shapes.size() is the
-        // product of the remaining extents.
-        shapes[i] = 1;
-        shapes[i] = size() / shapes.size();
-        break;
-      }
-    }
-    TV_ASSERT(shapes.size() == size());
-    return TensorView<T, (Rank == -1 ? -1 : int(sizeof...(Inds))), PtrTraits,
-                      Tindex>(ptr_, shapes);
-  }
-  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
-  view(Shape shapes) const {
-    TV_ASSERT(shapes.size() == size());
-    return TensorView<T, -1, PtrTraits, Tindex>(ptr_, shapes);
-  }
-  // Runtime-rank view over the same buffer using shape_.squeeze().
-  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex> squeeze() const {
-    return TensorView<T, -1, PtrTraits, Tindex>(ptr_, shape_.squeeze());
-  }
-  // View over the same buffer using shape_.squeeze<...>(dim).
-  TV_HOST_DEVICE_INLINE
-  TensorView<T, Rank == -1 ? -1 : Rank - 1, PtrTraits, Tindex>
-  squeeze(int dim) const {
-    // The `template` keyword is required: shape_ has a dependent type, so
-    // without it the `<` after squeeze is parsed as a comparison.
-    return TensorView<T, (Rank == -1 ? -1 : Rank - 1), PtrTraits, Tindex>(
-        ptr_,
-        shape_.template squeeze<(Rank == -1 ? TV_MAX_DIM : Rank - 1)>(dim));
-  }
-  // Number of elements described by the shape.
-  TV_HOST_DEVICE_INLINE size_t size() const { return shape_.size(); }
-  // View of the sub-tensor selected by fixing the leading indices; the
-  // remaining start coordinates are padded with zeros.
-  // NOTE(review): the declared return type has rank -1 while the returned
-  // expression is built with Rank - confirm these agree for Rank != -1.
-  template <class... Integers>
-  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
-  subview(int id, Integers... ints) {
-    tv_shape_t start = {id, ints...};
-    for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
-      start.push_back(0);
-    }
-    return TensorView<T, Rank, PtrTraits, Tindex>(
-        ptr_ + rowArrayIdx(shape_, start),
-        shape_.subshape(sizeof...(ints) + 1));
-  }
-  template <class... Integers>
-  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
-  subview(int id, Integers... ints) const {
-    tv_shape_t start = {id, ints...};
-    for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
-      start.push_back(0);
-    }
-    return TensorView<T, Rank, PtrTraits, Tindex>(
-        ptr_ + rowArrayIdx(shape_, start),
-        shape_.subshape(sizeof...(ints) + 1));
-  }
-  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
-  subview(SimpleVector<int> ids) const {
-    Shape start = ids;
-    for (int i = ids.size(); i < ndim(); ++i) {
-      start.push_back(0);
-    }
-    return TensorView<T, Rank, PtrTraits, Tindex>(
-        ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
-  }
-  // Writes a nested-bracket rendering of the view into `ss` and returns
-  // ss.str(). An empty view returns "" without writing anything.
-  template <typename Os> std::string repr(Os &ss) const {
-    if (empty())
-      return "";
-    if (shape_.ndim() == 0) {
-      ss << "Tensor[" << type_s<T> << "]" << std::endl;
-      ss << *ptr_;
-      return ss.str();
-    }
-    // prev holds the multi-index of the previous element; -1 marks "no
-    // previous element yet".
-    SimpleVector<int64_t, TV_MAX_DIM> prev(ndim(), -1);
-    SimpleVector<int64_t, TV_MAX_DIM> nd_index(ndim());
-    SimpleVector<int64_t, TV_MAX_DIM> _shape;
-    for (auto s : shape()) {
-      _shape.push_back(s);
-    }
-    ss << "Tensor[" << type_s<T> << "]: shape=" << shape()
-       << ", stride=" << stride() << std::endl;
-    auto ndimValue = ndim();
-    for (int64_t i = 0; i < int64_t(size()); ++i) {
-      rowArrayIdxInv(i, nd_index.data(), _shape.data(), ndimValue);
-      bool newline = false;
-      int end_count = 0;
-      // Close one bracket for every axis that wrapped around to 0.
-      for (int j = 0; j < ndimValue; ++j) {
-        if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
-            prev[j] != -1) {
-          ss << "]";
-          ++end_count;
-          newline = true;
-        }
-      }
-      if (prev[0] == -1) {
-        end_count = ndimValue;
-      }
-      if (newline) {
-        ss << "\n";
-      }
-      // Count the axes that start a new run at index 0 (this includes the
-      // very first element, where prev is -1).
-      int starts_count = 0;
-      for (int j = 0; j < ndimValue; ++j) {
-        if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
-          ++starts_count;
-        }
-      }
-      if (starts_count > 0) {
-        for (int j = 0; j < ndimValue - end_count; ++j) {
-          ss << " ";
-        }
-        for (int j = 0; j < starts_count; ++j) {
-          ss << "[";
-        }
-      }
-      // uint8_t is streamed as a number instead of a character.
-      if (std::is_same<T, uint8_t>::value ||
-          std::is_same<T, const uint8_t>::value) {
-        ss << unsigned((*this)[i]);
-      } else {
-        ss << (*this)[i];
-      }
-      if (nd_index[ndimValue - 1] != _shape[ndimValue - 1] - 1) {
-        ss << ",";
-      }
-      for (int j = 0; j < ndimValue; ++j) {
-        prev[j] = nd_index[j];
-      }
-    }
-    for (int j = 0; j < ndimValue; ++j) {
-      ss << "]";
-    }
-    return ss.str();
-  }
-  // Convenience overload rendering into a fresh std::ostringstream.
-  std::string repr() const {
-    std::ostringstream ss;
-    return repr(ss);
-  }
-protected:
-  // Wraps a single value in a Slice whose other two fields are -1.
-  template <typename T1> TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
-    return Slice{int(s), -1, -1};
-  }
-  TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
-  ptr_t ptr_ = nullptr;
-  tv_shape_t shape_;
-  tv_shape_t stride_;
-};
-// Wraps a std::vector in a one-axis TensorView over its storage; the view
-// does not own the data and is only valid while `arr` is alive and unresized.
-template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
-  // Cast explicitly: a size_t inside a braced initializer for an int-based
-  // shape is a narrowing conversion.
-  return TensorView<T>(arr.data(), {static_cast<int>(arr.size())});
-}
-// Wraps a std::vector in a TensorView with the given shape. The product of
-// the shape's extents must equal the vector's length.
-template <typename T>
-TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
-  TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
-  TensorView<T> wrapped(arr.data(), shape);
-  return wrapped;
-}
-// Read-only counterpart: wraps a const std::vector in a one-axis view of
-// const elements over its storage.
-template <typename T> TensorView<const T> vector2tv(const std::vector<T> &arr) {
-  // Cast explicitly: a size_t inside a braced initializer for an int-based
-  // shape is a narrowing conversion.
-  return TensorView<const T>(arr.data(), {static_cast<int>(arr.size())});
-}
-// Streams the textual representation produced by TensorView::repr().
-template <typename Os, typename T, int Rank, template <class> class PtrTraits,
-          typename Tindex>
-Os &operator<<(Os &os, const TensorView<T, Rank, PtrTraits, Tindex> &dt) {
-  const auto text = dt.repr();
-  os << text;
-  return os;
-}
-// Overload for views of const elements; streams TensorView::repr().
-template <typename Os, typename T, int Rank, template <class> class PtrTraits,
-          typename Tindex>
-Os &operator<<(Os &os, const TensorView<const T, Rank, PtrTraits, Tindex> &dt) {
-  const auto text = dt.repr();
-  os << text;
-  return os;
-}
-namespace detail {
-// Maps an element type to the printf conversion used when printing it.
-// Only the types specialized below are supported.
-template <typename T> struct TypePrintfFormat;
-// Floating-point values are printed with two decimals.
-template <> struct TypePrintfFormat<float> {
-  static constexpr const char *value = "%.2f";
-};
-template <> struct TypePrintfFormat<double> {
-  static constexpr const char *value = "%.2f";
-};
-// bool and the signed integers up to 32 bits use "%d".
-template <> struct TypePrintfFormat<bool> {
-  static constexpr const char *value = "%d";
-};
-template <> struct TypePrintfFormat<int8_t> {
-  static constexpr const char *value = "%d";
-};
-template <> struct TypePrintfFormat<int16_t> {
-  static constexpr const char *value = "%d";
-};
-template <> struct TypePrintfFormat<int32_t> {
-  static constexpr const char *value = "%d";
-};
-// Unsigned integers up to 32 bits use "%u".
-template <> struct TypePrintfFormat<uint8_t> {
-  static constexpr const char *value = "%u";
-};
-template <> struct TypePrintfFormat<uint16_t> {
-  static constexpr const char *value = "%u";
-};
-template <> struct TypePrintfFormat<uint32_t> {
-  static constexpr const char *value = "%u";
-};
-// 64-bit integers use the long conversions.
-template <> struct TypePrintfFormat<int64_t> {
-  static constexpr const char *value = "%ld";
-};
-template <> struct TypePrintfFormat<uint64_t> {
-  static constexpr const char *value = "%lu";
-};
-// Shorthand for TypePrintfFormat<T>::value.
-template <class T>
-constexpr const char *type_printf_format_v = TypePrintfFormat<T>::value;
-} // namespace detail
-// Prints a tensor with printf as nested brackets, so it can be called from
-// a CUDA kernel as well as from host code.
-//   tensor: view to print; an empty view prints nothing.
-//   format: printf conversion applied to every element.
-template <typename T, int Rank, template <class> class PtrTraits,
-          typename Tindex>
-TV_HOST_DEVICE void
-printTensorView(const TensorView<T, Rank, PtrTraits, Tindex> &tensor,
-                const char *format) {
-  // used to print tensor in cuda kernel.
-  if (tensor.empty())
-    return;
-  if (tensor.ndim() == 0) {
-    printf(format, tensor());
-    printf("\n");
-    return;
-  }
-  // prev holds the multi-index of the previous element; -1 marks "no previous
-  // element yet".
-  SimpleVector<int64_t, TV_MAX_DIM> prev(tensor.ndim(), -1);
-  SimpleVector<int64_t, TV_MAX_DIM> nd_index(tensor.ndim());
-  SimpleVector<int64_t, TV_MAX_DIM> shape(tensor.shape());
-  auto ndim = tensor.ndim();
-  // size() is unsigned; convert once so the loop compares like with like, as
-  // TensorView::repr does.
-  for (int64_t i = 0; i < int64_t(tensor.size()); ++i) {
-    rowArrayIdxInv(i, nd_index.data(), shape.data(), ndim);
-    bool newline = false;
-    int end_count = 0;
-    // Close one bracket for every axis that wrapped around to 0.
-    for (int j = 0; j < ndim; ++j) {
-      if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0 &&
-          prev[j] != -1) {
-        printf("]");
-        ++end_count;
-        newline = true;
-      }
-    }
-    if (prev[0] == -1) {
-      end_count = ndim;
-    }
-    if (newline) {
-      printf("\n");
-    }
-    int starts_count = 0;
-    for (int j = 0; j < ndim; ++j) {
-      if (nd_index[j] != prev[j] && nd_index[j] == 0 && prev[j] != 0) {
-        ++starts_count;
-      }
-    }
-    if (starts_count > 0) {
-      for (int j = 0; j < ndim - end_count; ++j) {
-        printf(" ");
-      }
-      // Opening brackets: must be "[" so the output is balanced and matches
-      // TensorView::repr.
-      for (int j = 0; j < starts_count; ++j) {
-        printf("[");
-      }
-    }
-    printf(format, tensor[i]);
-    if (nd_index[ndim - 1] != shape[ndim - 1] - 1) {
-      printf(",");
-    }
-    for (int j = 0; j < ndim; ++j) {
-      prev[j] = nd_index[j];
-    }
-  }
-  for (int j = 0; j < ndim; ++j) {
-    printf("]");
-  }
-  printf("\n");
-}
-// Prints a tensor using the default printf conversion registered for its
-// element type (const qualification is ignored for the lookup).
-template <typename T, int Rank, template <class> class PtrTraits,
-          typename Tindex>
-TV_HOST_DEVICE void
-printTensorView(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
-  using Element = typename std::remove_const<T>::type;
-  printTensorView(tensor, detail::type_printf_format_v<Element>);
-}
-// Prints raw memory interpreted with `shape`, using the default printf
-// conversion registered for the element type.
-template <typename T>
-TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
-  using Element = typename std::remove_const<T>::type;
-  printTensorView(TensorView<const T>(ptr, shape),
-                  detail::type_printf_format_v<Element>);
-}
-// Prints raw memory interpreted with `shape`, formatting every element with
-// the caller-supplied printf conversion.
-template <typename T>
-TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
-                                    const char *format) {
-  printTensorView(TensorView<const T>(ptr, shape), format);
-}
-#ifdef TV_CUDA
-// DEVICE_RESET expands to a cudaDeviceReset() call when the CUDA driver types
-// header has been included and to nothing otherwise. A definition supplied
-// earlier is left untouched.
-#ifndef DEVICE_RESET
-#ifdef __DRIVER_TYPES_H__
-#define DEVICE_RESET cudaDeviceReset();
-#else
-#define DEVICE_RESET
-#endif
-#endif
-// Aborts the process when a CUDA call reports failure.
-//   result: status returned by the call; any non-zero value is an error.
-//   func:   text of the checked expression.
-//   file, line: location of the call site.
-// On error, prints a diagnostic to stderr, runs DEVICE_RESET and exits with
-// EXIT_FAILURE. Returns normally when `result` is zero.
-template <typename T>
-void check(T result, char const *const func, const char *const file,
-           int const line) {
-  if (result) {
-    // The code is printed with %d, so pass it as int rather than unsigned.
-    fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
-            static_cast<int>(result), func);
-    DEVICE_RESET
-    // Make sure we call CUDA Device Reset before exiting
-    exit(EXIT_FAILURE);
-  }
-}
-// Wraps a CUDA call so that a failure is reported with the call's source
-// text, file and line.
-#define checkCudaErrors(val) tv::check((val), #val, __FILE__, __LINE__)
-// Enqueues a host-to-device copy of `size` elements on stream `s`; the call
-// is asynchronous and its status is checked with checkCudaErrors.
-template <typename T>
-void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
-  const size_t bytes = size * sizeof(T);
-  checkCudaErrors(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, s));
-}
-// View overload (const source): copies as many elements as fit in both
-// views from host to device on stream `s`.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-              const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
-              cudaStream_t s = 0) {
-  const size_t count = std::min(dst.size(), src.size());
-  host2dev(dst.data(), src.data(), count, s);
-}
-// View overload (mutable source): copies as many elements as fit in both
-// views from host to device on stream `s`.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void host2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-              const TensorView<T, Rank, PtrTraits2, Tindex2> src,
-              cudaStream_t s = 0) {
-  const size_t count = std::min(dst.size(), src.size());
-  host2dev(dst.data(), src.data(), count, s);
-}
-// Blocking host-to-device copy of `size` elements via cudaMemcpy; the status
-// is checked with checkCudaErrors.
-template <typename T> void host2dev_sync(T *dst, const T *src, size_t size) {
-  const size_t bytes = size * sizeof(T);
-  checkCudaErrors(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
-}
-// View overload (const source): blocking host-to-device copy of as many
-// elements as fit in both views.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-                   const TensorView<const T, Rank, PtrTraits2, Tindex2> src) {
-  const size_t count = std::min(dst.size(), src.size());
-  host2dev_sync(dst.data(), src.data(), count);
-}
-// View overload (mutable source): blocking host-to-device copy of as many
-// elements as fit in both views.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void host2dev_sync(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-                   const TensorView<T, Rank, PtrTraits2, Tindex2> src) {
-  const size_t count = std::min(dst.size(), src.size());
-  host2dev_sync(dst.data(), src.data(), count);
-}
-// Enqueues a device-to-host copy of `size` elements on stream `s`; the call
-// is asynchronous and its status is checked with checkCudaErrors.
-template <typename T>
-void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
-  const size_t bytes = size * sizeof(T);
-  checkCudaErrors(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, s));
-}
-// View overload (const source): copies as many elements as fit in both
-// views from device to host on stream `s`.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-              const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
-              cudaStream_t s = 0) {
-  const size_t count = std::min(dst.size(), src.size());
-  dev2host(dst.data(), src.data(), count, s);
-}
-// View overload (mutable source): copies as many elements as fit in both
-// views from device to host on stream `s`.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void dev2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-              const TensorView<T, Rank, PtrTraits2, Tindex2> src,
-              cudaStream_t s = 0) {
-  const size_t count = std::min(dst.size(), src.size());
-  dev2host(dst.data(), src.data(), count, s);
-}
-// Issues a cudaMemcpyAsync of `size` elements of type T from `src` to `dst`
-// in the device-to-device direction on stream `s` (defaults to stream 0).
-// The stream is not synchronized here; the returned status is handed to
-// checkCudaErrors.
-template <typename T>
-void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
-  // The runtime API takes a byte count, not an element count.
-  const size_t num_bytes = size * sizeof(T);
-  checkCudaErrors(
-      cudaMemcpyAsync(dst, src, num_bytes, cudaMemcpyDeviceToDevice, s));
-}
-// TensorView overload of dev2dev for a source view over const T.
-// Transfers min(dst.size(), src.size()) elements on stream `s` via the
-// raw-pointer dev2dev overload.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-             const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
-             cudaStream_t s = 0) {
-  // Clamp to the smaller view so neither buffer is overrun.
-  const auto count = std::min(dst.size(), src.size());
-  dev2dev(dst.data(), src.data(), count, s);
-}
-// TensorView overload of dev2dev for a source view over non-const T.
-// Transfers min(dst.size(), src.size()) elements on stream `s` via the
-// raw-pointer dev2dev overload.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void dev2dev(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-             const TensorView<T, Rank, PtrTraits2, Tindex2> src,
-             cudaStream_t s = 0) {
-  // Clamp to the smaller view so neither buffer is overrun.
-  const auto count = std::min(dst.size(), src.size());
-  dev2dev(dst.data(), src.data(), count, s);
-}
-// Issues a cudaMemcpyAsync of `size` elements of type T from `src` to `dst`
-// in the host-to-host direction on stream `s` (defaults to stream 0).
-// The stream is not synchronized here; the returned status is handed to
-// checkCudaErrors.
-template <typename T>
-void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
-  // The runtime API takes a byte count, not an element count.
-  const size_t num_bytes = size * sizeof(T);
-  checkCudaErrors(
-      cudaMemcpyAsync(dst, src, num_bytes, cudaMemcpyHostToHost, s));
-}
-// TensorView overload of host2host for a source view over const T.
-// Transfers min(dst.size(), src.size()) elements on stream `s` via the
-// raw-pointer host2host overload.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-               const TensorView<const T, Rank, PtrTraits2, Tindex2> src,
-               cudaStream_t s = 0) {
-  // Clamp to the smaller view so neither buffer is overrun.
-  const auto count = std::min(dst.size(), src.size());
-  host2host(dst.data(), src.data(), count, s);
-}
-// TensorView overload of host2host for a source view over non-const T.
-// Transfers min(dst.size(), src.size()) elements on stream `s` via the
-// raw-pointer host2host overload.
-template <typename T, int Rank, template <class> class PtrTraits1,
-          template <class> class PtrTraits2, typename Tindex1, typename Tindex2>
-void host2host(TensorView<T, Rank, PtrTraits1, Tindex1> dst,
-               const TensorView<T, Rank, PtrTraits2, Tindex2> src,
-               cudaStream_t s = 0) {
-  // Clamp to the smaller view so neither buffer is overrun.
-  const auto count = std::min(dst.size(), src.size());
-  host2host(dst.data(), src.data(), count, s);
-}
-// Sets every byte of the view's storage to zero with cudaMemset
-// (tensor.size() * sizeof(T) bytes starting at tensor.data()). The returned
-// status is handed to checkCudaErrors.
-// NOTE(review): assumes tensor.data() is a device pointer, per the name —
-// confirm at call sites.
-template <typename T, int Rank, template <class> class PtrTraits,
-          typename Tindex>
-void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
-  const auto num_bytes = tensor.size() * sizeof(T);
-  checkCudaErrors(cudaMemset(tensor.data(), 0, num_bytes));
-}
-// Stream variant of zero_dev: issues cudaMemsetAsync on stream `s` to set
-// tensor.size() * sizeof(T) bytes starting at tensor.data() to zero. The
-// stream is not synchronized here; the returned status is handed to
-// checkCudaErrors.
-// NOTE(review): assumes tensor.data() is a device pointer, per the name —
-// confirm at call sites.
-template <typename T, int Rank, template <class> class PtrTraits,
-          typename Tindex>
-void zero_dev(TensorView<T, Rank, PtrTraits, Tindex> tensor, cudaStream_t s) {
-  const auto num_bytes = tensor.size() * sizeof(T);
-  checkCudaErrors(cudaMemsetAsync(tensor.data(), 0, num_bytes, s));
-}
-// Assigns 0 to each of the tensor.size() elements starting at tensor.data()
-// using std::fill on the calling thread.
-// NOTE(review): assumes tensor.data() is host-accessible memory, per the
-// name — confirm at call sites.
-template <typename T, int Rank, template <class> class PtrTraits,
-          typename Tindex>
-void zero_host(TensorView<T, Rank, PtrTraits, Tindex> tensor) {
-  auto first = tensor.data();
-  auto last = tensor.data() + tensor.size();
-  std::fill(first, last, 0);
-}
-#endif
-} // namespace tv
\ No newline at end of file