Unverified Commit 333536f6 authored by Wenwei Zhang, committed by GitHub

Release v1.0.0rc1

parents 9c7270d0 f747daab
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
// #include <prettyprint.h>
#include <sstream>
#include <type_traits>
#include <vector>
namespace tv {
#ifdef __NVCC__
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
assert(expr); \
}
template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
ss << val;
}
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
ss << val << " ";
sstream_print(ss, args...);
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
struct GPU {
GPU(cudaStream_t s = 0) : mStream(s) {}
virtual cudaStream_t getStream() const { return mStream; }
cudaStream_t mStream = 0;
};
struct CPU {};
#define TV_MAX_DIM 6
/*
template <typename T>
constexpr size_t calc_align(size_t ndim)
{
if (ndim * sizeof(T) == 1)
return 1;
else if (ndim * sizeof(T) == 2)
return 2;
else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
return 4;
else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
return 8;
else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
return 16;
else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
return 32;
else
return 64;
}
*/
template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
public:
TV_HOST_DEVICE_INLINE SimpleVector(){};
TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
TV_ASSERT(q.size() <= MaxDim);
mSize = 0;
for (T s : q) {
mArray[mSize++] = s;
}
mSize = q.size();
}
SimpleVector(const std::vector<T> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE void push_back(T s) {
#ifdef TV_DEBUG
TV_ASSERT(mSize < MaxDim);
#endif
mArray[mSize] = s;
mSize++;
}
TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
TV_ASSERT(mSize > 0);
#endif
mSize--;
}
TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
TV_HOST_DEVICE_INLINE const T *data() const { return mArray; }
  TV_HOST_DEVICE_INLINE bool empty() const { return mSize == 0; }
typedef size_t size_type;
class iterator {
public:
typedef iterator self_type;
typedef T value_type;
typedef T &reference;
typedef T *pointer;
typedef std::forward_iterator_tag iterator_category;
typedef std::ptrdiff_t difference_type;
TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
class const_iterator {
public:
typedef const_iterator self_type;
typedef T value_type;
typedef const T &reference;
typedef const T *pointer;
typedef std::ptrdiff_t difference_type;
typedef std::forward_iterator_tag iterator_category;
TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
TV_HOST_DEVICE_INLINE const_iterator begin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator end() const {
return const_iterator(mArray + mSize);
}
TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator cend() const {
return const_iterator(mArray + mSize);
}
protected:
T mArray[MaxDim];
size_t mSize = 0;
};
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
const SimpleVector<T, MaxDim> &rfs) {
if (lfs.size() != rfs.size()) return false;
for (size_t i = 0; i < lfs.size(); ++i) {
if (lfs[i] != rfs[i]) return false;
}
return true;
}
template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
const SimpleVector<T, MaxDim> &rfs) {
return !(lfs == rfs);
}
struct Slice {
template <class... Integers>
TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3,
                  "slice init must have at most 3 values");
SimpleVector<int, 3> slices{int(ints)...};
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
for (size_t i = 0; i < slices.size(); ++i) {
mSlices[i] = slices[i];
}
}
TV_HOST_DEVICE_INLINE Slice() {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
}
template <typename T>
TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
TV_ASSERT(slice.size() <= 3);
int idx = 0;
for (T s : slice) {
mSlices[idx] = int(s);
++idx;
}
}
TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
protected:
int mSlices[3];
};
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {}
template <typename T, template <class...> class Container>
ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
: SimpleVector<int, MaxDim>(shape) {}
ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end < this->mSize && end > start);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < this->mSize; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE size_t size() const {
if (this->mSize == 0) return 0;
size_t s = 1;
for (int i = 0; i < int(this->mSize); ++i) {
s *= this->mArray[i];
}
return s;
}
TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
};
using Shape = ShapeBase<TV_MAX_DIM>;
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
std::vector<int> &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = shape.size() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
const Shape &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
const Index *shape) {
unsigned offset = 0;
unsigned m = 1;
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
offset += m * indexes[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
const Index *shape) {
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
output[i] = index % shape[i];
index -= output[i];
index /= shape[i];
}
return index;
}
template <int N>
struct ArrayIndexRowMajor {
// mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return indexes[N - 1] +
shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
}
};
template <>
struct ArrayIndexRowMajor<0> {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return 0;
}
};
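// Worked example: for shape {2, 3, 4} and indexes {1, 2, 3}, both rowArrayIdx
// and ArrayIndexRowMajor<3>::run compute
//   offset = 3 + 4 * (2 + 3 * 1) = 23,
// i.e. the last dimension varies fastest (row-major / C-contiguous layout).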
namespace detail {
template <typename T>
constexpr const char *simpleTypeName(T val = T());
template <>
constexpr const char *simpleTypeName(float val) {
return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
return "uint64";
}
}; // namespace detail
template <typename T, int Rank = -1>
struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
// explicit TV_HOST_DEVICE_INLINE TensorView(const
// TensorView<std::remove_const_t<T>> &tview) : mPtr(tview.data()),
// mShape(tview.shape()) {}
template <class... Integers>
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Integers... shapes)
: mPtr(ptr) {
mShape = {int(shapes)...};
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
const TensorView<T, Rank> &tensor) {
TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
"\n");
T *ptr = mPtr;
const T *other_ptr = tensor.data();
for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
return *this;
}
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
std::initializer_list<T1> seq) {
TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
"\n");
T *ptr = mPtr;
for (const T1 &s : seq) *(ptr++) = T(s);
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
template <class... Inds>
TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
TV_HOST_DEVICE_INLINE T &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
TV_HOST_DEVICE_INLINE const T &operator()() const {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
template <class T1>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
template <class T1>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
  // TODO: this conflicts with operator[](SimpleVector<Slice> slice_vec).
/*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](const Shape index) const {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}*/
TV_HOST_DEVICE_INLINE TensorView<T, Rank> operator[](
SimpleVector<Slice> slice_vec) {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE const TensorView<T, Rank> operator[](
SimpleVector<Slice> slice_vec) const {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
TV_HOST_DEVICE_INLINE T *data() { return mPtr; }
TV_HOST_DEVICE_INLINE const T *data() const { return mPtr; }
TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Inds... newShapes) {
Shape shapes{int(newShapes)...};
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Shape shapes) {
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Inds... newShapes) const {
Shape shapes{int(newShapes)...};
for (size_t i = 0; i < shapes.ndim(); ++i) {
if (shapes[i] == -1) {
shapes[i] = 1;
shapes[i] = size() / shapes.size();
break;
}
}
TV_ASSERT(shapes.size() == size());
return TensorView<T, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Shape shapes) const {
TV_ASSERT(shapes.size() == size());
return TensorView<T, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze() const {
return TensorView<T, Rank>(mPtr, mShape.squeeze());
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze(int dim) const {
return TensorView<T, Rank>(mPtr, mShape.squeeze(dim));
}
TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
template <class... Slices>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slice slice,
Slices... slices) const {
return subview<float, Slice, Slices...>(slice, slices...);
}
template <class T2 = float, class... Slices>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slices... slices) const {
Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
Shape new_shape{to_slice(slices)[0]...};
Shape start{to_slice(slices)[0]...};
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<T, Rank>(mPtr + offset, reduced_shape);
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(int id, Integers... ints) {
Shape start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(sizeof...(ints) + 1));
}
std::string repr() const {
std::ostringstream ss;
if (empty()) return "";
if (mShape.ndim() == 0) {
ss << *mPtr;
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
return ss.str();
}
Shape counter = mShape;
auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
ss << "[";
}
for (size_t i = 0; i < this->size(); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != this->size() - 1) ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
if (i != this->size() - 1) {
if (inc_count != 0) ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
}
}
}
ss << "]";
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
return ss.str();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
TV_HOST_DEVICE_INLINE TensorView<T, Rank> _subview(
SimpleVector<Slice> slice_vec) {
Shape new_shape;
for (int i = 0; i < slice_vec.size(); ++i) {
new_shape.push_back(slice_vec[i][0]);
}
Shape start = new_shape;
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<T, Rank>(mPtr + offset, reduced_shape);
}
template <typename T1>
TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
T *mPtr = nullptr;
Shape mShape;
};
template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<T, Rank> &dt) {
os << dt.repr();
return os;
}
template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
os << dt.repr();
return os;
}
namespace detail {
template <typename T>
constexpr const char *printfTypeFormat(T val = T());
template <>
constexpr const char *printfTypeFormat(float val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(int val) {
return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
return "%u";
}
template <>
constexpr const char *printfTypeFormat(long val) {
return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
return "%lu";
}
}; // namespace detail
template <typename T>
TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
const char *format) {
if (tensor.empty()) return;
if (tensor.ndim() == 0) {
printf(format, tensor());
printf("\n");
return;
}
Shape counter = tensor.shape();
auto tensor_flat = tensor.view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
printf("[");
}
for (size_t i = 0; i < tensor.size(); ++i) {
printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == tensor.dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != tensor.size() - 1) printf(", ");
for (int j = 0; j < inc_count; ++j) {
printf("]");
}
if (i != tensor.size() - 1) {
if (inc_count != 0) printf("\n");
for (int j = 0; j < inc_count; ++j) {
printf("[");
}
}
}
printf("]\n");
}
template <typename T>
TV_HOST_DEVICE void printTensorView(TensorView<T> tensor) {
using Traw = typename std::remove_const<T>::type;
return printTensorView(tensor, detail::printfTypeFormat<Traw>());
}
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
using Traw = typename std::remove_const<T>::type;
return printTensorView(TensorView<const T>(ptr, shape),
detail::printfTypeFormat<Traw>());
}
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
const char *format) {
return printTensorView(TensorView<const T>(ptr, shape), format);
}
} // namespace tv
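// A minimal host-side usage sketch, not part of the library sources above,
// showing how the tensorview.h API fits together: a non-owning TensorView over
// a row-major buffer, element access, view(-1), subview and printTensorView.
// The buffer contents are invented; it assumes the header is reachable as
// <tensorview/tensorview.h> (the include path used elsewhere in this commit)
// and that the CUDA headers are on the include path.
#include <tensorview/tensorview.h>

#include <vector>

int main() {
  // A 2x3 row-major buffer wrapped in a non-owning view.
  std::vector<float> buf = {0, 1, 2, 3, 4, 5};
  tv::TensorView<float> view(buf.data(), {2, 3});

  float v = view(1, 2);         // row-major access: buf[1 * 3 + 2] == 5
  auto flat = view.view(-1);    // the -1 dimension is inferred, giving shape {6}
  auto row1 = view.subview(1);  // fixes the first dimension, giving shape {3}

  tv::printTensorView(view);    // nested-bracket dump using "%.2f" per element
  return (v == 5.0f && flat.ndim() == 1 && row1.dim(0) == 3) ? 0 : 1;
}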
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace tv {
struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream();
}
};
template <typename T>
void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) {
case at::ScalarType::Double: {
auto val = std::is_same<std::remove_const_t<T>, double>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Float: {
auto val = std::is_same<std::remove_const_t<T>, float>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Int: {
auto val = std::is_same<std::remove_const_t<T>, int>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Half: {
auto val = std::is_same<std::remove_const_t<T>, at::Half>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Long: {
auto val = std::is_same<std::remove_const_t<T>, long>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
default:
TV_ASSERT_RT_ERR(false, "error");
}
}
template <typename T>
tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
check_torch_dtype<T>(tensor);
tv::Shape shape;
for (auto i : tensor.sizes()) {
shape.push_back(i);
}
return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
}
} // namespace tv
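// A short usage sketch (illustrative only) for check_torch_dtype / torch2tv
// above: wrap a contiguous CPU torch::Tensor in a tv::TensorView and mutate it
// in place. The function name scale_inplace is invented, and it assumes this
// translation unit also includes the helper header defined above (its install
// path is not shown in this commit).
#include <tensorview/tensorview.h>
#include <torch/script.h>

void scale_inplace(torch::Tensor t, float s) {
  tv::check_torch_dtype<float>(t);  // throws std::runtime_error on dtype mismatch
  tv::TensorView<float> view = tv::torch2tv<float>(t);
  for (size_t i = 0; i < view.size(); ++i) {  // assumes a contiguous CPU tensor
    view[i] *= s;  // writes through to the tensor's storage
  }
}
// e.g. torch::Tensor x = torch::ones({4}); scale_inplace(x, 2.0f);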
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cuda_runtime_api.h>
#include <iostream>
namespace spconv {
template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
CudaContextTimer() {
cudaDeviceSynchronize();
mCurTime = std::chrono::steady_clock::now();
}
typename TimeT::rep report() {
cudaDeviceSynchronize();
auto duration = std::chrono::duration_cast<TimeT>(
std::chrono::steady_clock::now() - mCurTime);
auto res = duration.count();
mCurTime = std::chrono::steady_clock::now();
return res;
}
private:
std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }
typename TimeT::rep report() {
auto duration = std::chrono::duration_cast<TimeT>(
std::chrono::steady_clock::now() - mCurTime);
auto res = duration.count();
mCurTime = std::chrono::steady_clock::now();
return res;
}
private:
std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
} // namespace spconv
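// A small usage sketch (illustrative only) for the timers above.
// CudaContextTimer calls cudaDeviceSynchronize() at construction and inside
// report(), so the reported microseconds include any outstanding device work.
// The buffer size and the memcpy being timed are arbitrary choices.
#include <utility/timer.h>

#include <cuda_runtime_api.h>
#include <iostream>
#include <vector>

int main() {
  const size_t n = 1 << 20;
  std::vector<float> host(n, 1.0f);
  float *dev = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&dev), n * sizeof(float));

  auto timer = spconv::CudaContextTimer<>();  // sync, then start the clock
  cudaMemcpy(dev, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  // report() syncs, returns the elapsed microseconds and restarts the clock.
  std::cout << "H2D copy: " << timer.report() << " us" << std::endl;

  cudaFree(dev);
  return 0;
}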
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections import OrderedDict
import torch
from torch import nn
from .structure import SparseConvTensor
def is_spconv_module(module):
spconv_modules = (SparseModule, )
return isinstance(module, spconv_modules)
def is_sparse_conv(module):
from .conv import SparseConvolution
return isinstance(module, SparseConvolution)
def _mean_update(vals, m_vals, t):
outputs = []
if not isinstance(vals, list):
vals = [vals]
if not isinstance(m_vals, list):
m_vals = [m_vals]
for val, m_val in zip(vals, m_vals):
output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
outputs.append(output)
if len(outputs) == 1:
outputs = outputs[0]
return outputs
class SparseModule(nn.Module):
"""place holder, All module subclass from this will take sptensor in
SparseSequential."""
pass
class SparseSequential(SparseModule):
r"""A sequential container.
Modules will be added to it in the order they are passed in the
constructor.
Alternatively, an ordered dict of modules can also be passed in.
    To make it easier to understand, here is a small example::
# Example of using Sequential
model = SparseSequential(
SparseConv2d(1,20,5),
nn.ReLU(),
SparseConv2d(20,64,5),
nn.ReLU()
)
# Example of using Sequential with OrderedDict
model = SparseSequential(OrderedDict([
('conv1', SparseConv2d(1,20,5)),
('relu1', nn.ReLU()),
('conv2', SparseConv2d(20,64,5)),
('relu2', nn.ReLU())
]))
        # Example of using Sequential with kwargs (Python 3.6+)
model = SparseSequential(
conv1=SparseConv2d(1,20,5),
relu1=nn.ReLU(),
conv2=SparseConv2d(20,64,5),
relu2=nn.ReLU()
)
"""
def __init__(self, *args, **kwargs):
super(SparseSequential, self).__init__()
if len(args) == 1 and isinstance(args[0], OrderedDict):
for key, module in args[0].items():
self.add_module(key, module)
else:
for idx, module in enumerate(args):
self.add_module(str(idx), module)
for name, module in kwargs.items():
if sys.version_info < (3, 6):
raise ValueError('kwargs only supported in py36+')
if name in self._modules:
raise ValueError('name exists.')
self.add_module(name, module)
self._sparity_dict = {}
def __getitem__(self, idx):
if not (-len(self) <= idx < len(self)):
raise IndexError('index {} is out of range'.format(idx))
if idx < 0:
idx += len(self)
it = iter(self._modules.values())
for i in range(idx):
next(it)
return next(it)
def __len__(self):
return len(self._modules)
@property
def sparity_dict(self):
return self._sparity_dict
def add(self, module, name=None):
if name is None:
name = str(len(self._modules))
if name in self._modules:
raise KeyError('name exists')
self.add_module(name, module)
def forward(self, input):
for k, module in self._modules.items():
if is_spconv_module(module): # use SpConvTensor as input
assert isinstance(input, SparseConvTensor)
self._sparity_dict[k] = input.sparity
input = module(input)
else:
if isinstance(input, SparseConvTensor):
if input.indices.shape[0] != 0:
input.features = module(input.features)
else:
input = module(input)
return input
def fused(self):
"""don't use this.
no effect.
"""
from .conv import SparseConvolution
mods = [v for k, v in self._modules.items()]
fused_mods = []
idx = 0
while idx < len(mods):
if is_sparse_conv(mods[idx]):
if idx < len(mods) - 1 and isinstance(mods[idx + 1],
nn.BatchNorm1d):
new_module = SparseConvolution(
ndim=mods[idx].ndim,
in_channels=mods[idx].in_channels,
out_channels=mods[idx].out_channels,
kernel_size=mods[idx].kernel_size,
stride=mods[idx].stride,
padding=mods[idx].padding,
dilation=mods[idx].dilation,
groups=mods[idx].groups,
bias=True,
subm=mods[idx].subm,
output_padding=mods[idx].output_padding,
transposed=mods[idx].transposed,
inverse=mods[idx].inverse,
indice_key=mods[idx].indice_key,
fused_bn=True,
)
new_module.load_state_dict(mods[idx].state_dict(), False)
new_module.to(mods[idx].weight.device)
conv = new_module
bn = mods[idx + 1]
conv.bias.data.zero_()
conv.weight.data[:] = conv.weight.data * bn.weight.data / (
torch.sqrt(bn.running_var) + bn.eps)
conv.bias.data[:] = (
conv.bias.data - bn.running_mean) * bn.weight.data / (
torch.sqrt(bn.running_var) + bn.eps) + bn.bias.data
fused_mods.append(conv)
idx += 2
else:
fused_mods.append(mods[idx])
idx += 1
else:
fused_mods.append(mods[idx])
idx += 1
return SparseSequential(*fused_mods)
class ToDense(SparseModule):
"""convert SparseConvTensor to NCHW dense tensor."""
def forward(self, x: SparseConvTensor):
return x.dense()
class RemoveGrid(SparseModule):
"""remove pre-allocated grid buffer."""
def forward(self, x: SparseConvTensor):
x.grid = None
return x
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from . import sparse_conv_ext
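# get_conv_output_size / get_deconv_output_size below apply the standard
# per-dimension (de)convolution output-size formulas. For example, with
# input=41, kernel=3, stride=2, padding=1, dilation=1:
#   (41 + 2 * 1 - 1 * (3 - 1) - 1) // 2 + 1 = 21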
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
output_size = []
for i in range(ndim):
size = (input_size[i] + 2 * padding[i] - dilation[i] *
(kernel_size[i] - 1) - 1) // stride[i] + 1
if kernel_size[i] == -1:
output_size.append(1)
else:
output_size.append(size)
return output_size
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
output_padding):
ndim = len(input_size)
output_size = []
for i in range(ndim):
if kernel_size[i] == -1:
raise ValueError("deconv don't support kernel_size < 0")
size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
i] + output_padding[i]
output_size.append(size)
return output_size
def get_indice_pairs(indices,
batch_size,
spatial_shape,
ksize=3,
stride=1,
padding=0,
dilation=1,
out_padding=0,
subm=False,
transpose=False,
grid=None):
ndim = indices.shape[1] - 1
if not isinstance(ksize, (list, tuple)):
ksize = [ksize] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(out_padding, (list, tuple)):
out_padding = [out_padding] * ndim
for d, s in zip(dilation, stride):
        assert any([s == 1, d == 1]), \
            "stride > 1 with dilation > 1 is not supported"
if not subm:
if transpose:
out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
padding, dilation, out_padding)
else:
out_shape = get_conv_output_size(spatial_shape, ksize, stride,
padding, dilation)
else:
out_shape = spatial_shape
if grid is None:
if ndim == 2:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_2d
elif ndim == 3:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_3d
elif ndim == 4:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_4d
else:
raise NotImplementedError
return get_indice_pairs_func(indices, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose))
else:
if ndim == 2:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_grid_2d
elif ndim == 3:
get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_grid_3d
else:
raise NotImplementedError
return get_indice_pairs_func(indices, grid, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose))
def indice_conv(features,
filters,
indice_pairs,
indice_pair_num,
num_activate_out,
inverse=False,
subm=False):
if filters.dtype == torch.float32:
return sparse_conv_ext.indice_conv_fp32(features, filters,
indice_pairs, indice_pair_num,
num_activate_out, int(inverse),
int(subm))
elif filters.dtype == torch.half:
return sparse_conv_ext.indice_conv_half(features, filters,
indice_pairs, indice_pair_num,
num_activate_out, int(inverse),
int(subm))
else:
raise NotImplementedError
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
num_activate_out, inverse, subm):
if features.dtype == torch.half:
func = sparse_conv_ext.fused_indice_conv_half
elif filters.dtype == torch.float32:
func = sparse_conv_ext.fused_indice_conv_fp32
else:
raise NotImplementedError
return func(features, filters, bias, indice_pairs, indice_pair_num,
num_activate_out, int(inverse), int(subm))
def indice_conv_backward(features,
filters,
out_bp,
indice_pairs,
indice_pair_num,
inverse=False,
subm=False):
if filters.dtype == torch.float32:
return sparse_conv_ext.indice_conv_backward_fp32(
features, filters, out_bp, indice_pairs, indice_pair_num,
int(inverse), int(subm))
elif filters.dtype == torch.half:
return sparse_conv_ext.indice_conv_backward_half(
features, filters, out_bp, indice_pairs, indice_pair_num,
int(inverse), int(subm))
else:
raise NotImplementedError
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
if features.dtype == torch.float32:
return sparse_conv_ext.indice_maxpool_fp32(features, indice_pairs,
indice_pair_num,
num_activate_out)
elif features.dtype == torch.half:
return sparse_conv_ext.indice_maxpool_half(features, indice_pairs,
indice_pair_num,
num_activate_out)
else:
raise NotImplementedError
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
indice_pair_num):
if features.dtype == torch.float32:
return sparse_conv_ext.indice_maxpool_backward_fp32(
features, out_features, out_bp, indice_pairs, indice_pair_num)
elif features.dtype == torch.half:
return sparse_conv_ext.indice_maxpool_backward_half(
features, out_features, out_bp, indice_pairs, indice_pair_num)
else:
raise NotImplementedError
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import functional as Fsp
from . import ops
from .modules import SparseModule
from .structure import SparseConvTensor
class SparseMaxPool(SparseModule):
def __init__(self,
ndim,
kernel_size,
stride=1,
padding=0,
dilation=1,
subm=False):
super(SparseMaxPool, self).__init__()
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
self.ndim = ndim
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.subm = subm
self.dilation = dilation
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
indices, batch_size, spatial_shape, self.kernel_size, self.stride,
self.padding, self.dilation, 0, self.subm)
out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device),
outids.shape[0])
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
class SparseMaxPool2d(SparseMaxPool):
def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding,
dilation)
class SparseMaxPool3d(SparseMaxPool):
def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding,
dilation)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/extension.h>
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("get_indice_pairs_2d", &spconv::getIndicePair<2>,
"get_indice_pairs_2d");
m.def("get_indice_pairs_3d", &spconv::getIndicePair<3>,
"get_indice_pairs_3d");
m.def("get_indice_pairs_4d", &spconv::getIndicePair<4>,
"get_indice_pairs_4d");
m.def("get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>,
"get_indice_pairs_grid_2d");
m.def("get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>,
"get_indice_pairs_grid_3d");
m.def("indice_conv_fp32", &spconv::indiceConv<float>, "indice_conv_fp32");
m.def("indice_conv_backward_fp32", &spconv::indiceConvBackward<float>,
"indice_conv_backward_fp32");
m.def("indice_conv_half", &spconv::indiceConv<at::Half>, "indice_conv_half");
m.def("indice_conv_backward_half", &spconv::indiceConvBackward<at::Half>,
"indice_conv_backward_half");
m.def("fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>,
"fused_indice_conv_fp32");
m.def("fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>,
"fused_indice_conv_half");
m.def("indice_maxpool_fp32", &spconv::indiceMaxPool<float>,
"indice_maxpool_fp32");
m.def("indice_maxpool_backward_fp32", &spconv::indiceMaxPoolBackward<float>,
"indice_maxpool_backward_fp32");
m.def("indice_maxpool_half", &spconv::indiceMaxPool<at::Half>,
"indice_maxpool_half");
m.def("indice_maxpool_backward_half",
&spconv::indiceMaxPoolBackward<at::Half>,
"indice_maxpool_backward_half");
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
else
return getIndicePairsConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose) {
Index batchSize = gridsOut.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
// auto timer = spconv::CudaContextTimer<>();
if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return 1;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indicePairs.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
Index numAct = indicePairUnique.dim(0) - 1;
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR();
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR();
}
return numAct;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \
NDIM>;
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto doutData = dout.data();
auto dinData = din.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
dinData[idxi + plane] += doutData[idxo + plane];
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
T in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(T *outFeatures,
const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
dout += blockIdx.y * NumTLP;
din += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
din[idxi] += dout[idxo];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
T bufdi[vecloadFactor];
T bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
}
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
&indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/reordering.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(T) * numPlanes);
}
}
};
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data();
T* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/mp_helper.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0) return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully when
indice repeats, don't support repeat add which is supported in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
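# Illustrative sketch added for clarity (not part of the original file):
# a minimal use of scatter_nd above. All shapes and values are made up.
def _example_scatter_nd():
    """Scatter two 3-channel rows into a dense (4, 4, 3) tensor."""
    indices = torch.tensor([[0, 1], [2, 3]])  # two (row, col) positions
    updates = torch.ones(2, 3)                # one feature row per position
    dense = scatter_nd(indices, updates, [4, 4, 3])
    # dense[0, 1] and dense[2, 3] are now all ones, everything else is zero.
    return dense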
class SparseConvTensor(object):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None):
"""
Args:
grid: pre-allocated grid tensor.
should be used when the volume of spatial shape
is very large.
"""
self.features = features
self.indices = indices
if self.indices.dtype != torch.int32:
            self.indices = self.indices.int()
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
self.grid = grid
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long(), self.features, output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return (self.indices.shape[0] / np.prod(self.spatial_shape) /
self.batch_size)
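# Illustrative sketch added for clarity (not part of the original file):
# building a tiny SparseConvTensor and densifying it. All shapes and
# values below are made up for demonstration.
def _example_sparse_conv_tensor():
    """Create a SparseConvTensor with two active voxels and convert it to
    a dense tensor of shape [batch, channels, *spatial_shape]."""
    features = torch.randn(2, 4)  # 2 active voxels, 4 channels each
    # one [batch_idx, z, y, x] row per active voxel, int32 as expected
    indices = torch.tensor([[0, 0, 1, 1], [0, 1, 0, 0]], dtype=torch.int32)
    x = SparseConvTensor(features, indices, spatial_shape=[2, 2, 2],
                         batch_size=1)
    return x.dense()  # shape: [1, 4, 2, 2, 2], channels first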
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
class TestCase(unittest.TestCase):
def _GetNdArray(self, a):
if not isinstance(a, np.ndarray):
a = np.array(a)
return a
def assertAllEqual(self, a, b):
"""Asserts that two numpy arrays have the same values.
        Args:
            a: the expected numpy ndarray, or anything that can be converted
                to one.
            b: the actual numpy ndarray, or anything that can be converted
                to one.
        """
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
same = (a == b)
if a.dtype == np.float32 or a.dtype == np.float64:
same = np.logical_or(same,
np.logical_and(np.isnan(a), np.isnan(b)))
if not np.all(same):
# Prints more details than np.testing.assert_array_equal.
diff = np.logical_not(same)
if a.ndim:
x = a[np.where(diff)]
y = b[np.where(diff)]
print('not equal where = ', np.where(diff))
else:
# np.where is broken for scalars
x, y = a, b
print('not equal lhs = ', x)
print('not equal rhs = ', y)
np.testing.assert_array_equal(a, b)
def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
"""Asserts that two numpy arrays, or dicts of same, have near values.
This does not support nested dicts.
Args:
            a: The expected numpy ndarray (or anything that can be converted
                to one), or a dict of same. Must be a dict iff `b` is a dict.
            b: The actual numpy ndarray (or anything that can be converted
                to one), or a dict of same. Must be a dict iff `a` is a dict.
rtol: relative tolerance.
atol: absolute tolerance.
Raises:
ValueError: if only one of `a` and `b` is a dict.
"""
is_a_dict = isinstance(a, dict)
if is_a_dict != isinstance(b, dict):
raise ValueError("Can't compare dict to non-dict, %s vs %s." %
(a, b))
if is_a_dict:
self.assertCountEqual(
a.keys(),
b.keys(),
msg='mismatched keys, expected %s, got %s' %
(a.keys(), b.keys()))
for k in a:
self._assertArrayLikeAllClose(
a[k],
b[k],
rtol=rtol,
atol=atol,
msg='%s: expected %s, got %s.' % (k, a, b))
else:
self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
if not np.allclose(a, b, rtol=rtol, atol=atol):
# Prints more details than np.testing.assert_allclose.
#
# NOTE: numpy.allclose (and numpy.testing.assert_allclose)
# checks whether two arrays are element-wise equal within a
# tolerance. The relative difference (rtol * abs(b)) and the
# absolute difference atol are added together to compare against
# the absolute difference between a and b. Here, we want to
# print out which elements violate such conditions.
cond = np.logical_or(
np.abs(a - b) > atol + rtol * np.abs(b),
np.isnan(a) != np.isnan(b))
if a.ndim:
x = a[np.where(cond)]
y = b[np.where(cond)]
print('not close where = ', np.where(cond))
else:
# np.where is broken for scalars
x, y = a, b
print('not close lhs = ', x)
print('not close rhs = ', y)
print('not close dif = ', np.abs(x - y))
print('not close tol = ', atol + rtol * np.abs(y))
print('dtype = %s, shape = %s' % (a.dtype, a.shape))
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
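# Illustrative sketch added for clarity (not part of the original file):
# the tolerance rule used by assertAllClose above mirrors np.allclose,
# which treats a and b as close when abs(a - b) <= atol + rtol * abs(b).
def _example_allclose_tolerance():
    a, b, rtol, atol = 1.0, 1.0 + 5e-7, 1e-6, 1e-6
    assert abs(a - b) <= atol + rtol * abs(b)                      # within tolerance
    assert not abs(a - (b + 1e-5)) <= atol + rtol * abs(b + 1e-5)  # too far apart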
def params_grid(*params):
size = len(params)
length = 1
for p in params:
length *= len(p)
sizes = [len(p) for p in params]
counter = [0] * size
total = []
for i in range(length):
total.append([0] * size)
for i in range(length):
for j in range(size):
total[i][j] = params[j][counter[j]]
counter[size - 1] += 1
for c in range(size - 1, -1, -1):
if (counter[c] == sizes[c] and c > 0):
counter[c - 1] += 1
counter[c] = 0
return total
def generate_sparse_data(shape,
num_points,
num_channels,
integer=False,
data_range=(-1, 1),
with_dense=True,
dtype=np.float32):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
num_points = np.array(num_points)
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(
np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
coors_total = coors_total.reshape(-1, ndim)
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
inds_total = np.pad(
inds_total, ((0, 0), (0, 1)), mode='constant', constant_values=i)
batch_indices.append(inds_total)
if integer:
sparse_data = np.random.randint(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
else:
sparse_data = np.random.uniform(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
res = {
'features': sparse_data.astype(dtype),
}
if with_dense:
dense_data = np.zeros([batch_size, num_channels, *dense_shape],
dtype=sparse_data.dtype)
start = 0
for i, inds in enumerate(batch_indices):
for j, ind in enumerate(inds):
dense_slice = (i, slice(None), *ind[:-1])
dense_data[dense_slice] = sparse_data[start + j]
start += len(inds)
res['features_dense'] = dense_data.astype(dtype)
batch_indices = np.concatenate(batch_indices, axis=0)
res['indices'] = batch_indices.astype(np.int32)
return res
# Copyright (c) OpenMMLab. All rights reserved.
from .scatter_points import DynamicScatter, dynamic_scatter
from .voxelize import Voxelization, voxelization
__all__ = ['Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn
from torch.autograd import Function
from .voxel_layer import (dynamic_point_to_voxel_backward,
dynamic_point_to_voxel_forward)
class _dynamic_scatter(Function):
@staticmethod
def forward(ctx, feats, coors, reduce_type='max'):
"""convert kitti points(N, >=3) to voxels.
Args:
feats: [N, C] float tensor. points features to be reduced
into voxels.
coors: [N, ndim] int tensor. corresponding voxel coordinates
(specifically multi-dim voxel index) of each points.
reduce_type: str. reduce op. support 'max', 'sum' and 'mean'
Returns:
tuple
voxel_feats: [M, C] float tensor. reduced features. input features
that shares the same voxel coordinates are reduced to one row
coordinates: [M, ndim] int tensor, voxel coordinates.
"""
results = dynamic_point_to_voxel_forward(feats, coors, reduce_type)
(voxel_feats, voxel_coors, point2voxel_map,
voxel_points_count) = results
ctx.reduce_type = reduce_type
ctx.save_for_backward(feats, voxel_feats, point2voxel_map,
voxel_points_count)
ctx.mark_non_differentiable(voxel_coors)
return voxel_feats, voxel_coors
@staticmethod
def backward(ctx, grad_voxel_feats, grad_voxel_coors=None):
(feats, voxel_feats, point2voxel_map,
voxel_points_count) = ctx.saved_tensors
grad_feats = torch.zeros_like(feats)
# TODO: whether to use index put or use cuda_backward
# To use index put, need point to voxel index
dynamic_point_to_voxel_backward(grad_feats,
grad_voxel_feats.contiguous(), feats,
voxel_feats, point2voxel_map,
voxel_points_count, ctx.reduce_type)
return grad_feats, None, None
dynamic_scatter = _dynamic_scatter.apply
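# Illustrative sketch added for clarity (not part of the original file):
# reducing point features that share a voxel coordinate with the
# dynamic_scatter function above. Requires the compiled voxel_layer
# extension and a CUDA device; all shapes and values are made up.
def _example_dynamic_scatter():
    feats = torch.randn(5, 4, device='cuda')              # 5 points, 4 channels
    coors = torch.tensor([[0, 0, 0], [0, 0, 0],           # points 0-1 share a voxel
                          [1, 1, 1], [1, 1, 1],           # points 2-3 share a voxel
                          [2, 2, 2]],
                         dtype=torch.int32, device='cuda')
    voxel_feats, voxel_coors = dynamic_scatter(feats, coors, 'max')
    return voxel_feats, voxel_coors                       # 3 voxels expected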
class DynamicScatter(nn.Module):
    def __init__(self, voxel_size, point_cloud_range, average_points: bool):
        """Scatters points into voxels, used in the voxel encoder with
        dynamic voxelization.

        **Note**: The CPU and GPU implementations give the same output, but
        they show a small numerical difference after summation and division
        (e.g., 5e-7).

        Args:
            voxel_size (list): list [x, y, z] of the size of each voxel in
                the three dimensions.
            point_cloud_range (list):
                [x_min, y_min, z_min, x_max, y_max, z_max].
            average_points (bool): whether to use avg pooling to scatter
                points into voxels.
        """
        super(DynamicScatter, self).__init__()
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.average_points = average_points
def forward_single(self, points, coors):
reduce = 'mean' if self.average_points else 'max'
return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce)
def forward(self, points, coors):
"""
Args:
input: NC points
"""
if coors.size(-1) == 3:
return self.forward_single(points, coors)
else:
batch_size = coors[-1, 0] + 1
voxels, voxel_coors = [], []
for i in range(batch_size):
inds = torch.where(coors[:, 0] == i)
voxel, voxel_coor = self.forward_single(
points[inds], coors[inds][:, 1:])
coor_pad = nn.functional.pad(
voxel_coor, (1, 0), mode='constant', value=i)
voxel_coors.append(coor_pad)
voxels.append(voxel)
features = torch.cat(voxels, dim=0)
feature_coors = torch.cat(voxel_coors, dim=0)
return features, feature_coors
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', average_points=' + str(self.average_points)
tmpstr += ')'
return tmpstr
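# Illustrative sketch added for clarity (not part of the original file):
# using the DynamicScatter module above for a single sample. The voxel_size
# and point_cloud_range values are made-up placeholders, and the compiled
# voxel_layer extension plus a CUDA device are required.
def _example_dynamic_scatter_module():
    scatter = DynamicScatter(
        voxel_size=[0.2, 0.2, 4],
        point_cloud_range=[0, -40, -3, 70.4, 40, 1],
        average_points=True)                       # 'mean' reduction
    points = torch.randn(100, 4, device='cuda')    # e.g. x, y, z, intensity
    coors = torch.randint(0, 10, (100, 3), device='cuda').int()
    voxel_feats, voxel_coors = scatter(points, coors)
    return voxel_feats, voxel_coors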
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
template <typename T_int>
void determin_max_points_kernel(
torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
int& max_points, const int num_points) {
int voxelidx, num;
for (int i = 0; i < num_points; ++i) {
if (coor[i][0] == -1) continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
point_to_voxelidx[i] = num;
num_points_per_voxel[voxelidx] += 1;
// update max points per voxel
max_points = std::max(max_points, num + 1);
}
return;
}
template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
const int num_points, const int NDim) {
for (int i = 0; i < num_points; ++i) {
int num = point_to_voxelidx[i];
int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
for (int k = 0; k < NDim; ++k) {
voxel_coors[voxelidx][k] = coor[i][k];
}
}
}
} // namespace
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
  // the current version takes about 0.02s~0.03s per frame on CPU
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
at::Tensor num_points_per_voxel = at::zeros(
{
num_points,
},
voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones(
{grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
int voxel_num = 0;
int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t, 2>(),
point_to_voxelidx.accessor<scalar_t, 1>(),
num_points_per_voxel.accessor<scalar_t, 1>(),
coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
num_points);
});
at::Tensor voxels =
at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors =
at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
point_to_voxelidx.accessor<int, 1>(),
coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
});
at::Tensor num_points_per_voxel_out =
num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out};
}
} // namespace voxelization
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = 512;
int const maxGridDim = 50000;
} // namespace
__device__ __forceinline__ static void reduceMax(float *address, float val) {
int *address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
old = atomicCAS(address_as_i, assumed,
__float_as_int(fmaxf(val, __int_as_float(assumed))));
} while (assumed != old || __int_as_float(old) < val);
}
__device__ __forceinline__ static void reduceMax(double *address, double val) {
unsigned long long *address_as_ull =
reinterpret_cast<unsigned long long *>(address);
unsigned long long old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(
address_as_ull, assumed,
__double_as_longlong(fmax(val, __longlong_as_double(assumed))));
} while (assumed != old || __longlong_as_double(old) < val);
}
// get rid of meaningless warnings when compiling host code
#ifdef __CUDA_ARCH__
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
#if (__CUDA_ARCH__ < 200)
#warning \
"compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
int *address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
old = atomicCAS(address_as_i, assumed,
__float_as_int(val + __int_as_float(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
#if (__CUDA_ARCH__ < 600)
#warning \
"compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
unsigned long long *address_as_ull =
reinterpret_cast<unsigned long long *>(address);
unsigned long long old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
#endif
template <typename T>
__global__ void
feats_reduce_kernel(const T *feats, const int32_t *coors_map,
T *reduced_feats, // shall be 0 at initialization
const int num_input, const int num_feats,
const reduce_t reduce_type) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
if (reduce_to == -1) continue;
const T *feats_offset = feats + x * num_feats;
T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;
if (reduce_type == reduce_t::MAX) {
for (int i = 0; i < num_feats; i++) {
reduceMax(&reduced_feats_offset[i], feats_offset[i]);
}
} else {
for (int i = 0; i < num_feats; i++) {
reduceAdd(&reduced_feats_offset[i], feats_offset[i]);
}
}
}
}
template <typename T>
__global__ void add_reduce_traceback_grad_kernel(
T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
const int32_t *reduce_count, const int num_input, const int num_feats,
const reduce_t reduce_type) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
if (reduce_to == -1) {
continue;
}
const int input_offset = x * num_feats;
T *grad_feats_offset = grad_feats + input_offset;
const int reduced_offset = reduce_to * num_feats;
const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
if (reduce_type == reduce_t::SUM) {
for (int i = 0; i < num_feats; i++) {
grad_feats_offset[i] = grad_reduced_feats_offset[i];
}
} else if (reduce_type == reduce_t::MEAN) {
for (int i = 0; i < num_feats; i++) {
grad_feats_offset[i] = grad_reduced_feats_offset[i] /
static_cast<T>(reduce_count[reduce_to]);
}
}
}
}
template <typename T>
__global__ void max_reduce_traceback_scatter_idx_kernel(
const T *feats, const T *reduced_feats, int32_t *reduce_from,
const int32_t *coors_map, const int num_input, const int num_feats) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
const int input_offset = x * num_feats;
const T *feats_offset = feats + input_offset;
if (reduce_to == -1) {
continue;
}
const int reduced_offset = reduce_to * num_feats;
const T *reduced_feats_offset = reduced_feats + reduced_offset;
int32_t *reduce_from_offset = reduce_from + reduced_offset;
for (int i = 0; i < num_feats; i++) {
if (feats_offset[i] == reduced_feats_offset[i]) {
atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));
}
}
}
}
template <typename T>
__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
const T *grad_reduced_feats,
const int32_t *reduce_from,
const int num_reduced,
const int num_feats) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_reduced;
x += gridDim.x * blockDim.x) {
const int reduced_offset = x * num_feats;
const int32_t *scatter_to_offset = reduce_from + reduced_offset;
const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
for (int i = 0; i < num_feats; i++) {
grad_feats[scatter_to_offset[i] * num_feats + i] =
grad_reduced_feats_offset[i];
}
}
}
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
const at::Tensor &feats, const at::Tensor &coors,
const reduce_t reduce_type) {
CHECK_INPUT(feats);
CHECK_INPUT(coors);
const int num_input = feats.size(0);
const int num_feats = feats.size(1);
if (num_input == 0)
return {feats.clone().detach(),
coors.clone().detach(),
coors.new_empty({0}, torch::kInt32),
coors.new_empty({0}, torch::kInt32)};
at::Tensor out_coors;
at::Tensor coors_map;
at::Tensor reduce_count;
auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);
std::tie(out_coors, coors_map, reduce_count) =
at::unique_dim(coors_clean, 0, true, true, true);
if (out_coors.index({0, 0}).lt(0).item<bool>()) {
    // the first element of out_coors is (-1, -1, -1) and should be removed
out_coors = out_coors.slice(0, 1);
reduce_count = reduce_count.slice(0, 1);
coors_map = coors_map - 1;
}
coors_map = coors_map.to(torch::kInt32);
reduce_count = reduce_count.to(torch::kInt32);
auto reduced_feats =
at::empty({out_coors.size(0), num_feats}, feats.options());
AT_DISPATCH_FLOATING_TYPES(
feats.scalar_type(), "feats_reduce_kernel", ([&] {
if (reduce_type == reduce_t::MAX)
reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
else
reduced_feats.fill_(static_cast<scalar_t>(0));
dim3 blocks(std::min(at::cuda::ATenCeilDiv(num_input, threadsPerBlock),
maxGridDim));
dim3 threads(threadsPerBlock);
feats_reduce_kernel<<<blocks, threads>>>(
feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
reduced_feats.data_ptr<scalar_t>(), num_input, num_feats, reduce_type);
if (reduce_type == reduce_t::MEAN)
reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
}));
AT_CUDA_CHECK(cudaGetLastError());
return {reduced_feats, out_coors, coors_map, reduce_count};
}
void dynamic_point_to_voxel_backward_gpu(at::Tensor &grad_feats,
const at::Tensor &grad_reduced_feats,
const at::Tensor &feats,
const at::Tensor &reduced_feats,
const at::Tensor &coors_map,
const at::Tensor &reduce_count,
const reduce_t reduce_type) {
CHECK_INPUT(grad_feats);
CHECK_INPUT(grad_reduced_feats);
CHECK_INPUT(feats);
CHECK_INPUT(reduced_feats);
CHECK_INPUT(coors_map);
CHECK_INPUT(reduce_count);
const int num_input = feats.size(0);
const int num_reduced = reduced_feats.size(0);
const int num_feats = feats.size(1);
grad_feats.fill_(0);
// copy voxel grad to points
if (num_input == 0 || num_reduced == 0) return;
if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
add_reduce_traceback_grad_kernel<<<blocks, threads>>>(
grad_feats.data_ptr<scalar_t>(),
grad_reduced_feats.data_ptr<scalar_t>(),
coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
num_input, num_feats, reduce_type);
}));
AT_CUDA_CHECK(cudaGetLastError());
} else {
auto reduce_from = at::full({num_reduced, num_feats}, num_input,
coors_map.options().dtype(torch::kInt32));
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(),
"max_reduce_traceback_scatter_idx_kernel", ([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
max_reduce_traceback_scatter_idx_kernel<<<blocks, threads>>>(
feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
num_input, num_feats);
}));
AT_CUDA_CHECK(cudaGetLastError());
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(),
"max_reduce_traceback_scatter_idx_kernel", ([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
max_reduce_scatter_grad_kernel<<<blocks, threads>>>(
grad_feats.data_ptr<scalar_t>(),
grad_reduced_feats.data_ptr<scalar_t>(),
reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
return;
}
} // namespace voxelization
#include <torch/extension.h>
#include "voxelization.h"
namespace voxelization {
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hard_voxelize", &hard_voxelize, "hard voxelize");
m.def("dynamic_voxelize", &dynamic_voxelize, "dynamic voxelization");
m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward, "dynamic point to voxel forward");
m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward, "dynamic point to voxel backward");
}
} // namespace voxelization