Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
#include "MPSLibrary.h"
#include <c10/util/CallOnce.h>
#include "MPSDevice.h"
static std::unique_ptr<MPSLibraryManager> mps_library_manager;
static c10::once_flag mpsdev_init;
MPSLibraryManager* MPSLibraryManager::getInstance() {
c10::call_once(mpsdev_init, [] {
mps_library_manager = std::unique_ptr<MPSLibraryManager>(new MPSLibraryManager());
});
return mps_library_manager.get();
}
MPSLibraryManager::~MPSLibraryManager() {}
MPSLibraryManager::MPSLibraryManager() {}
bool MPSLibraryManager::hasLibrary(const std::string& name) {
return _library_map.find(name) != _library_map.end();
}
MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) {
if (_library_map.find(library_url) != _library_map.end()) {
return _library_map[library_url].get();
}
_library_map.emplace(std::make_pair(
library_url, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromUrl(library_url))));
return _library_map[library_url].get();
}
MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name,
const std::string& source) {
  NSString* ns_name = [NSString stringWithUTF8String:name.c_str()];
if (_library_map.find(name) != _library_map.end()) {
NSLog(@"Library %@ already exist.", ns_name);
return nullptr;
}
_library_map.emplace(
std::make_pair(name, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromSource(source))));
return _library_map[name].get();
}
MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) {
MPSLibrary* library = new MPSLibrary();
@autoreleasepool {
NSError* error = nil;
// load library and func
    NSString* url_str = [NSString stringWithUTF8String:library_url.c_str()];
    NSURL* metal_url = [NSURL fileURLWithPath:url_str];
library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url
error:&error];
if (library->_library == nil) {
NSLog(@"Failed to find library, error %@.", error);
exit(1);
}
}
return library;
}
MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) {
MPSLibrary* library = new MPSLibrary();
@autoreleasepool {
NSError* error = nil;
// load library and func
    NSString* code_str = [NSString stringWithUTF8String:sources.c_str()];
library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str
options:nil
error:&error];
if (library->_library == nil) {
NSLog(@"Failed to find library, error %@.", error);
exit(1);
}
}
return library;
}
MPSLibrary::~MPSLibrary() {
[_library release];
_library = nil;
}
MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) {
if (_pso_map.find(function_name) != _pso_map.end()) {
return _pso_map[function_name];
}
MTLComputePipelineState_t pso;
@autoreleasepool {
NSError* error = nil;
// create function
    NSString* function_name_str = [NSString stringWithUTF8String:function_name.c_str()];
    id<MTLFunction> func = [_library newFunctionWithName:function_name_str];
    if (func == nil) {
      NSLog(@"Failed to create function %@.", function_name_str);
      exit(1);
    }
// create pipeline
    pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func
                                                                                      error:&error];
    if (pso == nil) {
      NSLog(@"Failed to create pipeline state object, error %@.", error);
      exit(1);
    }
    _pso_map.emplace(std::make_pair(function_name, pso));
}
return _pso_map[function_name];
}
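// Usage sketch (illustrative only; the library path and kernel name below are
// placeholders, not part of this commit):
//   MPSLibrary* lib =
//       MPSLibraryManager::getInstance()->getLibrary("/path/to/kernels.metallib");
//   MTLComputePipelineState_t pso = lib->getComputePipelineState("my_kernel");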
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h
#pragma once
#include <cstdint>
#include <utility>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
#include "MPSDevice.h"
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
typedef id<MTLCommandQueue> MTLCommandQueue_t;
typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
typedef id<MTLSharedEvent> MTLSharedEvent_t;
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLCommandQueue_t;
typedef void* MTLCommandQueue;
typedef void* MTLCommandBuffer_t;
typedef void* MTLCommandBuffer;
typedef void* MTLSharedEvent_t;
typedef void* dispatch_queue_t;
typedef void* MTLDevice_t;
#define nil NULL
#endif
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSStream
//-----------------------------------------------------------------
class TORCH_API MPSStream {
public:
enum Unchecked { UNCHECKED };
/// Construct a MPSStream from a Stream. This construction is checked,
/// and will raise an error if the Stream is not, in fact, a MPS stream.
explicit MPSStream(Stream stream);
~MPSStream();
MTLCommandQueue_t commandQueue() const { return _commandQueue; };
dispatch_queue_t queue() const { return _serialQueue; }
MTLCommandBuffer_t commandBuffer();
void commit(bool flush);
void commitAndWait();
void synchronize();
void flush();
/// Get the MPS device index that this stream is associated with.
c10::DeviceIndex device_index() const { return _stream.device_index(); }
MTLCommandQueue_t stream() const { return _commandQueue; };
MTLDevice_t device() const { return [_commandQueue device]; }
/// Explicit conversion to Stream.
Stream unwrap() const { return _stream; }
private:
Stream _stream;
MTLCommandQueue_t _commandQueue = nil;
MTLCommandBuffer_t _commandBuffer = nil;
void _flush(bool commitAndWait) const;
dispatch_queue_t _serialQueue = nullptr;
};
/**
* Get the current MPS stream
*/
TORCH_API MPSStream* getCurrentMPSStream();
/**
* Get the default MPS stream
*/
TORCH_API MPSStream* getDefaultMPSStream();
//-----------------------------------------------------------------
// MPSStreamImpl
//-----------------------------------------------------------------
class TORCH_API MPSStreamImpl {
public:
/**
* Gets single instance of the MPSStream.
*/
static MPSStream* getInstance();
private:
static MPSStream* _stream;
MPSStreamImpl();
};
//-----------------------------------------------------------------
// MPSEvent
//-----------------------------------------------------------------
struct TORCH_API MPSEvent {
MPSEvent();
// MPSEvent(id<MTLDevice> device);
~MPSEvent();
MTLSharedEvent_t event() const { return _event; }
void recordEvent(MPSStream* stream);
void waitForEvent(MPSStream* queue); // waits on the cpu
bool queryEvent();
uint64_t getCurrentValue() { return _currentValue; }
void setCurrentValue(uint64_t currValue) { _currentValue = currValue; }
private:
bool _isRecorded = false;
uint64_t _currentValue = 0;
MTLSharedEvent_t _event;
};
typedef MPSEvent* mpsEvent_t;
} // namespace mps
} // namespace at
#ifndef _MPS_UTILS_H_
#define _MPS_UTILS_H_
#include <torch/extension.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLBuffer> MTLBuffer_t;
typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
#else
typedef void* MTLBuffer;
typedef void* MTLBuffer_t;
typedef void* MTLComputeCommandEncoder;
typedef void* MTLComputeCommandEncoder_t;
#endif
// utils
static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) {
return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data());
}
template <typename T,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t);
template <typename T,
std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
[encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index];
}
template <typename T, std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
[encoder setBytes:&t length:sizeof(t) atIndex:index];
}
inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {}
template <typename T, typename... Args>
void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) {
setMTLArg(encoder, index, std::forward<T>(t));
setMTLArgsImpl(encoder, index + 1, std::forward<Args>(args)...);
}
template <typename... Args>
void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) {
[encoder setComputePipelineState:pso];
setMTLArgsImpl(encoder, 0, std::forward<Args>(args)...);
}
#endif
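// Usage sketch (illustrative only; `encoder`, `pso`, `input`, `output` and `n`
// are placeholders). setMTLArgs binds the pipeline state and then each
// argument in order: at::Tensor arguments go through setBuffer (via
// getMTLBufferStorage), trivially copyable values go through setBytes.
//   setMTLArgs(encoder, pso, input, output, n);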
#ifndef PYTORCH_CPP_HELPER
#define PYTORCH_CPP_HELPER
#include <torch/extension.h>
#include <torch/types.h>
#include <vector>
using namespace at;
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_MLU(x) \
TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor")
#define CHECK_CPU(x) \
  TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
#define CHECK_MLU_INPUT(x) \
CHECK_MLU(x); \
CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) \
CHECK_CPU(x); \
CHECK_CONTIGUOUS(x)
......
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef PYTORCH_MLU_HELPER_HPP_
#define PYTORCH_MLU_HELPER_HPP_
#ifdef MMCV_WITH_MLU
#include "aten.h"
#define NFU_ALIGN_SIZE 128
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))
#endif
#endif // PYTORCH_MLU_HELPER_HPP_
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class scalar_t>
int getTotalSize(std::vector<scalar_t> arg) {
return arg.size();
}
template <class scalar_t, class... TArgs>
int getTotalSize(std::vector<scalar_t> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename scalar_t>
int getSize(std::vector<scalar_t> arg) {
return arg.size();
}
template <int Idx, class TT, class scalar_t>
void assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class scalar_t, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
#endif
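// Usage sketch (not part of the original header): paramsGrid builds the
// Cartesian product of its argument vectors, with the first argument varying
// slowest. The values below are illustrative only.
inline void params_grid_example() {
  std::vector<int> channels = {16, 32};
  std::vector<float> scales = {0.5f, 1.0f};
  // grid holds 4 tuples: (16, 0.5), (16, 1.0), (32, 0.5), (32, 1.0).
  std::vector<std::tuple<int, float>> grid = paramsGrid(channels, scales);
  (void)grid;
}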
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base {
using yes = char;
using no = yes[2];
};
template <typename T>
struct has_const_iterator : private sfinae_base {
private:
template <typename C>
static yes &test(typename C::const_iterator *);
template <typename C>
static no &test(...);
public:
static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
using type = T;
};
template <typename T>
struct has_begin_end : private sfinae_base {
private:
template <typename C>
static yes &
f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
const>(&C::begin)),
typename C::const_iterator (C::*)() const>::value>::type *);
template <typename C>
static no &f(...);
template <typename C>
static yes &g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (
C::*)() const>(&C::end)),
typename C::const_iterator (C::*)() const>::value,
void>::type *);
template <typename C>
static no &g(...);
public:
static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};
} // namespace detail
// Holds the delimiter values for a specific character type
template <typename TChar>
struct delimiters_values {
using char_type = TChar;
const char_type *prefix;
const char_type *delimiter;
const char_type *postfix;
};
// Defines the delimiter values for a specific container and character type
template <typename T, typename TChar>
struct delimiters {
using type = delimiters_values<TChar>;
static const type values;
};
// Functor to print containers. You can use this directly if you want
// to specify a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T, typename TChar = char,
typename TCharTraits = ::std::char_traits<TChar>,
typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
using delimiters_type = TDelimiters;
using ostream_type = std::basic_ostream<TChar, TCharTraits>;
template <typename U>
struct printer {
static void print_body(const U &c, ostream_type &stream) {
using std::begin;
using std::end;
auto it = begin(c);
const auto the_end = end(c);
if (it != the_end) {
for (;;) {
stream << *it;
if (++it == the_end) break;
if (delimiters_type::values.delimiter != NULL)
stream << delimiters_type::values.delimiter;
}
}
}
};
print_container_helper(const T &container) : container_(container) {}
inline void operator()(ostream_type &stream) const {
if (delimiters_type::values.prefix != NULL)
stream << delimiters_type::values.prefix;
printer<T>::print_body(container_, stream);
if (delimiters_type::values.postfix != NULL)
stream << delimiters_type::values.postfix;
}
private:
const T &container_;
};
// Specialization for pairs
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
template <std::size_t I>
struct Int {};
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
static void tuple_print(
const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
template <std::size_t N>
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
// Prints a print_container_helper to the specified stream.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
helper(stream);
return stream;
}
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types
template <typename T>
struct is_container
: public std::integral_constant<bool,
detail::has_const_iterator<T>::value &&
detail::has_begin_end<T>::beg_value &&
detail::has_begin_end<T>::end_value> {};
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};
template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters
template <typename T>
struct delimiters<T, char> {
static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
L"]"};
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple
template <typename T1, typename T2>
struct delimiters<std::pair<T1, T2>, char> {
static const delimiters_values<char> values;
};
template <typename T1, typename T2>
const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
"(", ", ", ")"};
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T1, typename T2>
const delimiters_values<wchar_t>
delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
template <typename... Args>
struct delimiters<std::tuple<Args...>, char> {
static const delimiters_values<char> values;
};
template <typename... Args>
const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
"(", ", ", ")"};
template <typename... Args>
struct delimiters<::std::tuple<Args...>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename... Args>
const delimiters_values<wchar_t>
delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
struct custom_delims_base {
virtual ~custom_delims_base() {}
virtual std::ostream &stream(::std::ostream &) = 0;
virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T &t;
};
template <typename Delims>
struct custom_delims {
template <typename Container>
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T>
struct array_wrapper_n {
typedef const T *const_iterator;
typedef T value_type;
array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
inline const_iterator begin() const { return _array; }
inline const_iterator end() const { return _array + _n; }
private:
const T *const _array;
size_t _n;
};
// A wrapper for hash-table based containers that offer local iterators to each
// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket
// 5 of container m.)
template <typename T>
struct bucket_print_wrapper {
typedef typename T::const_local_iterator const_iterator;
typedef typename T::size_type size_type;
const_iterator begin() const { return m_map.cbegin(n); }
const_iterator end() const { return m_map.cend(n); }
bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
private:
const T &m_map;
const size_type n;
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
size_t n) {
return pretty_print::array_wrapper_n<T>(a, n);
}
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
typename T::size_type n) {
return pretty_print::bucket_print_wrapper<T>(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Prints a container to the stream using default delimiters
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
return stream
<< ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
container);
}
} // namespace std
#endif // H_PRETTY_PRINT
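// Usage sketch (not part of the original header): once this header is
// included, operator<< works for standard containers and tuples, e.g.
//   std::set<int> s = {1, 2, 3};
//   std::cout << s << std::endl;                 // prints {1, 2, 3}
//   std::pair<int, const char*> p = {1, "one"};
//   std::cout << p << std::endl;                 // prints (1, one)
// Default delimiters are "[", ", ", "]" for generic containers, braces for
// sets, and parentheses for pairs and tuples.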
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/embed.h>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <spconv/tensorview/tensorview.h>
#include <algorithm>
#include <iostream>
namespace py = pybind11;
template <typename scalar_t, typename TPyObject>
std::vector<scalar_t> array2Vector(TPyObject arr) {
py::array arr_np = arr;
size_t size = arr.attr("size").template cast<size_t>();
py::array_t<scalar_t> arr_cc = arr_np;
std::vector<scalar_t> data(arr_cc.data(), arr_cc.data() + size);
return data;
}
template <typename scalar_t>
std::vector<scalar_t> arrayT2Vector(py::array_t<scalar_t> arr) {
std::vector<scalar_t> data(arr.data(), arr.data() + arr.size());
return data;
}
template <typename scalar_t, typename TPyObject>
tv::TensorView<scalar_t> array2TensorView(TPyObject arr) {
py::array arr_np = arr;
py::array_t<scalar_t> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i) {
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<scalar_t>(arr_cc.mutable_data(), shape);
}
template <typename scalar_t>
tv::TensorView<scalar_t> arrayT2TensorView(py::array_t<scalar_t> arr) {
tv::Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return tv::TensorView<scalar_t>(arr.mutable_data(), shape);
}
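// Usage sketch (illustrative only; `arr` stands for any py::array_t<float>):
//   std::vector<float> v = arrayT2Vector<float>(arr);            // copies the data
//   tv::TensorView<float> view = arrayT2TensorView<float>(arr);  // wraps it, no copy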
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <utils/spconv/tensorview/tensorview.h>
#include <iostream>
#include <limits>
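// Enumerates the output positions that a single input position can contribute
// to under a strided, padded, dilated sparse convolution. For each valid
// position it writes NDim spatial coordinates plus the flattened kernel offset
// into `out` (NDim + 1 values per point) and returns the number of points.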
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize,
const Index *stride, const Index *padding,
const Index *dilation,
const Index *outSpatialShape, Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
stride[i] + padding[i]) /
stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
const Index *input_pos, const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation, const Index *outSpatialShape,
Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = input_pos[i] * stride[i] - padding[i];
uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
}
offset += m * (val - lowers[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
gridsOut[index] = j;
}
for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
if (gridsOut[index] > -1) {
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
}
return numActIn;
}
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
} // namespace functor
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor {
void operator()(const Device& d, tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size);
};
} // namespace functor
#endif
#ifndef MP_HELPER_H_
#define MP_HELPER_H_
#include <type_traits>
#include <utility>
template <class... T>
struct mp_list {};
template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;
namespace detail {
template <class... T, class F>
constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
}
template <class F>
constexpr F mp_for_each_impl(mp_list<>, F &&f) {
return std::forward<F>(f);
}
} // namespace detail
namespace detail {
template <class A, template <class...> class B>
struct mp_rename_impl {
// An error "no type named 'type'" here means that the first argument to
// mp_rename is not a list
};
template <template <class...> class A, class... T, template <class...> class B>
struct mp_rename_impl<A<T...>, B> {
using type = B<T...>;
};
} // namespace detail
template <class A, template <class...> class B>
using mp_rename = typename ::detail::mp_rename_impl<A, B>::type;
template <class L, class F>
constexpr F mp_for_each(F &&f) {
return ::detail::mp_for_each_impl(mp_rename<L, mp_list>(),
std::forward<F>(f));
}
#endif
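// Usage sketch (not part of the original header): mp_for_each invokes a
// callable once per type in an mp_list, which is handy for compile-time
// dispatch over a fixed set of index types.
inline void mp_for_each_example() {
  mp_for_each<mp_list<int, long>>([](auto t) {
    using T = decltype(t);
    (void)sizeof(T);  // T is int on the first call, long on the second
  });
}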
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <math.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <algorithm>
#include <iostream>
namespace py = pybind11;
using namespace pybind11::literals;
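// Scatters points into a dense voxel grid defined by voxel_size and
// coors_range. At most max_points points are kept per voxel and at most
// max_voxels voxels are created; coor_to_voxelidx acts as a scratch lookup
// table and is reset before returning. Returns the number of voxels produced.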
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
py::array_t<DType> voxels,
py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
height_rw(voxelidx, k) =
std::min(points_rw(i, k), height_rw(voxelidx, k));
maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
for (int k = 0; k < num_features; ++k) {
height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
}
}
return voxel_num;
}
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
py::array_t<DType> height, py::array_t<DType> maxs,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_voxels, DType eps) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mask_rw = mask.mutable_unchecked<1>();
  auto height_rw = height.template mutable_unchecked<1>();
auto maxs_rw = maxs.template mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
}
height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
}
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps) {
      mask_rw(i) = 0;
    }
  }
  return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask, py::array_t<DType> mins,
py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
}
return voxel_num;
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename scalar_t, typename Index>
struct SparseGatherFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename scalar_t, typename Index>
struct SparseScatterAddFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> out_features,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size,
bool stable = false);
};
} // namespace functor
#endif
#pragma once
namespace tv {
namespace detail {
template <typename scalar_t>
class KernelLoop {
struct Iterator {
__forceinline__ __device__ Iterator(scalar_t index, scalar_t delta)
: index_(index), delta_(delta) {}
__forceinline__ __device__ scalar_t operator*() const { return index_; }
__forceinline__ __device__ Iterator &operator++() {
index_ += delta_;
return *this;
}
__forceinline__ __device__ bool operator!=(const Iterator &other) const {
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
if (!other.delta_) {
return less;
}
if (!delta_) {
return greater;
}
return less || greater;
}
private:
scalar_t index_;
const scalar_t delta_;
};
public:
__forceinline__ __device__ KernelLoop(scalar_t begin, scalar_t delta,
scalar_t end)
: begin_(begin), delta_(delta), end_(end) {}
__forceinline__ __device__ Iterator begin() const {
return Iterator{begin_, delta_};
}
__forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
private:
scalar_t begin_;
scalar_t delta_;
scalar_t end_;
};
} // namespace detail
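// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }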
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopX(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.x * blockDim.x + threadIdx.x,
gridDim.x * blockDim.x * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopY(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.y * blockDim.y + threadIdx.y,
gridDim.y * blockDim.y * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopZ(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.z * blockDim.z + threadIdx.z,
gridDim.z * blockDim.z * NumILP, count);
}
} // namespace tv
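// --- Illustrative kernel sketch (editor's addition, not part of the original
// sources). Shows the intended use of the grid-stride helpers above; the
// kernel name fill_kernel and its parameters are hypothetical.
template <typename scalar_t>
__global__ void fill_kernel(scalar_t *data, scalar_t value, int count) {
  // Each thread starts at its global x index and advances by the total number
  // of threads in the grid until count is reached.
  for (int i : tv::KernelLoopX<int>(count)) {
    data[i] = value;
  }
}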
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv {
namespace launch {
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) {
return (a + b - 1) / b;
}
constexpr int CUDA_NUM_THREADS = 1024;
inline int getBlocks(const int N) {
TV_ASSERT_RT_ERR(N > 0,
"CUDA kernel launch blocks must be positive, but got N=", N);
return DivUp(N, CUDA_NUM_THREADS);
}
} // namespace launch
} // namespace tv
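// --- Illustrative launch sketch (editor's addition, not part of the original
// sources). Assuming a grid-stride kernel such as the fill_kernel sketched
// above, getBlocks() picks enough blocks of CUDA_NUM_THREADS threads to cover
// n elements (it asserts n > 0).
template <typename scalar_t>
void fill_device_buffer(scalar_t *d_ptr, scalar_t value, int n) {
  fill_kernel<scalar_t><<<tv::launch::getBlocks(n),
                          tv::launch::CUDA_NUM_THREADS>>>(d_ptr, value, n);
  TV_CHECK_CUDA_ERR();  // throws std::runtime_error if the launch failed
}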
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <sstream>
#include <type_traits>
#include <vector>
#include "pytorch_cpp_helper.hpp"
namespace tv {
#ifdef __NVCC__
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
assert(expr); \
}
template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
ss << val;
}
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
ss << val << " ";
sstream_print(ss, args...);
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
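// Usage sketch (editor's note): TV_ASSERT_RT_ERR(n > 0, "n must be positive, got", n)
// throws std::runtime_error carrying file/line, the stringified expression, and
// the space-separated message assembled by tv::sstream_print.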
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
struct CPU {};
#define TV_MAX_DIM 6
template <typename scalar_t, size_t MaxDim = TV_MAX_DIM>
struct SimpleVector {
public:
TV_HOST_DEVICE_INLINE SimpleVector(){};
TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<scalar_t> q) {
TV_ASSERT(q.size() <= MaxDim);
mSize = 0;
for (scalar_t s : q) {
mArray[mSize++] = s;
}
mSize = q.size();
}
SimpleVector(const std::vector<scalar_t> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE SimpleVector(
const SimpleVector<scalar_t, MaxDim> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE const scalar_t &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE void push_back(scalar_t s) {
#ifdef TV_DEBUG
TV_ASSERT(mSize < MaxDim);
#endif
mArray[mSize] = s;
mSize++;
}
TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
TV_ASSERT(mSize > 0);
#endif
mSize--;
}
TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mArray; }
  TV_HOST_DEVICE_INLINE bool empty() const { return mSize == 0; }
typedef size_t size_type;
class iterator {
public:
typedef iterator self_type;
typedef scalar_t value_type;
typedef scalar_t &reference;
typedef scalar_t *pointer;
typedef std::forward_iterator_tag iterator_category;
typedef std::ptrdiff_t difference_type;
TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
class const_iterator {
public:
typedef const_iterator self_type;
typedef scalar_t value_type;
typedef const scalar_t &reference;
typedef const scalar_t *pointer;
typedef std::ptrdiff_t difference_type;
typedef std::forward_iterator_tag iterator_category;
TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
TV_HOST_DEVICE_INLINE const_iterator begin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator end() const {
return const_iterator(mArray + mSize);
}
TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator cend() const {
return const_iterator(mArray + mSize);
}
protected:
scalar_t mArray[MaxDim];
size_t mSize = 0;
};
template <typename scalar_t, size_t MaxDim>
bool operator==(const SimpleVector<scalar_t, MaxDim> &lfs,
const SimpleVector<scalar_t, MaxDim> &rfs) {
if (lfs.size() != rfs.size()) return false;
for (size_t i = 0; i < lfs.size(); ++i) {
if (lfs[i] != rfs[i]) return false;
}
return true;
}
template <typename scalar_t, size_t MaxDim>
bool operator!=(const SimpleVector<scalar_t, MaxDim> &lfs,
const SimpleVector<scalar_t, MaxDim> &rfs) {
return !(lfs == rfs);
}
struct Slice {
template <class... Integers>
TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must have at most 3 elements");
SimpleVector<int, 3> slices{int(ints)...};
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
for (size_t i = 0; i < slices.size(); ++i) {
mSlices[i] = slices[i];
}
}
TV_HOST_DEVICE_INLINE Slice() {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
}
template <typename scalar_t>
TV_HOST_DEVICE_INLINE Slice(std::initializer_list<scalar_t> slice) {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
TV_ASSERT(slice.size() <= 3);
int idx = 0;
for (scalar_t s : slice) {
mSlices[idx] = int(s);
++idx;
}
}
TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
protected:
int mSlices[3];
};
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {}
template <typename scalar_t, template <class...> class Container>
ShapeBase(Container<scalar_t> shape) : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
: SimpleVector<int, MaxDim>(shape) {}
ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end < this->mSize && end > start);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < this->mSize; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE size_t size() const {
if (this->mSize == 0) return 0;
size_t s = 1;
for (int i = 0; i < int(this->mSize); ++i) {
s *= this->mArray[i];
}
return s;
}
TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
};
using Shape = ShapeBase<TV_MAX_DIM>;
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
std::vector<int> &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = shape.size() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
const Shape &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
const Index *shape) {
unsigned offset = 0;
unsigned m = 1;
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
offset += m * indexes[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
const Index *shape) {
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
output[i] = index % shape[i];
index -= output[i];
index /= shape[i];
}
return index;
}
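// Worked example (editor's note, not part of the original sources): for the
// row-major shape {2, 3, 4}, rowArrayIdx maps the index tuple (1, 2, 3) to the
// flat offset 1*3*4 + 2*4 + 3 = 23, and rowArrayIdxInv(23, out, shape)
// recovers out = {1, 2, 3}.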
template <int N>
struct ArrayIndexRowMajor {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return indexes[N - 1] +
shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
}
};
template <>
struct ArrayIndexRowMajor<0> {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return 0;
}
};
namespace detail {
template <typename scalar_t>
constexpr const char *simpleTypeName(scalar_t val = scalar_t());
template <>
constexpr const char *simpleTypeName(float val) {
return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
return "uint64";
}
}; // namespace detail
template <typename scalar_t, int Rank = -1>
struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
template <class... Integers>
explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Integers... shapes)
: mPtr(ptr) {
mShape = {int(shapes)...};
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
const TensorView<scalar_t, Rank> &tensor) {
TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
"\n");
scalar_t *ptr = mPtr;
const scalar_t *other_ptr = tensor.data();
for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
return *this;
}
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
std::initializer_list<T1> seq) {
TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
"\n");
scalar_t *ptr = mPtr;
for (const T1 &s : seq) *(ptr++) = scalar_t(s);
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE scalar_t &operator()(Inds... inds) {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
template <class... Inds>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(Inds... inds) const {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
TV_HOST_DEVICE_INLINE scalar_t &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
TV_HOST_DEVICE_INLINE const scalar_t &operator()() const {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
template <class T1>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
template <class T1>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3,
T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> operator[](
SimpleVector<Slice> slice_vec) {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE const TensorView<scalar_t, Rank> operator[](
SimpleVector<Slice> slice_vec) const {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
TV_HOST_DEVICE_INLINE scalar_t *data() { return mPtr; }
TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mPtr; }
TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Inds... newShapes) {
Shape shapes{int(newShapes)...};
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Shape shapes) {
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(
Inds... newShapes) const {
Shape shapes{int(newShapes)...};
for (size_t i = 0; i < shapes.ndim(); ++i) {
if (shapes[i] == -1) {
shapes[i] = 1;
shapes[i] = size() / shapes.size();
break;
}
}
TV_ASSERT(shapes.size() == size());
return TensorView<scalar_t, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(Shape shapes) const {
TV_ASSERT(shapes.size() == size());
return TensorView<scalar_t, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze() const {
return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze());
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze(int dim) const {
return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze(dim));
}
TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
template <class... Slices>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
Slice slice, Slices... slices) const {
return subview<float, Slice, Slices...>(slice, slices...);
}
template <class T2 = float, class... Slices>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
Slices... slices) const {
Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
Shape new_shape{to_slice(slices)[0]...};
Shape start{to_slice(slices)[0]...};
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1;
}
}
auto offset = rowArrayIdx(mShape, start);
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(int id,
Integers... ints) {
Shape start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<scalar_t, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(sizeof...(ints) + 1));
}
std::string repr() const {
std::ostringstream ss;
if (empty()) return "";
if (mShape.ndim() == 0) {
ss << *mPtr;
ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
return ss.str();
}
Shape counter = mShape;
auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
ss << "[";
}
for (size_t i = 0; i < this->size(); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != this->size() - 1) ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
if (i != this->size() - 1) {
if (inc_count != 0) ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
}
}
}
ss << "]";
ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
return ss.str();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> _subview(
SimpleVector<Slice> slice_vec) {
Shape new_shape;
for (int i = 0; i < slice_vec.size(); ++i) {
new_shape.push_back(slice_vec[i][0]);
}
Shape start = new_shape;
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
}
template <typename T1>
TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
scalar_t *mPtr = nullptr;
Shape mShape;
};
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<scalar_t, Rank> &dt) {
os << dt.repr();
return os;
}
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<const scalar_t, Rank> &dt) {
os << dt.repr();
return os;
}
namespace detail {
template <typename scalar_t>
constexpr const char *printfTypeFormat(scalar_t val = scalar_t());
template <>
constexpr const char *printfTypeFormat(float val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(int val) {
return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
return "%u";
}
template <>
constexpr const char *printfTypeFormat(long val) {
return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
return "%lu";
}
}; // namespace detail
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const TensorView<scalar_t> tensor,
const char *format) {
if (tensor.empty()) return;
if (tensor.ndim() == 0) {
printf(format, tensor());
printf("\n");
return;
}
Shape counter = tensor.shape();
auto tensor_flat = tensor.view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
printf("[");
}
for (size_t i = 0; i < tensor.size(); ++i) {
printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == tensor.dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != tensor.size() - 1) printf(", ");
for (int j = 0; j < inc_count; ++j) {
printf("]");
}
if (i != tensor.size() - 1) {
if (inc_count != 0) printf("\n");
for (int j = 0; j < inc_count; ++j) {
printf("[");
}
}
}
printf("]\n");
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(TensorView<scalar_t> tensor) {
using Traw = typename std::remove_const<scalar_t>::type;
return printTensorView(tensor, detail::printfTypeFormat<Traw>());
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape) {
using Traw = typename std::remove_const<scalar_t>::type;
return printTensorView(TensorView<const scalar_t>(ptr, shape),
detail::printfTypeFormat<Traw>());
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape,
const char *format) {
return printTensorView(TensorView<const scalar_t>(ptr, shape), format);
}
} // namespace tv
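// --- Illustrative TensorView sketch (editor's addition, not part of the
// original sources). Wraps a plain host array in a 2x3 row-major view,
// writes through operator(), and prints the contents with printTensorView.
inline void tensorview_demo() {
  float data[6] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
  tv::TensorView<float> view(data, 2, 3);  // no copy, just pointer + shape
  view(1, 2) = 42.f;                       // writes data[1 * 3 + 2]
  tv::printTensorView(view);               // prints the 2x3 contents with %.2f
}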
@@ -10,6 +10,7 @@
#include "reduce_ops.h"
#include "roi_align.h"
#include "roi_align_rotated.h"
#include "rotated_feature_align.h"
#include "soft_nms.h"
const char *c_MMCVOpDomain = "mmcv";
@@ -17,6 +18,7 @@ SoftNmsOp c_SoftNmsOp;
NmsOp c_NmsOp;
MMCVRoiAlignCustomOp c_MMCVRoiAlignCustomOp;
MMCVRoIAlignRotatedCustomOp c_MMCVRoIAlignRotatedCustomOp;
MMCVRotatedFeatureAlignCustomOp c_MMCVRotatedFeatureAlignCustomOp;
GridSampleOp c_GridSampleOp;
MMCVCumMaxCustomOp c_MMCVCumMaxCustomOp;
MMCVCumMinCustomOp c_MMCVCumMinCustomOp;
@@ -77,5 +79,10 @@ OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(
domain, &c_MMCVRotatedFeatureAlignCustomOp)) {
return status;
}
return ortApi->AddCustomOpDomain(options, domain);
}
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "rotated_feature_align.h"
#include "../ort_mmcv_utils.h"
template <typename T>
T bilinear_interpolate(const T *input, const int height, const int width, T y,
T x, const int index /* index for debug only*/) {
  // deal with cases where the sampled location falls outside the feature map
  // boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[int(fma(y_low, width, x_low))];
T v2 = input[int(fma(y_low, width, x_high))];
T v3 = input[int(fma(y_high, width, x_low))];
T v4 = input[int(fma(y_high, width, x_high))];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
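// Worked example (editor's note, not part of the original sources): for a
// sample at (y, x) = (1.25, 2.5), the four neighbours are rows 1/2 and
// columns 2/3, with ly = 0.25, lx = 0.5, hy = 0.75, hx = 0.5, giving weights
// w1 = 0.375, w2 = 0.375, w3 = 0.125, w4 = 0.125 (they always sum to 1).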
template <typename scalar_t>
void rotated_feature_align_forward_cpu_kernel(
const int nthreads, const int points, const scalar_t *bottom_data,
const scalar_t *best_bboxes, const scalar_t spatial_scale,
const int channels, const int height, const int width, scalar_t *top_data) {
for (int index = 0; index < nthreads; index++) {
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
const scalar_t *bbox_offset =
best_bboxes + ((n * height + h) * width + w) * 5;
scalar_t roi_y = bbox_offset[0] * spatial_scale;
scalar_t roi_x = bbox_offset[1] * spatial_scale;
scalar_t px[5] = {roi_x, 0, 0, 0, 0};
scalar_t py[5] = {roi_y, 0, 0, 0, 0};
if (points > 1) {
scalar_t roi_w = bbox_offset[2] * spatial_scale;
scalar_t roi_h = bbox_offset[3] * spatial_scale;
scalar_t roi_a = bbox_offset[4];
scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
scalar_t wx = cosa * w_2, wy = sina * w_2;
scalar_t hx = -sina * h_2, hy = cosa * h_2;
px[1] = roi_x + wx + hx;
py[1] = roi_y + wy + hy;
px[2] = roi_x - wx + hx;
py[2] = roi_y - wy + hy;
px[3] = roi_x - wx - hx;
py[3] = roi_y - wy - hy;
px[4] = roi_x + wx - hx;
py[4] = roi_y + wy - hy;
}
const scalar_t *offset_bottom_data =
bottom_data + (n * channels + c) * height * width;
scalar_t output_val = bottom_data[index];
for (int i = 0; i < points; i++) {
output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
width, py[i], px[i], i);
}
top_data[index] = output_val;
}
}
void MMCVRotatedFeatureAlignKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_features = ort_.KernelContext_GetInput(context, 0);
const float *features_data = reinterpret_cast<const float *>(
ort_.GetTensorData<float>(input_features));
const OrtValue *input_best_rbboxes = ort_.KernelContext_GetInput(context, 1);
const float *best_rbboxes = reinterpret_cast<const float *>(
      ort_.GetTensorData<float>(input_best_rbboxes));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_features);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// TODO: forward here
int output_size = out_dimensions.data()[0];
  for (size_t i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
rotated_feature_align_forward_cpu_kernel<float>(
output_size, points_, features_data, best_rbboxes, spatial_scale_,
input_channels, input_height, input_width, out);
}
#ifndef ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#define ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#include <onnxruntime_cxx_api.h>
#include <cmath>
struct MMCVRotatedFeatureAlignKernel {
public:
MMCVRotatedFeatureAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
points_ = ort_.KernelInfoGetAttribute<int64_t>(info, "points");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
float spatial_scale_;
int points_;
};
struct MMCVRotatedFeatureAlignCustomOp
: Ort::CustomOpBase<MMCVRotatedFeatureAlignCustomOp,
MMCVRotatedFeatureAlignKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRotatedFeatureAlignKernel(api, info);
}
const char* GetName() const { return "MMCVRotatedFeatureAlign"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H