Commit 6f3c5f1c authored by limm
support v1.4.0
parent 6f674c7e
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef PYTORCH_MLU_HELPER_HPP_
#define PYTORCH_MLU_HELPER_HPP_
#ifdef MMCV_WITH_MLU
#include "aten.h"
#define NFU_ALIGN_SIZE 128
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))
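// Worked examples (illustrative, not part of the original header):
//   PAD_UP(100, NFU_ALIGN_SIZE)   == 128  // round 100 up to a multiple of 128
//   PAD_DOWN(300, NFU_ALIGN_SIZE) == 256  // round 300 down to a multiple of 128
//   CEIL_DIV(10, 3)               == 4    // ceil(10 / 3)
//   CEIL_ALIGN(10, 8)             == 16   // smallest multiple of 8 that is >= 10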
inline int32_t getJobLimitCapability() {
CNcontext drv_ctx;
TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), "cnCtxGetCurrent fails");
CNctxConfigParam ctx_conf_param;
TORCH_CHECK(
CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT,
&ctx_conf_param),
"cnGetCtxConfigParam fails.");
return (int32_t)ctx_conf_param.unionLimit;
}
inline int32_t getCoreNumOfJobLimitCapability() {
switch (getJobLimitCapability()) {
default:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) *
getJobLimitCapability();
case CN_KERNEL_CLASS_BLOCK:
return 1;
case CN_KERNEL_CLASS_UNION:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
case CN_KERNEL_CLASS_UNION2:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2;
case CN_KERNEL_CLASS_UNION4:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4;
case CN_KERNEL_CLASS_UNION8:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8;
case CN_KERNEL_CLASS_UNION16:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16;
}
}
#endif // MMCV_WITH_MLU
#endif // PYTORCH_MLU_HELPER_HPP_
/******************************************************************************
* Copyright (c) 2022 Huawei Technologies Co., Ltd
* All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/BSD-3-Clause
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef PYTORCH_NPU_HELPER_HPP_
#define PYTORCH_NPU_HELPER_HPP_
#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>
#include <torch_npu/csrc/framework/utils/OpAdapter.h>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#define NPU_NAME_SPACE at_npu::native
#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value)
#define CHECK_NPU(x) \
TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor")
#endif // PYTORCH_NPU_HELPER_HPP_
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class scalar_t>
int getTotalSize(std::vector<scalar_t> arg) {
return arg.size();
}
template <class scalar_t, class... TArgs>
int getTotalSize(std::vector<scalar_t> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename scalar_t>
int getSize(std::vector<scalar_t> arg) {
return arg.size();
}
template <int Idx, class TT, class scalar_t>
void assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class scalar_t, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
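// Usage sketch (illustrative, not part of the original header): paramsGrid
// returns the Cartesian product of its argument vectors as a vector of tuples,
// with the last argument varying fastest.
//   auto grid = paramsGrid(std::vector<int>{1, 2}, std::vector<float>{0.5f, 1.0f});
//   // grid.size() == 4; grid[0] == std::make_tuple(1, 0.5f), ...,
//   // grid[3] == std::make_tuple(2, 1.0f)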
#endif
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
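//
// Example (illustrative):
//   std::vector<int> v{1, 2, 3};
//   std::cout << v << std::endl;          // prints [1, 2, 3]
//   std::set<int> s{1, 2, 3};
//   std::cout << s << std::endl;          // prints {1, 2, 3}
//   std::cout << std::make_pair(1, 2.5);  // prints (1, 2.5)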
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base {
using yes = char;
using no = yes[2];
};
template <typename T>
struct has_const_iterator : private sfinae_base {
private:
template <typename C>
static yes &test(typename C::const_iterator *);
template <typename C>
static no &test(...);
public:
static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
using type = T;
};
template <typename T>
struct has_begin_end : private sfinae_base {
private:
template <typename C>
static yes &
f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
const>(&C::begin)),
typename C::const_iterator (C::*)() const>::value>::type *);
template <typename C>
static no &f(...);
template <typename C>
static yes &g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (
C::*)() const>(&C::end)),
typename C::const_iterator (C::*)() const>::value,
void>::type *);
template <typename C>
static no &g(...);
public:
static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};
} // namespace detail
// Holds the delimiter values for a specific character type
template <typename TChar>
struct delimiters_values {
using char_type = TChar;
const char_type *prefix;
const char_type *delimiter;
const char_type *postfix;
};
// Defines the delimiter values for a specific container and character type
template <typename T, typename TChar>
struct delimiters {
using type = delimiters_values<TChar>;
static const type values;
};
// Functor to print containers. You can use this directly if you want
// to specify a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T, typename TChar = char,
typename TCharTraits = ::std::char_traits<TChar>,
typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
using delimiters_type = TDelimiters;
using ostream_type = std::basic_ostream<TChar, TCharTraits>;
template <typename U>
struct printer {
static void print_body(const U &c, ostream_type &stream) {
using std::begin;
using std::end;
auto it = begin(c);
const auto the_end = end(c);
if (it != the_end) {
for (;;) {
stream << *it;
if (++it == the_end) break;
if (delimiters_type::values.delimiter != NULL)
stream << delimiters_type::values.delimiter;
}
}
}
};
print_container_helper(const T &container) : container_(container) {}
inline void operator()(ostream_type &stream) const {
if (delimiters_type::values.prefix != NULL)
stream << delimiters_type::values.prefix;
printer<T>::print_body(container_, stream);
if (delimiters_type::values.postfix != NULL)
stream << delimiters_type::values.postfix;
}
private:
const T &container_;
};
// Specialization for pairs
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
template <std::size_t I>
struct Int {};
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
static void tuple_print(
const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
template <std::size_t N>
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
// Prints a print_container_helper to the specified stream.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
helper(stream);
return stream;
}
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types
template <typename T>
struct is_container
: public std::integral_constant<bool,
detail::has_const_iterator<T>::value &&
detail::has_begin_end<T>::beg_value &&
detail::has_begin_end<T>::end_value> {};
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};
template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters
template <typename T>
struct delimiters<T, char> {
static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
L"]"};
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple
template <typename T1, typename T2>
struct delimiters<std::pair<T1, T2>, char> {
static const delimiters_values<char> values;
};
template <typename T1, typename T2>
const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
"(", ", ", ")"};
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T1, typename T2>
const delimiters_values<wchar_t>
delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
template <typename... Args>
struct delimiters<std::tuple<Args...>, char> {
static const delimiters_values<char> values;
};
template <typename... Args>
const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
"(", ", ", ")"};
template <typename... Args>
struct delimiters<::std::tuple<Args...>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename... Args>
const delimiters_values<wchar_t>
delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
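// Example (illustrative sketch): a user-supplied delimiter set that prints a
// container as "<a; b; c>". MyDelims only needs a static `values` member of
// type delimiters_values<char>.
//   struct MyDelims { static const pretty_print::delimiters_values<char> values; };
//   const pretty_print::delimiters_values<char> MyDelims::values = {"<", "; ", ">"};
//   std::vector<int> v{1, 2, 3};
//   std::cout << pretty_print::custom_delims<MyDelims>(v) << std::endl;  // <1; 2; 3>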
struct custom_delims_base {
virtual ~custom_delims_base() {}
virtual std::ostream &stream(::std::ostream &) = 0;
virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T &t;
};
template <typename Delims>
struct custom_delims {
template <typename Container>
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T>
struct array_wrapper_n {
typedef const T *const_iterator;
typedef T value_type;
array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
inline const_iterator begin() const { return _array; }
inline const_iterator end() const { return _array + _n; }
private:
const T *const _array;
size_t _n;
};
// A wrapper for hash-table based containers that offer local iterators to
// each bucket. Usage: std::cout << bucket_print(m, 4) << std::endl;
// (Prints the contents of bucket 4 of container m.)
template <typename T>
struct bucket_print_wrapper {
typedef typename T::const_local_iterator const_iterator;
typedef typename T::size_type size_type;
const_iterator begin() const { return m_map.cbegin(n); }
const_iterator end() const { return m_map.cend(n); }
bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
private:
const T &m_map;
const size_type n;
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
size_t n) {
return pretty_print::array_wrapper_n<T>(a, n);
}
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
typename T::size_type n) {
return pretty_print::bucket_print_wrapper<T>(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Prints a container to the stream using default delimiters
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
return stream
<< ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
container);
}
} // namespace std
#endif // H_PRETTY_PRINT
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/embed.h>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <spconv/tensorview/tensorview.h>
#include <algorithm>
#include <iostream>
namespace py = pybind11;
template <typename scalar_t, typename TPyObject>
std::vector<scalar_t> array2Vector(TPyObject arr) {
py::array arr_np = arr;
size_t size = arr.attr("size").template cast<size_t>();
py::array_t<scalar_t> arr_cc = arr_np;
std::vector<scalar_t> data(arr_cc.data(), arr_cc.data() + size);
return data;
}
template <typename scalar_t>
std::vector<scalar_t> arrayT2Vector(py::array_t<scalar_t> arr) {
std::vector<scalar_t> data(arr.data(), arr.data() + arr.size());
return data;
}
template <typename scalar_t, typename TPyObject>
tv::TensorView<scalar_t> array2TensorView(TPyObject arr) {
py::array arr_np = arr;
py::array_t<scalar_t> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i) {
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<scalar_t>(arr_cc.mutable_data(), shape);
}
template <typename scalar_t>
tv::TensorView<scalar_t> arrayT2TensorView(py::array_t<scalar_t> arr) {
tv::Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return tv::TensorView<scalar_t>(arr.mutable_data(), shape);
}
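// Usage sketch (illustrative): converting a NumPy array passed from Python
// into the std::vector / tv::TensorView forms consumed by the sparse-conv
// kernels. The function name below is hypothetical.
//   void my_op(py::array_t<int> indices, py::array_t<float> features) {
//     auto indices_vec  = arrayT2Vector<int>(indices);        // std::vector<int>
//     auto feature_view = arrayT2TensorView<float>(features); // tv::TensorView<float>
//     // ... pass indices_vec / feature_view on to the kernel ...
//   }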
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <utils/spconv/tensorview/tensorview.h>
#include <iostream>
#include <limits>
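// getValidOutPos enumerates, for one input coordinate, every output position a
// sparse convolution kernel element can map it to (getValidOutPosTranspose
// below handles the transposed case). Per dimension the candidate outputs o
// lie in
//   lower = ceil((x + p - (k - 1) * d) / s)  <=  o  <=  upper = floor((x + p) / s),
// where x is the input coordinate, k the kernel size, s the stride, p the
// padding and d the dilation. Worked 1-D example (k = 3, s = 2, p = 1, d = 1,
// x = 5, outSpatialShape = 8): lower = 2, upper = 3, so the candidates o = 2
// and o = 3 are written to `out` (each row holds NDim coordinates plus the
// flattened kernel offset) and the function returns 2.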
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize,
const Index *stride, const Index *padding,
const Index *dilation,
const Index *outSpatialShape, Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
stride[i] + padding[i]) /
stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
const Index *input_pos, const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation, const Index *outSpatialShape,
Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
lowers[i] = input_pos[i] * stride[i] - padding[i];
uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
}
offset += m * (val - lowers[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
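// getIndicePairsConv / getIndicePairsDeConv build the input->output index
// pairs consumed by the sparse convolution gather/scatter kernels. gridsOut is
// a dense (batch * spatialVolume) lookup table that must be pre-filled with -1;
// it maps an output coordinate to its row in indicesOut and deduplicates
// outputs reached from different inputs. indicePairs has layout
// [kernelVolume, 2, numAct] and indiceNum[k] counts the pairs collected for
// kernel offset k. The return value is the number of active output sites.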
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
gridsOut[index] = j;
}
for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
if (gridsOut[index] > -1) {
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
}
return numActIn;
}
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
} // namespace functor
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor {
void operator()(const Device& d, tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size);
};
} // namespace functor
#endif
#ifndef MP_HELPER_H_
#define MP_HELPER_H_
#include <type_traits>
#include <utility>
template <class... T>
struct mp_list {};
template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;
namespace detail {
template <class... T, class F>
constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
}
template <class F>
constexpr F mp_for_each_impl(mp_list<>, F &&f) {
return std::forward<F>(f);
}
} // namespace detail
namespace detail {
template <class A, template <class...> class B>
struct mp_rename_impl {
// An error "no type named 'type'" here means that the first argument to
// mp_rename is not a list
};
template <template <class...> class A, class... T, template <class...> class B>
struct mp_rename_impl<A<T...>, B> {
using type = B<T...>;
};
} // namespace detail
template <class A, template <class...> class B>
using mp_rename = typename ::detail::mp_rename_impl<A, B>::type;
template <class L, class F>
constexpr F mp_for_each(F &&f) {
return ::detail::mp_for_each_impl(mp_rename<L, mp_list>(),
std::forward<F>(f));
}
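// Usage sketch (illustrative): iterate over a compile-time list of integral
// constants.
//   mp_for_each<mp_list_c<int, 1, 2, 4>>([](auto I) {
//     constexpr int v = decltype(I)::value;  // 1, then 2, then 4
//     // ... use v as a compile-time constant ...
//   });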
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <math.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <algorithm>
#include <iostream>
namespace py = pybind11;
using namespace pybind11::literals;
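// The points_to_voxel_3d_* helpers below scatter an [N, num_features] point
// cloud into fixed-size voxels. Conventions shared by all of them (summarized
// from the code for readability):
//   * coors_range holds [x_min, y_min, z_min, x_max, y_max, z_max]; grid_size
//     is its extent divided by voxel_size, rounded to the nearest integer.
//   * voxel coordinates are stored reversed (coor = [z, y, x]) via
//     coor[ndim_minus_1 - j].
//   * coor_to_voxelidx must be pre-filled with -1 and is reset to -1 for every
//     touched voxel before returning, so the buffer can be reused.
//   * points beyond max_points per voxel and voxels beyond max_voxels are
//     silently dropped; the return value is the number of voxels produced.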
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
py::array_t<DType> voxels,
py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
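// Incremental (running) mean per voxel:
//   new_mean = old_mean + (x - old_mean) / n,
// where n is the point count after this insertion.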
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
height_rw(voxelidx, k) =
std::min(points_rw(i, k), height_rw(voxelidx, k));
maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
for (int k = 0; k < num_features; ++k) {
height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
}
}
return voxel_num;
}
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
py::array_t<DType> height, py::array_t<DType> maxs,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_voxels, DType eps) {
auto points_rw = points.template mutable_unchecked<2>();
auto mask_rw = mask.mutable_unchecked<1>();
auto height_rw = height.template mutable_unchecked<1>();
auto maxs_rw = maxs.template mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
}
height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
}
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
// height_rw / maxs_rw are 1-D per-voxel min / max of the z-coordinate.
if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps) {
mask_rw(i) = 0;
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask, py::array_t<DType> mins,
py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
}
return voxel_num;
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename scalar_t, typename Index>
struct SparseGatherFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename scalar_t, typename Index>
struct SparseScatterAddFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> out_features,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size,
bool stable = false);
};
} // namespace functor
#endif
#pragma once
namespace tv {
namespace detail {
template <typename scalar_t>
class KernelLoop {
struct Iterator {
__forceinline__ __device__ Iterator(scalar_t index, scalar_t delta)
: index_(index), delta_(delta) {}
__forceinline__ __device__ scalar_t operator*() const { return index_; }
__forceinline__ __device__ Iterator &operator++() {
index_ += delta_;
return *this;
}
__forceinline__ __device__ bool operator!=(const Iterator &other) const {
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
if (!other.delta_) {
return less;
}
if (!delta_) {
return greater;
}
return less || greater;
}
private:
scalar_t index_;
const scalar_t delta_;
};
public:
__forceinline__ __device__ KernelLoop(scalar_t begin, scalar_t delta,
scalar_t end)
: begin_(begin), delta_(delta), end_(end) {}
__forceinline__ __device__ Iterator begin() const {
return Iterator{begin_, delta_};
}
__forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
private:
scalar_t begin_;
scalar_t delta_;
scalar_t end_;
};
} // namespace detail
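// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }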
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopX(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.x * blockDim.x + threadIdx.x,
gridDim.x * blockDim.x * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopY(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.y * blockDim.y + threadIdx.y,
gridDim.y * blockDim.y * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopZ(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.z * blockDim.z + threadIdx.z,
gridDim.z * blockDim.z * NumILP, count);
}
} // namespace tv
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv {
namespace launch {
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) {
return (a + b - 1) / b;
}
constexpr int CUDA_NUM_THREADS = 1024;
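// Typical launch pattern (illustrative sketch, not part of the original
// header):
//   kernel<<<launch::getBlocks(N), launch::CUDA_NUM_THREADS, 0, stream>>>(...);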
inline int getBlocks(const int N) {
TV_ASSERT_RT_ERR(N > 0,
"CUDA kernel launch blocks must be positive, but got N=", N);
return DivUp(N, CUDA_NUM_THREADS);
}
} // namespace launch
} // namespace tv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <sstream>
#include <type_traits>
#include <vector>
#include "pytorch_cpp_helper.hpp"
namespace tv {
#if defined(__NVCC__) || defined(__HIP__)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
assert(expr); \
}
template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
ss << val;
}
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
ss << val << " ";
sstream_print(ss, args...);
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
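// Example (illustrative):
//   TV_ASSERT_RT_ERR(ndim <= TV_MAX_DIM,
//                    "expected ndim <=", TV_MAX_DIM, "but got", ndim);
// throws std::runtime_error carrying the file/line, the failed expression and
// the space-separated message arguments.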
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
struct CPU {};
#define TV_MAX_DIM 6
template <typename scalar_t, size_t MaxDim = TV_MAX_DIM>
struct SimpleVector {
public:
TV_HOST_DEVICE_INLINE SimpleVector(){};
TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<scalar_t> q) {
TV_ASSERT(q.size() <= MaxDim);
mSize = 0;
for (scalar_t s : q) {
mArray[mSize++] = s;
}
mSize = q.size();
}
SimpleVector(const std::vector<scalar_t> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE SimpleVector(
const SimpleVector<scalar_t, MaxDim> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE const scalar_t &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE void push_back(scalar_t s) {
#ifdef TV_DEBUG
TV_ASSERT(mSize < MaxDim);
#endif
mArray[mSize] = s;
mSize++;
}
TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
TV_ASSERT(mSize > 0);
#endif
mSize--;
}
TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mArray; }
TV_HOST_DEVICE_INLINE size_t empty() const { return mSize == 0; }
typedef size_t size_type;
class iterator {
public:
typedef iterator self_type;
typedef scalar_t value_type;
typedef scalar_t &reference;
typedef scalar_t *pointer;
typedef std::forward_iterator_tag iterator_category;
typedef std::ptrdiff_t difference_type;
TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
class const_iterator {
public:
typedef const_iterator self_type;
typedef scalar_t value_type;
typedef const scalar_t &reference;
typedef const scalar_t *pointer;
typedef std::ptrdiff_t difference_type;
typedef std::forward_iterator_tag iterator_category;
TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
TV_HOST_DEVICE_INLINE const_iterator begin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator end() const {
return const_iterator(mArray + mSize);
}
TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator cend() const {
return const_iterator(mArray + mSize);
}
protected:
scalar_t mArray[MaxDim];
size_t mSize = 0;
};
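// Usage sketch (illustrative only, not part of the original sources):
// SimpleVector is a fixed-capacity, host/device-compatible stand-in for
// std::vector.
//   tv::SimpleVector<int, 4> v{1, 2, 3};
//   v.push_back(4);                  // holds {1, 2, 3, 4}, size() == 4
//   for (int x : v) { /* iterates over 1, 2, 3, 4 */ }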
template <typename scalar_t, size_t MaxDim>
bool operator==(const SimpleVector<scalar_t, MaxDim> &lfs,
const SimpleVector<scalar_t, MaxDim> &rfs) {
if (lfs.size() != rfs.size()) return false;
for (size_t i = 0; i < lfs.size(); ++i) {
if (lfs[i] != rfs[i]) return false;
}
return true;
}
template <typename scalar_t, size_t MaxDim>
bool operator!=(const SimpleVector<scalar_t, MaxDim> &lfs,
const SimpleVector<scalar_t, MaxDim> &rfs) {
return !(lfs == rfs);
}
struct Slice {
template <class... Integers>
TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must contain at most 3 values");
SimpleVector<int, 3> slices{int(ints)...};
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
for (size_t i = 0; i < slices.size(); ++i) {
mSlices[i] = slices[i];
}
}
TV_HOST_DEVICE_INLINE Slice() {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
}
template <typename scalar_t>
TV_HOST_DEVICE_INLINE Slice(std::initializer_list<scalar_t> slice) {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
TV_ASSERT(slice.size() <= 3);
int idx = 0;
for (scalar_t s : slice) {
mSlices[idx] = int(s);
++idx;
}
}
TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
protected:
int mSlices[3];
};
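// Note (added for clarity, not in the original sources): a Slice holds up to
// three integers, with -1 meaning "unset". TensorView::subview() below reads
// only the first two as (start, end); an unset end selects a single element
// and drops that dimension. The third slot is not used in this file.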
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {}
template <typename scalar_t, template <class...> class Container>
ShapeBase(Container<scalar_t> shape) : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
: SimpleVector<int, MaxDim>(shape) {}
ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end < this->mSize && end > start);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < this->mSize; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE size_t size() const {
if (this->mSize == 0) return 0;
size_t s = 1;
for (int i = 0; i < int(this->mSize); ++i) {
s *= this->mArray[i];
}
return s;
}
TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
};
using Shape = ShapeBase<TV_MAX_DIM>;
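// Note (added for clarity, not in the original sources): unlike SimpleVector,
// Shape::size() returns the product of the dimensions (the element count),
// while ndim() returns the rank, e.g. Shape{2, 3}.size() == 6, ndim() == 2.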
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
std::vector<int> &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = shape.size() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
const Shape &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
const Index *shape) {
unsigned offset = 0;
unsigned m = 1;
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
offset += m * indexes[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
const Index *shape) {
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
output[i] = index % shape[i];
index -= output[i];
index /= shape[i];
}
return index;
}
template <int N>
struct ArrayIndexRowMajor {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return indexes[N - 1] +
shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
}
};
template <>
struct ArrayIndexRowMajor<0> {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return 0;
}
};
namespace detail {
template <typename scalar_t>
constexpr const char *simpleTypeName(scalar_t val = scalar_t());
template <>
constexpr const char *simpleTypeName(float val) {
return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
return "uint64";
}
}; // namespace detail
template <typename scalar_t, int Rank = -1>
struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
template <class... Integers>
explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Integers... shapes)
: mPtr(ptr) {
mShape = {int(shapes)...};
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
const TensorView<scalar_t, Rank> &tensor) {
TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
"\n");
scalar_t *ptr = mPtr;
const scalar_t *other_ptr = tensor.data();
for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
return *this;
}
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
std::initializer_list<T1> seq) {
TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
"\n");
scalar_t *ptr = mPtr;
for (const T1 &s : seq) *(ptr++) = scalar_t(s);
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE scalar_t &operator()(Inds... inds) {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
template <class... Inds>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(Inds... inds) const {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
TV_HOST_DEVICE_INLINE scalar_t &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
TV_HOST_DEVICE_INLINE const scalar_t &operator()() const {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
template <class T1>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
template <class T1>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3,
T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> operator[](
SimpleVector<Slice> slice_vec) {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE const TensorView<scalar_t, Rank> operator[](
SimpleVector<Slice> slice_vec) const {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
TV_HOST_DEVICE_INLINE scalar_t *data() { return mPtr; }
TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mPtr; }
TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Inds... newShapes) {
Shape shapes{int(newShapes)...};
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Shape shapes) {
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(
Inds... newShapes) const {
Shape shapes{int(newShapes)...};
for (size_t i = 0; i < shapes.ndim(); ++i) {
if (shapes[i] == -1) {
shapes[i] = 1;
shapes[i] = size() / shapes.size();
break;
}
}
TV_ASSERT(shapes.size() == size());
return TensorView<scalar_t, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(Shape shapes) const {
TV_ASSERT(shapes.size() == size());
return TensorView<scalar_t, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze() const {
return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze());
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze(int dim) const {
return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze(dim));
}
TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
template <class... Slices>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
Slice slice, Slices... slices) const {
return subview<float, Slice, Slices...>(slice, slices...);
}
template <class T2 = float, class... Slices>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
Slices... slices) const {
Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
Shape new_shape{to_slice(slices)[0]...};
Shape start{to_slice(slices)[0]...};
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1;
}
}
auto offset = rowArrayIdx(mShape, start);
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(int id,
Integers... ints) {
Shape start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<scalar_t, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(sizeof...(ints) + 1));
}
std::string repr() const {
std::ostringstream ss;
if (empty()) return "";
if (mShape.ndim() == 0) {
ss << *mPtr;
ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
return ss.str();
}
Shape counter = mShape;
auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
ss << "[";
}
for (size_t i = 0; i < this->size(); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != this->size() - 1) ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
if (i != this->size() - 1) {
if (inc_count != 0) ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
}
}
}
ss << "]";
ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
return ss.str();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> _subview(
      SimpleVector<Slice> slice_vec) const {
Shape new_shape;
for (int i = 0; i < slice_vec.size(); ++i) {
new_shape.push_back(slice_vec[i][0]);
}
Shape start = new_shape;
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
}
template <typename T1>
TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
scalar_t *mPtr = nullptr;
Shape mShape;
};
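// Usage sketch (illustrative only, not part of the original sources):
// TensorView is a non-owning, shape-annotated view over a raw pointer with
// row-major indexing.
//   float buf[6] = {0, 1, 2, 3, 4, 5};
//   tv::TensorView<float> view(buf, 2, 3);
//   float v = view(1, 2);        // buf[1 * 3 + 2] == 5
//   auto row = view.subview(1);  // 1-D view over buf + 3 with shape (3)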
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<scalar_t, Rank> &dt) {
os << dt.repr();
return os;
}
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<const scalar_t, Rank> &dt) {
os << dt.repr();
return os;
}
namespace detail {
template <typename scalar_t>
constexpr const char *printfTypeFormat(scalar_t val = scalar_t());
template <>
constexpr const char *printfTypeFormat(float val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(int val) {
return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
return "%u";
}
template <>
constexpr const char *printfTypeFormat(long val) {
return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
return "%lu";
}
}; // namespace detail
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const TensorView<scalar_t> tensor,
const char *format) {
if (tensor.empty()) return;
if (tensor.ndim() == 0) {
printf(format, tensor());
printf("\n");
return;
}
Shape counter = tensor.shape();
auto tensor_flat = tensor.view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
printf("[");
}
for (size_t i = 0; i < tensor.size(); ++i) {
printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == tensor.dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != tensor.size() - 1) printf(", ");
for (int j = 0; j < inc_count; ++j) {
printf("]");
}
if (i != tensor.size() - 1) {
if (inc_count != 0) printf("\n");
for (int j = 0; j < inc_count; ++j) {
printf("[");
}
}
}
printf("]\n");
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(TensorView<scalar_t> tensor) {
using Traw = typename std::remove_const<scalar_t>::type;
return printTensorView(tensor, detail::printfTypeFormat<Traw>());
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape) {
using Traw = typename std::remove_const<scalar_t>::type;
return printTensorView(TensorView<const scalar_t>(ptr, shape),
detail::printfTypeFormat<Traw>());
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape,
const char *format) {
return printTensorView(TensorView<const scalar_t>(ptr, shape), format);
}
} // namespace tv
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_CORNER_POOL_H
#define ONNXRUNTIME_CORNER_POOL_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
struct MMCVCornerPoolKernel {
public:
MMCVCornerPoolKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "mode");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int64_t mode_;
};
struct MMCVCornerPoolCustomOp
: Ort::CustomOpBase<MMCVCornerPoolCustomOp, MMCVCornerPoolKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCornerPoolKernel(api, info);
}
const char* GetName() const { return "MMCVCornerPool"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_CORNER_POOL_H
// Copyright (c) OpenMMLab. All rights reserved
#include "corner_pool.h"
#include "../ort_mmcv_utils.h"
void TopPoolForwardCPU(const float *input, float *output, const int batch_size,
const int channels, const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int w = 0; w < width; w++) {
        // directly copy the bottommost value from input to output
output[index_n_c + (height - 1) * width + w] =
input[index_n_c + (height - 1) * width + w];
// do top_pool
for (int h = height - 2; h >= 0; h--) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + (h + 1) * width + w],
input[index_n_c + h * width + w]);
} // for h
} // for w
} // for c
} // for n
}
void BottomPoolForwardCPU(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int w = 0; w < width; w++) {
        // directly copy the topmost value from input to output
        output[index_n_c + w] = input[index_n_c + w];
        // do bottom_pool
for (int h = 1; h < height; h++) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + (h - 1) * width + w],
input[index_n_c + h * width + w]);
} // for h
} // for w
} // for c
} // for n
}
void LeftPoolForwardCPU(const float *input, float *output, const int batch_size,
const int channels, const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int h = 0; h < height; h++) {
        // directly copy the rightmost value from input to output
output[index_n_c + h * width + width - 1] =
input[index_n_c + h * width + width - 1];
// do left_pool
for (int w = width - 2; w >= 0; w--) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + h * width + w + 1],
input[index_n_c + h * width + w]);
} // for w
} // for h
} // for c
} // for n
}
void RightPoolForwardCPU(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int h = 0; h < height; h++) {
        // directly copy the leftmost value from input to output
output[index_n_c + h * width] = input[index_n_c + h * width];
// do right_pool
for (int w = 1; w < width; w++) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + h * width + w - 1],
input[index_n_c + h * width + w]);
} // for w
} // for h
} // for c
} // for n
}
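// Note (added for clarity, not in the original sources): each of the four
// pools is a directional cumulative maximum along one axis. For TopPool on the
// column [1, 3, 2, 4] (top to bottom) the bottom-up scan yields [4, 4, 4, 4];
// for [5, 3, 2, 4] it yields [5, 4, 4, 4].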
void MMCVCornerPoolKernel::Compute(OrtKernelContext *context) {
const int mode = int(mode_);
typedef float T;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const T *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(input));
// get output memory
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
T *output_data = ort_.GetTensorMutableData<T>(output);
// 'top': 0, 'bottom': 1, 'left': 2, 'right':3
assert(mode == 0 || mode == 1 || mode == 2 || mode == 3);
// do corner_pool
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
if (mode == 0)
TopPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else if (mode == 1)
BottomPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else if (mode == 2)
LeftPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else
RightPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "deform_conv.h"
#include <cmath>
#include <vector>
#include "../ort_mmcv_utils.h"
void gemm_ref_fp32_deform(const float *A, const float *B, const float *V,
const float *H, const int32_t trans_A,
const int32_t trans_B, const int32_t M,
const int32_t N, const int32_t K, const float alpha,
const float beta, float *Y) {
if (!trans_A && !trans_B) { // MK, KN; NN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && !trans_B) { // KM, KN; TN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && trans_B) { // KM, NK; TT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (!trans_A && trans_B) { // MK, NK; NT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
}
float bilinear_interpolate(const float *src, const int64_t src_h,
const int64_t src_w, const float h, const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}
int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;
float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1)
v4 = src[h_high * src_w + w_high];
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
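// Worked example (added for clarity, not in the original sources): for
// h = 1.25, w = 2.5 the four neighbours are rows 1/2 and columns 2/3 with
// lh = 0.25, lw = 0.5, so the weights are w1 = 0.375, w2 = 0.375, w3 = 0.125,
// w4 = 0.125 and the result is 0.375 * (v1 + v2) + 0.125 * (v3 + v4).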
void deformable_im2col(const float *input, const float *offset,
const int64_t src_h, const int64_t src_w,
const int64_t kernel_h, const int64_t kernel_w,
const int64_t pad_h, const int64_t pad_w,
const int64_t stride_h, const int64_t stride_w,
const int64_t dilation_h, const int64_t dilation_w,
const int64_t channels, const int64_t offset_groups,
const int64_t dst_h, const int64_t dst_w,
float *columns) {
const int64_t indices = channels * dst_h * dst_w;
for (int64_t index = 0; index != indices; ++index) {
const int64_t w_col = index % dst_w;
const int64_t h_col = (index / dst_w) % dst_h;
const int64_t c_im = index / (dst_w * dst_h);
const int64_t c_col = c_im * kernel_h * kernel_w;
int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = c_im / c_per_offset_grp;
auto columns_ptr =
columns + (c_col * (dst_h * dst_w) + h_col * dst_w + w_col);
auto input_ptr = input + c_im * (src_h * src_w);
auto offset_ptr =
offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int data_offset_h_ptr =
((2 * (kh * kernel_w + kw)) * dst_h + h_col) * dst_w + w_col;
const int data_offset_w_ptr =
((2 * (kh * kernel_w + kw) + 1) * dst_h + h_col) * dst_w + w_col;
const float offset_h = offset_ptr[data_offset_h_ptr];
const float offset_w = offset_ptr[data_offset_w_ptr];
const float ih =
(h_col * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw =
(w_col * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = bilinear_interpolate(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
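// Note (added for clarity, not in the original sources): columns is laid out
// as a (channels * kernel_h * kernel_w) x (dst_h * dst_w) matrix, so the GEMM
// in deformable_conv_forward below multiplies the
// (oc_per_gp) x (ic_per_gp * kernel_h * kernel_w) filter block by columns to
// produce one group's (oc_per_gp) x (dst_h * dst_w) output.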
void deformable_conv_forward(
const float *src, const float *offset, const float *filter,
const int64_t batch, const int64_t src_c, const int64_t src_h,
const int64_t src_w, const int64_t dst_c, const int64_t dst_h,
const int64_t dst_w, const int64_t group, const int64_t offset_group,
const int64_t channels, const int64_t num_output, const int64_t kernel_h,
const int64_t kernel_w, const int64_t stride_h, const int64_t stride_w,
const int64_t pad_h, const int64_t pad_w, const int64_t dilation_h,
const int64_t dilation_w, float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
src_h, src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
columns);
float *dst_ptr =
dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
      memset(dst_ptr, 0, sizeof(float) * oc_per_gp * dst_h * dst_w);
gemm_ref_fp32_deform(
filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns,
nullptr, dst_ptr, 0, 0, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr);
}
}
}
MMCVDeformConvKernel::MMCVDeformConvKernel(OrtApi api,
const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
std::vector<int64_t> stride =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 2);
const float *filter_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch_size = input_dims[0];
int64_t in_channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t out_channels = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
int64_t out_height = floor((in_height + 2 * padding_height -
dilation_height * (kernel_height - 1) - 1) /
stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
stride_width +
1);
std::vector<int64_t> output_dims = {batch_size, out_channels, out_height,
out_width};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
int64_t column_len = (in_channels / group) * kernel_height * kernel_width *
out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv_forward(
input_data, offset_data, filter_data, batch_size, in_channels, in_height,
in_width, out_channels, out_height, out_width, group, deformable_group,
in_channels, out_channels, kernel_height, kernel_width, stride_height,
stride_width, padding_height, padding_width, dilation_height,
dilation_width, columns, out_ptr);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <cmath>
#include "../ort_mmcv_utils.h"
#include "grid_sample.h"
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) < (b)) ? (b) : (a))
#define CLIP_COORDINATES(in, out, clip_limit) \
out = MIN((clip_limit - 1), MAX(in, 0))
// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
interpolation_mode_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
allocator_ = Ort::AllocatorWithDefaultOptions();
}
enum GridSamplerInterpolation { Bilinear = 0, Nearest = 1, Bicubic = 2 };
enum GridSamplerPadding { Zeros = 0, Border = 1, Reflection = 2 };
template <typename scalar_t>
static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size,
bool align_corners) {
if (align_corners) {
return ((coord + 1) / 2) * (size - 1);
} else {
return ((coord + 1) * size - 1) / 2;
}
}
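// Worked example (added for clarity, not in the original sources): for size = 5
// a normalized coordinate of 1 maps to ((1 + 1) / 2) * 4 = 4 with
// align_corners = true, but to ((1 + 1) * 5 - 1) / 2 = 4.5 with
// align_corners = false; a coordinate of 0 maps to 2 in both cases.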
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
return std::min(static_cast<scalar_t>(clip_limit - 1),
std::max(in, static_cast<scalar_t>(0)));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
template <typename scalar_t>
static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
int64_t twice_high) {
if (twice_low == twice_high) {
return static_cast<scalar_t>(0);
}
scalar_t min = static_cast<scalar_t>(twice_low) / 2;
scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
in = std::fabs(in - min);
// `fmod` returns same sign as `in`, which is positive after the `fabs` above.
scalar_t extra = std::fmod(in, span);
int flips = static_cast<int>(std::floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
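// Worked example (added for clarity, not in the original sources): with
// twice_low = 0 and twice_high = 6 (valid range [0, 3]), an input of -1
// reflects to 1 and an input of 4.5 reflects to 1.5, i.e. coordinates are
// mirrored back into the range about its endpoints.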
template <typename scalar_t>
static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
int64_t padding_mode,
bool align_corners) {
if (padding_mode == GridSamplerPadding::Border) {
coord = clip_coordinates(coord, size);
} else if (padding_mode == GridSamplerPadding::Reflection) {
if (align_corners) {
coord = reflect_coordinates(coord, 0, 2 * (size - 1));
} else {
coord = reflect_coordinates(coord, -1, 2 * size - 1);
}
coord = clip_coordinates(coord, size);
}
return coord;
}
// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static inline scalar_t grid_sampler_compute_source_index(scalar_t coord,
int64_t size,
int64_t padding_mode,
bool align_corners) {
coord = grid_sampler_unnormalize(coord, size, align_corners);
coord = compute_coordinates(coord, size, padding_mode, align_corners);
return coord;
}
static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H,
int64_t W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
static inline scalar_t get_value_bounded(const scalar_t *data, scalar_t x,
scalar_t y, int64_t W, int64_t H,
int64_t sW, int64_t sH,
int64_t padding_mode,
bool align_corners) {
x = compute_coordinates(x, W, padding_mode, align_corners);
y = compute_coordinates(y, H, padding_mode, align_corners);
int64_t ix = static_cast<int64_t>(x);
int64_t iy = static_cast<int64_t>(y);
if (within_bounds_2d(iy, ix, H, W)) {
return data[iy * sH + ix * sW];
}
return static_cast<scalar_t>(0);
}
template <typename scalar_t>
static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
return ((A + 2) * x - (A + 3)) * x * x + 1;
}
template <typename scalar_t>
static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}
template <typename scalar_t>
static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4],
scalar_t t) {
scalar_t A = -0.75;
scalar_t x1 = t;
coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
// opposite coefficients
scalar_t x2 = 1.0 - t;
coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
}
template <typename scalar_t>
static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
scalar_t x3, scalar_t t) {
scalar_t coeffs[4];
get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
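// Note (added for clarity, not in the original sources): these are the Keys
// cubic-convolution weights with A = -0.75. At t = 0 the coefficients are
// (0, 1, 0, 0), so cubic_interp1d returns x1 exactly; at t = 1 they are
// (0, 0, 1, 0), returning x2.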
void GridSampleKernel::Compute(OrtKernelContext *context) {
const bool align_corners = align_corners_;
const int64_t padding_mode = padding_mode_;
const int64_t interpolation_mode = interpolation_mode_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
const float *grid_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions grid_dims(ort_, grid);
int64_t N = input_dims[0];
int64_t C = input_dims[1];
int64_t inp_H = input_dims[2];
int64_t inp_W = input_dims[3];
int64_t out_H = grid_dims[1];
int64_t out_W = grid_dims[2];
std::vector<int64_t> output_dims = {N, C, out_H, out_W};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
int64_t inp_sC = input_dims[2] * input_dims[3];
int64_t inp_sH = input_dims[3];
int64_t inp_sW = 1;
int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
int64_t grid_sH = grid_dims[2] * grid_dims[3];
int64_t grid_sW = grid_dims[3];
int64_t grid_sCoor = 1;
int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
int64_t out_sC = output_dims[2] * output_dims[3];
int64_t out_sH = output_dims[3];
int64_t out_sW = 1;
// loop over each output pixel
for (int64_t n = 0; n < N; ++n) {
const float *grid_ptr_N = grid_data + n * grid_sN;
const float *inp_ptr_N = input_data + n * inp_sN;
for (int64_t h = 0; h < out_H; ++h) {
for (int64_t w = 0; w < out_W; ++w) {
const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
float x = *grid_ptr_NHW;
float y = grid_ptr_NHW[grid_sCoor];
float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode,
align_corners);
float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode,
align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get corner pixel values from (x, y)
// for 4d, we use north-east-south-west
int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
int64_t ix_ne = ix_nw + 1;
int64_t iy_ne = iy_nw;
int64_t ix_sw = ix_nw;
int64_t iy_sw = iy_nw + 1;
int64_t ix_se = ix_nw + 1;
int64_t iy_se = iy_nw + 1;
// get surfaces to each neighbor:
float nw = (ix_se - ix) * (iy_se - iy);
float ne = (ix - ix_sw) * (iy_sw - iy);
float sw = (ix_ne - ix) * (iy - iy_ne);
float se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
const float *inp_ptr_NC = inp_ptr_N;
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
auto res = static_cast<float>(0);
if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
}
*out_ptr_NCHW = res;
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
// assign nearest neighbor pixel value to output pixel
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
const float *inp_ptr_NC = inp_ptr_N;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
*out_ptr_NCHW =
inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<float>(0);
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
          // grid_sampler_compute_source_index would clip the coordinate
          // depending on the padding mode, which breaks the bicubic case:
          // e.g. x = -0.1 gives ix = 0 with zero padding, while bicubic needs
          // ix = floor(x) = -1. Reflection padding is even trickier, since the
          // -1/+1 direction is not fixed at the boundary. So only unnormalize
          // here; compute_coordinates is applied per tap in get_value_bounded.
ix = grid_sampler_unnormalize(x, inp_W, align_corners);
iy = grid_sampler_unnormalize(y, inp_H, align_corners);
float ix_nw = std::floor(ix);
float iy_nw = std::floor(iy);
const float tx = ix - ix_nw;
const float ty = iy - iy_nw;
const float *inp_ptr_NC = inp_ptr_N;
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
float coefficients[4];
// Interpolate 4 values in the x direction
for (int64_t i = 0; i < 4; ++i) {
coefficients[i] = cubic_interp1d<float>(
get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
tx);
}
// Interpolate in the y direction
*out_ptr_NCHW =
cubic_interp1d<float>(coefficients[0], coefficients[1],
coefficients[2], coefficients[3], ty);
}
}
}
}
}
}
// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv.h"
#include <cmath>
#include <vector>
#include "../ort_mmcv_utils.h"
float bilinear_interpolate_2d(const float *src, const int64_t src_h,
const int64_t src_w, const float h,
const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}
int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;
float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1)
v4 = src[h_high * src_w + w_high];
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
// output: (channels * kernel_h * kernel_w, dst_h * dst_w)
void deformable_im2col_2d(const float *input, const float *offset,
const float *mask, const int64_t src_h,
const int64_t src_w, const int64_t kernel_h,
const int64_t kernel_w, const int64_t pad_h,
const int64_t pad_w, const int64_t stride_h,
const int64_t stride_w, const int64_t dilation_h,
const int64_t dilation_w, const int64_t channels,
const int64_t offset_groups, const int64_t dst_h,
const int64_t dst_w, const bool use_mask,
float *columns) {
const int64_t workload = channels * dst_h * dst_w;
for (int64_t index = 0; index != workload; ++index) {
const int64_t ow = index % dst_w;
const int64_t oh = (index / dst_w) % dst_h;
const int64_t ic = index / (dst_w * dst_h);
const int64_t oc = ic * kernel_h * kernel_w;
int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = ic / c_per_offset_grp;
auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow);
auto input_ptr = input + ic * (src_h * src_w);
auto offset_ptr =
offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
auto mask_ptr = mask;
if (use_mask) {
mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w;
}
for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int64_t mask_idx = kh * kernel_w + kw;
const int64_t offset_idx = 2 * mask_idx;
float mask_value = 1;
if (use_mask) {
mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow];
}
const float offset_h =
offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow];
const float offset_w =
offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow];
const float ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = mask_value *
bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
void gemm_ref_fp32(const float *A, const float *B, const float *V,
const float *H, const int32_t trans_A, const int32_t trans_B,
const int32_t M, const int32_t N, const int32_t K,
const float alpha, const float beta, float *Y) {
if (!trans_A && !trans_B) { // MK, KN; NN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && !trans_B) { // KM, KN; TN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && trans_B) { // KM, NK; TT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (!trans_A && trans_B) { // MK, NK; NT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
}
void deformable_conv2d_ref_fp32(
const float *src, const float *offset, const float *mask,
const float *filter, const float *bias, const int64_t batch,
const int64_t src_c, const int64_t src_h, const int64_t src_w,
const int64_t dst_c, const int64_t dst_h, const int64_t dst_w,
const int64_t group, const int64_t offset_group, const int64_t channels,
const int64_t num_output, const int64_t kernel_h, const int64_t kernel_w,
const int64_t stride_h, const int64_t stride_w, const int64_t pad_h,
const int64_t pad_w, const int64_t dilation_h, const int64_t dilation_w,
float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col_2d(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h,
src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
mask != nullptr, columns);
float *dst_ptr =
dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
if (bias != nullptr) {
const float *bias_ptr = bias + g * oc_per_gp;
for (int64_t oc = 0; oc < oc_per_gp; ++oc) {
for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) {
dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
}
}
} else {
        memset(dst_ptr, 0, sizeof(float) * oc_per_gp * dst_h * dst_w);
}
gemm_ref_fp32(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w,
columns, nullptr, dst_ptr, 0, 0, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr);
}
}
}
MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(
OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
std::vector<int64_t> stride =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *mask = ort_.KernelContext_GetInput(context, 2);
const float *mask_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(mask));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 3);
const float *filter_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
const OrtValue *bias = ort_.KernelContext_GetInput(context, 4);
const float *bias_data =
(bias != nullptr)
? reinterpret_cast<const float *>(ort_.GetTensorData<float>(bias))
: nullptr;
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch = input_dims[0];
int64_t channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t num_output = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
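// Standard convolution output size:
//   out = (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1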
int64_t out_height = floor((in_height + 2 * padding_height -
dilation_height * (kernel_height - 1) - 1) /
stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
stride_width +
1);
std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
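// `columns` holds the im2col matrix for a single (batch, group) slice:
// (channels / group) * kernel_height * kernel_width rows by
// out_height * out_width columns, reused across the loop iterations inside
// deformable_conv2d_ref_fp32.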
int64_t column_len = (channels / group) * kernel_height * kernel_width *
out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv2d_ref_fp32(
input_data, offset_data, mask_data, filter_data, bias_data, batch,
channels, in_height, in_width, num_output, out_height, out_width, group,
deformable_group, channels, num_output, kernel_height, kernel_width,
stride_height, stride_width, padding_height, padding_width,
dilation_height, dilation_width, columns, out_ptr);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "nms.h"
#include <assert.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric> // std::iota
#include <vector>
#include "../ort_mmcv_utils.h"
NmsKernel::NmsKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
offset_ = ort_.KernelInfoGetAttribute<int64_t>(info, "offset");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void NmsKernel::Compute(OrtKernelContext *context) {
const float iou_threshold = iou_threshold_;
const int64_t offset = offset_;
const OrtValue *boxes = ort_.KernelContext_GetInput(context, 0);
const float *boxes_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(boxes));
const OrtValue *scores = ort_.KernelContext_GetInput(context, 1);
const float *scores_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
int64_t nboxes = boxes_dim[0];
assert(boxes_dim[1] == 4);
// allocate tmp memory
float *tmp_boxes = (float *)allocator_.Alloc(sizeof(float) * nboxes * 4);
float *sc = (float *)allocator_.Alloc(sizeof(float) * nboxes);
float *areas = (float *)allocator_.Alloc(sizeof(float) * nboxes);
bool *select = (bool *)allocator_.Alloc(sizeof(bool) * nboxes);
for (int64_t i = 0; i < nboxes; i++) {
select[i] = true;
}
memcpy(tmp_boxes, boxes_data, sizeof(float) * nboxes * 4);
memcpy(sc, scores_data, sizeof(float) * nboxes);
// sort scores
std::vector<float> tmp_sc;
for (int i = 0; i < nboxes; i++) {
tmp_sc.push_back(sc[i]);
}
std::vector<int64_t> order(tmp_sc.size());
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(), [&tmp_sc](int64_t id1, int64_t id2) {
return tmp_sc[id1] > tmp_sc[id2];
});
// area = (x2 - x1 + offset) * (y2 - y1 + offset)
for (int64_t i = 0; i < nboxes; i++) {
areas[i] = (tmp_boxes[i * 4 + 2] - tmp_boxes[i * 4 + 0] + offset) *
(tmp_boxes[i * 4 + 3] - tmp_boxes[i * 4 + 1] + offset);
}
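// Greedy suppression in descending score order: keep the current box and
// discard every lower-scoring box whose IoU with it exceeds iou_threshold.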
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = tmp_boxes[i * 4 + 0];
auto iy1 = tmp_boxes[i * 4 + 1];
auto ix2 = tmp_boxes[i * 4 + 2];
auto iy2 = tmp_boxes[i * 4 + 3];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, tmp_boxes[j * 4 + 0]);
auto yy1 = std::max(iy1, tmp_boxes[j * 4 + 1]);
auto xx2 = std::min(ix2, tmp_boxes[j * 4 + 2]);
auto yy2 = std::min(iy2, tmp_boxes[j * 4 + 3]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
std::vector<int64_t> res_order;
for (int i = 0; i < nboxes; i++) {
if (select[i]) {
res_order.push_back(order[i]);
}
}
std::vector<int64_t> inds_dims({static_cast<int64_t>(res_order.size())});
OrtValue *res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(),
inds_dims.size());
int64_t *res_data = ort_.GetTensorMutableData<int64_t>(res);
memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
}
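// Worked example (illustrative, not part of the original source): boxes
// (0, 0, 10, 10) and (1, 1, 11, 11) with offset = 0 each have area 100; the
// intersection is 9 * 9 = 81, so IoU = 81 / (100 + 100 - 81) ~= 0.68. With
// iou_threshold = 0.5 the lower-scoring of the two is suppressed and only the
// higher-scoring index is written to the output tensor.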