working on tensor core test

01ed382c · yan.yan · 3517290c · 3517290c · 3517290c · 3517290c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
20 changed files
--- a/include/tensorview/tools.h
+++ b/include/tensorview/tools.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <chrono>
-#ifdef TV_CUDA
-#include <cuda_runtime_api.h>
-#endif
-#include <iostream>
-
-namespace tv {
-
-#ifdef TV_CUDA
-// Stopwatch for timing CUDA work from the host.
-//
-// Every measurement point calls cudaDeviceSynchronize() before reading the
-// host clock. TimeT is the std::chrono duration type used for reporting
-// (microseconds by default); results are returned as TimeT::rep tick counts.
-template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
-  // Synchronizes the device, then starts the stopwatch.
-  CudaContextTimer() {
-    cudaDeviceSynchronize();
-    mCurTime = std::chrono::steady_clock::now();
-  }
-
-  // Synchronizes the device, returns the time elapsed since construction or
-  // since the previous report() (truncated to TimeT) and restarts the
-  // stopwatch.
-  typename TimeT::rep report() {
-    cudaDeviceSynchronize();
-    // Sample the clock once so the instant that ends this interval also starts
-    // the next one; no time is lost between consecutive reports.
-    const auto now = std::chrono::steady_clock::now();
-    const auto elapsed = std::chrono::duration_cast<TimeT>(now - mCurTime);
-    mCurTime = now;
-    return elapsed.count();
-  }
-
-  // Calls f() Count times and returns the mean duration, in TimeT ticks, of
-  // the iterations with index >= start. Iterations before `start` are
-  // executed but excluded from the mean; by default that is the first 30% of
-  // the runs.
-  //
-  // Returns 0.0 when no iteration is counted (Count <= 0 or start >= Count).
-  template <int Count, typename F>
-  double benchmark(F &&f, int start = int(Count) * 0.3) {
-    auto total = typename TimeT::rep();
-    int counted = 0;
-    // Restart the stopwatch so the first sample covers only the first call of
-    // f, not whatever happened since construction or the previous report().
-    report();
-    for (int i = 0; i < Count; ++i) {
-      // Invoke f as an lvalue: forwarding it as an rvalue on every iteration
-      // would allow a callable to be consumed by its first call.
-      f();
-      const auto elapsed = report();
-      if (i >= start) {
-        total += elapsed;
-        counted += 1;
-      }
-    }
-    if (counted == 0) {
-      // Avoid 0 / 0.0, which would yield NaN.
-      return 0.0;
-    }
-    return total / double(counted);
-  }
-
-private:
-  // Start of the interval currently being measured.
-  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
-};
-#endif
-
-// Host-side stopwatch built on std::chrono::steady_clock.
-//
-// TimeT is the std::chrono duration type used for reporting; report() returns
-// the elapsed time as TimeT::rep (a raw tick count, microseconds by default).
-template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
-  // Starts the stopwatch at construction time.
-  CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }
-
-  // Returns the time elapsed since construction or since the previous report()
-  // call, truncated to TimeT, and restarts the stopwatch.
-  typename TimeT::rep report() {
-    // Sample the clock exactly once: the same instant ends this interval and
-    // begins the next one, so back-to-back reports account for all elapsed
-    // time instead of dropping whatever passes between two now() calls.
-    const auto now = std::chrono::steady_clock::now();
-    const auto elapsed = std::chrono::duration_cast<TimeT>(now - mCurTime);
-    mCurTime = now;
-    return elapsed.count();
-  }
-
-private:
-  // Start of the interval currently being measured.
-  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
-};
-
-} // namespace tv
--- a/include/tensorview/torch_utils.h
+++ b/include/tensorview/torch_utils.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "mp_helper.h"
-#include <ATen/ATen.h>
-#include <tensorview/tensor.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-#ifdef TV_CUDA
-#include <ATen/cuda/CUDAContext.h>
-#endif
-
-namespace tv {
-
-#ifdef TV_CUDA
-// tv::GPU context whose stream is whatever at::cuda::getCurrentCUDAStream()
-// returns at the time of the call.
-struct TorchGPU : public tv::GPU {
-  cudaStream_t getStream() const override {
-    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    return stream;
-  }
-};
-#endif
-namespace detail {
-// Compile-time map from a C++ element type to its torch dtype constant.
-// The primary template is only declared, so reading ::value for a type
-// without a specialization below is a compile error.
-template <typename T> struct TypeToTorchDtypeTraits;
-
-// Floating point element types.
-template <> struct TypeToTorchDtypeTraits<float> {
-  static constexpr decltype(torch::kInt32) value = torch::kFloat32;
-};
-template <> struct TypeToTorchDtypeTraits<double> {
-  static constexpr decltype(torch::kInt32) value = torch::kFloat64;
-};
-template <> struct TypeToTorchDtypeTraits<at::Half> {
-  static constexpr decltype(torch::kInt32) value = torch::kHalf;
-};
-
-// Signed integer element types.
-template <> struct TypeToTorchDtypeTraits<int8_t> {
-  static constexpr decltype(torch::kInt8) value = torch::kInt8;
-};
-template <> struct TypeToTorchDtypeTraits<int16_t> {
-  static constexpr decltype(torch::kInt32) value = torch::kInt16;
-};
-template <> struct TypeToTorchDtypeTraits<int32_t> {
-  static constexpr decltype(torch::kInt32) value = torch::kInt32;
-};
-template <> struct TypeToTorchDtypeTraits<int64_t> {
-  static constexpr decltype(torch::kInt32) value = torch::kInt64;
-};
-
-// Unsigned byte and boolean element types.
-template <> struct TypeToTorchDtypeTraits<uint8_t> {
-  static constexpr decltype(torch::kInt32) value = torch::kUInt8;
-};
-template <> struct TypeToTorchDtypeTraits<bool> {
-  static constexpr decltype(torch::kInt32) value = torch::kBool;
-};
-
-// Every element type that has a TypeToTorchDtypeTraits specialization above.
-using all_torch_types_t = std::tuple<float, double, int8_t, int16_t, int32_t,
-                                     int64_t, uint8_t, bool, at::Half>;
-
-} // namespace detail
-
-// Shorthand for detail::TypeToTorchDtypeTraits<T>::value.
-template <typename T>
-constexpr decltype(torch::kInt32) torch_type_v{
-    detail::TypeToTorchDtypeTraits<T>::value};
-
-// Runtime-to-compile-time dispatch on a torch scalar type.
-//
-// Walks the candidate types Ts... and, for each one whose torch dtype equals
-// t, calls f with a value-initialized instance of that type so that a generic
-// callable can recover the static type via decltype. When no candidate
-// matches, raises through TV_THROW_RT_ERR with the list of candidate names.
-template <class... Ts, typename F>
-void dispatch_torch(at::ScalarType t, F &&f) {
-  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
-  bool matched = false;
-  tv::mp_for_each<mp_list<Ts...>>([t, &matched, &f](auto I) {
-    using Candidate = decltype(I);
-    if (detail::TypeToTorchDtypeTraits<Candidate>::value == t) {
-      std::forward<F>(f)(Candidate());
-      matched = true;
-    }
-  });
-  if (matched) {
-    return;
-  }
-  // Nothing matched: collect the candidate names for the error message.
-  std::stringstream ss;
-  tv::mp_for_each<mp_list<Ts...>>([&ss](auto I) {
-    ss << tv::detail::TypeToString<decltype(I)>::value << " ";
-  });
-  TV_THROW_RT_ERR("unknown type", t, ", available:", ss.str());
-}
-
-// Adapter that unpacks a type list such as std::tuple<A, B, ...> and passes
-// its element types to dispatch_torch<A, B, ...>.
-template <class TypeList> struct DispatchTorch;
-
-template <template <class...> class List, class... Types>
-struct DispatchTorch<List<Types...>> {
-  // Same contract as dispatch_torch<Types...>(t, f).
-  template <typename F> void operator()(at::ScalarType t, F &&f) {
-    dispatch_torch<Types...>(t, std::forward<F>(f));
-  }
-};
-
-// Verifies that the runtime dtype of `tensor` corresponds to the C++ element
-// type T (cv-qualifiers on T are ignored). A mismatch is reported through
-// TV_ASSERT_RT_ERR; a dtype outside detail::all_torch_types_t is reported by
-// dispatch_torch.
-template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
-  using Expected = std::remove_cv_t<T>;
-  DispatchTorch<detail::all_torch_types_t> dispatcher;
-  dispatcher(tensor.scalar_type(), [](auto I) {
-    // The asserted variable keeps its name in case the macro prints it.
-    constexpr bool val = std::is_same<Expected, decltype(I)>::value;
-    TV_ASSERT_RT_ERR(val, "error");
-  });
-}
-
-// Builds a TensorView<T, Rank, PtrTraits, Tindex> over the memory of a torch
-// tensor, using the tensor's data pointer and sizes.
-//
-// Checks performed before the view is created:
-//  - the tensor dtype must correspond to T (check_torch_dtype),
-//  - the tensor must be contiguous, because the view is described by its
-//    shape alone and carries no stride information,
-//  - when Rank > 0, tensor.dim() must equal Rank.
-// The last two are reported through TV_ASSERT_INVALID_ARG.
-template <typename T, int Rank = -1,
-          template <class> class PtrTraits = DefaultPtrTraits,
-          typename Tindex = int>
-TensorView<T, Rank, PtrTraits, Tindex> torch2tv(const torch::Tensor &tensor) {
-  using tv_shape_t =
-      typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
-  check_torch_dtype<T>(tensor);
-  // A shape-only view over strided memory would address the wrong elements,
-  // so reject non-contiguous tensors instead of returning a corrupt view.
-  TV_ASSERT_INVALID_ARG(tensor.is_contiguous(),
-                        "torch2tv requires a contiguous tensor");
-  if (Rank > 0) {
-    TV_ASSERT_INVALID_ARG(tensor.dim() == Rank, "error");
-  }
-  tv_shape_t shape;
-  for (auto i : tensor.sizes()) {
-    shape.push_back(i);
-  }
-  return tv::TensorView<T, Rank, PtrTraits, Tindex>(
-      tensor.data_ptr<std::remove_const_t<T>>(), shape);
-}
-
-// Returns the sub-tensor covering indices [start, end) of the first axis.
-//
-// The result is produced by Tensor::narrow, so it is a view that shares the
-// storage of `tensor` and keeps that storage alive. The previous
-// implementation wrapped a raw pointer with torch::from_blob, which does not
-// take ownership of the memory and assumed a contiguous layout.
-//
-// start and end are converted to int64_t; an out-of-range request is rejected
-// by narrow itself.
-template <typename T>
-torch::Tensor torch_slice_first_axis(torch::Tensor tensor, T start, T end) {
-  const auto first = static_cast<int64_t>(start);
-  const auto length = static_cast<int64_t>(end) - first;
-  return tensor.narrow(/*dim=*/0, first, length);
-}
-
-namespace detail {
-// Printable name for at::Half, read through TypeToString<...>::value.
-template <> struct TypeToString<at::Half> {
-  constexpr static const char *value = "half";
-};
-} // namespace detail
-} // namespace tv
\ No newline at end of file
--- a/include/torch_utils.h
+++ b/include/torch_utils.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-
-#include <ATen/ATen.h>
-#include <torch/script.h>
-#ifdef TV_CUDA
-#include <ATen/cuda/CUDAContext.h>
-#endif
-
-namespace tv {
-
-#ifdef TV_CUDA
-// tv::GPU context whose stream is whatever at::cuda::getCurrentCUDAStream()
-// returns at the time of the call.
-struct TorchGPU : public tv::GPU {
-  cudaStream_t getStream() const override {
-    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    return stream;
-  }
-};
-#endif
-// Verifies that the runtime dtype of `tensor` corresponds to the C++ element
-// type T (a const qualifier on T is ignored). Only Double, Float, Int, Half
-// and Long tensors are accepted; a mismatch or any other dtype is reported
-// through TV_ASSERT_RT_ERR.
-//
-// The integer cases compare against the fixed-width types int32_t / int64_t,
-// the same types this file maps to torch::kInt32 / torch::kInt64. Comparing
-// Long against `long` is wrong wherever int64_t is not `long`.
-template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
-  switch (tensor.scalar_type()) {
-  case at::ScalarType::Double: {
-    auto val = std::is_same<std::remove_const_t<T>, double>::value;
-    TV_ASSERT_RT_ERR(val, "error");
-    break;
-  }
-  case at::ScalarType::Float: {
-    auto val = std::is_same<std::remove_const_t<T>, float>::value;
-    TV_ASSERT_RT_ERR(val, "error");
-    break;
-  }
-  case at::ScalarType::Int: {
-    auto val = std::is_same<std::remove_const_t<T>, int32_t>::value;
-    TV_ASSERT_RT_ERR(val, "error");
-    break;
-  }
-  case at::ScalarType::Half: {
-    auto val = std::is_same<std::remove_const_t<T>, at::Half>::value;
-    TV_ASSERT_RT_ERR(val, "error");
-    break;
-  }
-  case at::ScalarType::Long: {
-    auto val = std::is_same<std::remove_const_t<T>, int64_t>::value;
-    TV_ASSERT_RT_ERR(val, "error");
-    break;
-  }
-  default:
-    // Any dtype not listed above is unsupported.
-    TV_ASSERT_RT_ERR(false, "error");
-  }
-}
-namespace detail {
-// Compile-time map from a C++ element type to its torch dtype constant.
-// The primary template is only declared, so reading ::value for a type
-// without a specialization below is a compile error.
-template <typename T> struct TypeToTorchDtypeTraits;
-
-// Floating point element types.
-template <> struct TypeToTorchDtypeTraits<float> {
-  static constexpr decltype(torch::kInt32) value = torch::kFloat32;
-};
-template <> struct TypeToTorchDtypeTraits<double> {
-  static constexpr decltype(torch::kInt32) value = torch::kFloat64;
-};
-template <> struct TypeToTorchDtypeTraits<at::Half> {
-  static constexpr decltype(torch::kInt32) value = torch::kHalf;
-};
-
-// Integer element types.
-template <> struct TypeToTorchDtypeTraits<int32_t> {
-  static constexpr decltype(torch::kInt32) value = torch::kInt32;
-};
-template <> struct TypeToTorchDtypeTraits<int64_t> {
-  static constexpr decltype(torch::kInt32) value = torch::kInt64;
-};
-
-} // namespace detail
-
-// Shorthand for detail::TypeToTorchDtypeTraits<T>::value.
-template <typename T>
-constexpr decltype(torch::kInt32) torch_type_v{
-    detail::TypeToTorchDtypeTraits<T>::value};
-
-// Builds a tv::TensorView<T> over the memory of a torch tensor, using the
-// tensor's data pointer and sizes.
-//
-// The tensor dtype must correspond to T (check_torch_dtype) and the tensor
-// must be contiguous, because the view is described by its shape alone and
-// carries no stride information. Violations are reported through
-// TV_ASSERT_RT_ERR.
-template <typename T> tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
-  check_torch_dtype<T>(tensor);
-  // A shape-only view over strided memory would address the wrong elements,
-  // so reject non-contiguous tensors instead of returning a corrupt view.
-  TV_ASSERT_RT_ERR(tensor.is_contiguous(),
-                   "torch2tv requires a contiguous tensor");
-  tv::Shape shape;
-  for (auto i : tensor.sizes()) {
-    shape.push_back(i);
-  }
-  return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
-}
-namespace detail {
-// Printable name for at::Half, read through TypeToString<...>::value.
-template <> struct TypeToString<at::Half> {
-  constexpr static const char *value = "half";
-};
-} // namespace detail
-// Runtime-to-compile-time dispatch on a torch scalar type.
-//
-// Walks the candidate types Ts... and, for each one whose torch_type_v equals
-// t, calls f with a value-initialized instance of that type so that a generic
-// callable can recover the static type via decltype. When no candidate
-// matches, raises through TV_THROW_RT_ERR with the list of candidate names.
-template <class... Ts, typename F>
-void dispatch_torch(at::ScalarType t, F &&f) {
-  static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
-  bool matched = false;
-  spconv::tv::mp_for_each<spconv::mp_list<Ts...>>([t, &matched, &f](auto I) {
-    using Candidate = decltype(I);
-    if (torch_type_v<Candidate> == t) {
-      std::forward<F>(f)(Candidate());
-      matched = true;
-    }
-  });
-  if (matched) {
-    return;
-  }
-  // Nothing matched: collect the candidate names for the error message.
-  std::stringstream ss;
-  spconv::tv::mp_for_each<spconv::mp_list<Ts...>>([&ss](auto I) {
-    ss << tv::detail::TypeToString<decltype(I)>::value << " ";
-  });
-  TV_THROW_RT_ERR("unknown type", t, ", available: ", ss.str());
-}
-
-} // namespace tv
\ No newline at end of file
--- a/include/tsl/robin_growth_policy.h
+++ b/include/tsl/robin_growth_policy.h
-/**
- * MIT License
- *
- * Copyright (c) 2017 Tessil
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef TSL_ROBIN_GROWTH_POLICY_H
-#define TSL_ROBIN_GROWTH_POLICY_H
-
-#include <algorithm>
-#include <array>
-#include <climits>
-#include <cmath>
-#include <cstddef>
-#include <iterator>
-#include <limits>
-#include <ratio>
-#include <stdexcept>
-
-#ifdef TSL_DEBUG
-#define tsl_rh_assert(expr) assert(expr) // NOTE(review): <cassert> is not among the includes above - confirm it is provided by the including file
-#else
-#define tsl_rh_assert(expr) (static_cast<void>(0)) // expands to a no-op; expr is not evaluated
-#endif
-
-/**
- * If exceptions are enabled, throw the exception passed in parameter, otherwise
- * call std::terminate.
- *
- * Exceptions count as enabled when one of __cpp_exceptions, __EXCEPTIONS or
- * (_MSC_VER together with _CPPUNWIND) is defined and TSL_NO_EXCEPTIONS is not.
- * Without exceptions, the message is written to stderr before terminating
- * unless NDEBUG is defined.
- */
-#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) ||                     \
-     (defined(_MSC_VER) && defined(_CPPUNWIND))) &&                            \
-    !defined(TSL_NO_EXCEPTIONS)
-#define TSL_RH_THROW_OR_TERMINATE(ex, msg) throw ex(msg)
-#else
-#ifdef NDEBUG
-#define TSL_RH_THROW_OR_TERMINATE(ex, msg) std::terminate()
-#else
-#include <cstdio>
-#define TSL_RH_THROW_OR_TERMINATE(ex, msg)                                     \
-  do {                                                                         \
-    std::fprintf(stderr, msg);                                                 \
-    std::terminate();                                                          \
-  } while (0)
-#endif
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#define TSL_RH_LIKELY(exp) (__builtin_expect(!!(exp), true)) // branch hint: exp is expected to be true
-#else
-#define TSL_RH_LIKELY(exp) (exp) // no hint available: plain expression
-#endif
-
-namespace tsl {
-namespace rh {
-
-/**
- * Grow the hash table by a factor of GrowthFactor keeping the bucket count to a
- * power of two. It allows the table to use a mask operation instead of a modulo
- * operation to map a hash to a bucket.
- *
- * GrowthFactor must be a power of two >= 2.
- *
- * The only state is m_mask, which holds bucket_count - 1 (or 0 when the bucket
- * count is 0), so bucket_for_hash reduces to `hash & m_mask`.
- */
-template <std::size_t GrowthFactor> class power_of_two_growth_policy {
-public:
-  /**
-   * Called on the hash table creation and on rehash. The number of buckets for
-   * the table is passed in parameter. This number is a minimum, the policy may
-   * update this value with a higher value if needed (but not lower).
-   *
-   * If 0 is given, min_bucket_count_in_out must still be 0 after the policy
-   * creation and bucket_for_hash must always return 0 in this case.
-   *
-   * A non-zero request is rounded up to the next power of two and written back
-   * through min_bucket_count_in_out. A request above max_bucket_count() is
-   * reported as std::length_error through TSL_RH_THROW_OR_TERMINATE.
-   */
-  explicit power_of_two_growth_policy(std::size_t &min_bucket_count_in_out) {
-    if (min_bucket_count_in_out > max_bucket_count()) {
-      TSL_RH_THROW_OR_TERMINATE(std::length_error,
-                                "The hash table exceeds its maxmimum size.");
-    }
-
-    if (min_bucket_count_in_out > 0) {
-      min_bucket_count_in_out =
-          round_up_to_power_of_two(min_bucket_count_in_out);
-      m_mask = min_bucket_count_in_out - 1; // power of two minus one: all lower bits set
-    } else {
-      m_mask = 0; // zero buckets: hash & 0 is always 0
-    }
-  }
-
-  /**
-   * Return the bucket [0, bucket_count()) to which the hash belongs.
-   * If bucket_count() is 0, it must always return 0.
-   */
-  std::size_t bucket_for_hash(std::size_t hash) const noexcept {
-    return hash & m_mask;
-  }
-
-  /**
-   * Return the number of buckets that should be used on next growth.
-   *
-   * Reports std::length_error through TSL_RH_THROW_OR_TERMINATE when the grown
-   * count would exceed max_bucket_count().
-   */
-  std::size_t next_bucket_count() const {
-    if ((m_mask + 1) > max_bucket_count() / GrowthFactor) { // compared against max / GrowthFactor so the product below stays <= max
-      TSL_RH_THROW_OR_TERMINATE(std::length_error,
-                                "The hash table exceeds its maxmimum size.");
-    }
-
-    return (m_mask + 1) * GrowthFactor;
-  }
-
-  /**
-   * Return the maximum number of buckets supported by the policy.
-   */
-  std::size_t max_bucket_count() const {
-    // Largest power of two representable in std::size_t.
-    return (std::numeric_limits<std::size_t>::max() / 2) + 1;
-  }
-
-  /**
-   * Reset the growth policy as if it was created with a bucket count of 0.
-   * After a clear, the policy must always return 0 when bucket_for_hash is
-   * called.
-   */
-  void clear() noexcept { m_mask = 0; }
-
-private:
-  static std::size_t round_up_to_power_of_two(std::size_t value) { // smallest power of two >= value; 0 maps to 1
-    if (is_power_of_two(value)) {
-      return value;
-    }
-
-    if (value == 0) {
-      return 1;
-    }
-
-    --value;
-    for (std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) { // shifts of 1, 2, 4, ... bits
-      value |= value >> i; // copies the highest set bit into every lower position
-    }
-
-    return value + 1; // all-ones below the top bit, plus one, is the next power of two
-  }
-
-  static constexpr bool is_power_of_two(std::size_t value) { // true when exactly one bit of value is set
-    return value != 0 && (value & (value - 1)) == 0;
-  }
-
-protected:
-  static_assert(is_power_of_two(GrowthFactor) && GrowthFactor >= 2,
-                "GrowthFactor must be a power of two >= 2.");
-
-  std::size_t m_mask; // bucket count - 1, or 0 when there are no buckets
-};
-
-/**
- * Grow the hash table by GrowthFactor::num / GrowthFactor::den and use a modulo
- * to map a hash to a bucket. Slower but it can be useful if you want a slower
- * growth.
- *
- * The bucket count is kept in m_mod. A bucket count of 0 is stored as 1, so
- * `hash % m_mod` stays defined and always yields 0 in that case.
- */
-template <class GrowthFactor = std::ratio<3, 2>> class mod_growth_policy {
-public:
-  explicit mod_growth_policy(std::size_t &min_bucket_count_in_out) { // the requested count is used unchanged; it is never written back
-    if (min_bucket_count_in_out > max_bucket_count()) {
-      TSL_RH_THROW_OR_TERMINATE(std::length_error,
-                                "The hash table exceeds its maxmimum size.");
-    }
-
-    if (min_bucket_count_in_out > 0) {
-      m_mod = min_bucket_count_in_out;
-    } else {
-      m_mod = 1; // zero buckets: hash % 1 is always 0
-    }
-  }
-
-  std::size_t bucket_for_hash(std::size_t hash) const noexcept {
-    return hash % m_mod;
-  }
-
-  std::size_t next_bucket_count() const { // ceil(m_mod * growth factor), capped at max_bucket_count()
-    if (m_mod == max_bucket_count()) {
-      TSL_RH_THROW_OR_TERMINATE(std::length_error,
-                                "The hash table exceeds its maxmimum size.");
-    }
-
-    const double next_bucket_count =
-        std::ceil(double(m_mod) * REHASH_SIZE_MULTIPLICATION_FACTOR);
-    if (!std::isnormal(next_bucket_count)) { // rejects zero, subnormal, infinite and NaN results
-      TSL_RH_THROW_OR_TERMINATE(std::length_error,
-                                "The hash table exceeds its maxmimum size.");
-    }
-
-    if (next_bucket_count > double(max_bucket_count())) {
-      return max_bucket_count();
-    } else {
-      return std::size_t(next_bucket_count);
-    }
-  }
-
-  std::size_t max_bucket_count() const { return MAX_BUCKET_COUNT; }
-
-  void clear() noexcept { m_mod = 1; } // same state as a policy created with a bucket count of 0
-
-private:
-  static constexpr double REHASH_SIZE_MULTIPLICATION_FACTOR =
-      1.0 * GrowthFactor::num / GrowthFactor::den; // the ratio as a floating point multiplier
-  static const std::size_t MAX_BUCKET_COUNT =
-      std::size_t(double(std::numeric_limits<std::size_t>::max() /
-                         REHASH_SIZE_MULTIPLICATION_FACTOR)); // size_t max divided by the growth factor
-
-  static_assert(REHASH_SIZE_MULTIPLICATION_FACTOR >= 1.1,
-                "Growth factor should be >= 1.1.");
-
-  std::size_t m_mod; // current bucket count used as modulus; never 0
-};
-
-namespace detail { // lookup tables used by prime_growth_policy
-
-static constexpr const std::array<std::size_t, 40> PRIMES = { // candidate bucket counts in ascending order; entry 0 is 1
-    {1ul,         5ul,         17ul,         29ul,         37ul,
-     53ul,        67ul,        79ul,         97ul,         131ul,
-     193ul,       257ul,       389ul,        521ul,        769ul,
-     1031ul,      1543ul,      2053ul,       3079ul,       6151ul,
-     12289ul,     24593ul,     49157ul,      98317ul,      196613ul,
-     393241ul,    786433ul,    1572869ul,    3145739ul,    6291469ul,
-     12582917ul,  25165843ul,  50331653ul,   100663319ul,  201326611ul,
-     402653189ul, 805306457ul, 1610612741ul, 3221225473ul, 4294967291ul}};
-
-template <unsigned int IPrime>
-static constexpr std::size_t mod(std::size_t hash) { // hash modulo the IPrime-th entry of PRIMES
-  return hash % PRIMES[IPrime];
-}
-
-// MOD_PRIME[iprime](hash) returns hash % PRIMES[iprime]. This table allows for
-// faster modulo as the compiler can optimize the modulo code better with a
-// constant known at the compilation.
-// It must have one entry per element of PRIMES, in the same order.
-static constexpr const std::array<std::size_t (*)(std::size_t), 40> MOD_PRIME =
-    {{&mod<0>,  &mod<1>,  &mod<2>,  &mod<3>,  &mod<4>,  &mod<5>,  &mod<6>,
-      &mod<7>,  &mod<8>,  &mod<9>,  &mod<10>, &mod<11>, &mod<12>, &mod<13>,
-      &mod<14>, &mod<15>, &mod<16>, &mod<17>, &mod<18>, &mod<19>, &mod<20>,
-      &mod<21>, &mod<22>, &mod<23>, &mod<24>, &mod<25>, &mod<26>, &mod<27>,
-      &mod<28>, &mod<29>, &mod<30>, &mod<31>, &mod<32>, &mod<33>, &mod<34>,
-      &mod<35>, &mod<36>, &mod<37>, &mod<38>, &mod<39>}};
-
-} // namespace detail
-
-/**
- * Grow the hash table by using prime numbers as bucket count. Slower than
- * tsl::rh::power_of_two_growth_policy in general but will probably distribute
- * the values around better in the buckets with a poor hash function.
- *
- * To allow the compiler to optimize the modulo operation, a lookup table is
- * used with constant prime numbers.
- *
- * With a switch the code would look like:
- * \code
- * switch(iprime) { // iprime is the index of the current prime in detail::PRIMES
- *     case 0: hash % 1ul;
- *             break;
- *     case 1: hash % 5ul;
- *             break;
- *     case 2: hash % 17ul;
- *             break;
- *     ...
- * }
- * \endcode
- *
- * Due to the constant variable in the modulo the compiler is able to optimize
- * the operation by a series of multiplications, subtractions and shifts.
- *
- * The 'hash % 5' could become something like
- * 'hash - ((hash * 0xCCCCCCCD) >> 34) * 5' in a 64 bits environment.
- */
-class prime_growth_policy {
-public:
-  explicit prime_growth_policy(std::size_t &min_bucket_count_in_out) {
-    auto it_prime = std::lower_bound(
-        detail::PRIMES.begin(), detail::PRIMES.end(), min_bucket_count_in_out); // first table entry >= the requested count
-    if (it_prime == detail::PRIMES.end()) {
-      TSL_RH_THROW_OR_TERMINATE(std::length_error,
-                                "The hash table exceeds its maxmimum size.");
-    }
-
-    m_iprime = static_cast<unsigned int>(
-        std::distance(detail::PRIMES.begin(), it_prime));
-    if (min_bucket_count_in_out > 0) {
-      min_bucket_count_in_out = *it_prime; // report the prime actually chosen back to the caller
-    } else {
-      min_bucket_count_in_out = 0; // a request for 0 buckets stays 0
-    }
-  }
-
-  std::size_t bucket_for_hash(std::size_t hash) const noexcept {
-    return detail::MOD_PRIME[m_iprime](hash);
-  }
-
-  std::size_t next_bucket_count() const { // the next entry of detail::PRIMES, or std::length_error when the table is exhausted
-    if (m_iprime + 1 >= detail::PRIMES.size()) {
-      TSL_RH_THROW_OR_TERMINATE(std::length_error,
-                                "The hash table exceeds its maxmimum size.");
-    }
-
-    return detail::PRIMES[m_iprime + 1];
-  }
-
-  std::size_t max_bucket_count() const { return detail::PRIMES.back(); }
-
-  void clear() noexcept { m_iprime = 0; } // PRIMES[0] is 1, so bucket_for_hash then always returns 0
-
-private:
-  unsigned int m_iprime; // index of the current bucket count in detail::PRIMES
-
-  static_assert(std::numeric_limits<decltype(m_iprime)>::max() >=
-                    detail::PRIMES.size(),
-                "The type of m_iprime is not big enough.");
-};
-
-} // namespace rh
-} // namespace tsl
-
-#endif
--- a/include/tsl/robin_hash.h
+++ b/include/tsl/robin_hash.h
-/**
- * MIT License
- *
- * Copyright (c) 2017 Tessil
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef TSL_ROBIN_HASH_H
-#define TSL_ROBIN_HASH_H
-
-#include "robin_growth_policy.h"
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <exception>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-namespace tsl {
-
-namespace detail_robin_hash {
-
-template <typename T> struct make_void { using type = void; }; // maps every T to void; used below to detect a nested type
-
-template <typename T, typename = void>
-struct has_is_transparent : std::false_type {}; // fallback: T has no nested is_transparent type
-
-template <typename T>
-struct has_is_transparent<T,
-                          typename make_void<typename T::is_transparent>::type>
-    : std::true_type {}; // selected when T::is_transparent names a type
-
-template <typename U> struct is_power_of_two_policy : std::false_type {}; // any policy other than the one specialized below
-
-template <std::size_t GrowthFactor>
-struct is_power_of_two_policy<tsl::rh::power_of_two_growth_policy<GrowthFactor>>
-    : std::true_type {};
-
-// std::clamp is only available in C++17, we need to be compatible with C++11.
-// Returns a reference to one of the three arguments.
-template <class T> const T &clamp(const T &v, const T &lo, const T &hi) {
-  return std::min(hi, std::max(lo, v));
-}
-
-using truncated_hash_type = std::uint_least32_t; // type of the hash stored in a bucket when StoreHash is true
-
-/**
- * Helper class that stores a truncated hash if StoreHash is true and nothing
- * otherwise.
- *
- * The primary template below has no data member: bucket_hash_equal always
- * returns true, truncated_hash returns 0 and set_hash does nothing. The
- * specialization for `true` keeps the hash in m_hash.
- */
-template <bool StoreHash> class bucket_entry_hash {
-public:
-  bool bucket_hash_equal(std::size_t /*hash*/) const noexcept { return true; }
-
-  truncated_hash_type truncated_hash() const noexcept { return 0; }
-
-protected:
-  void set_hash(truncated_hash_type /*hash*/) noexcept {}
-};
-
-template <> class bucket_entry_hash<true> {
-public:
-  bool bucket_hash_equal(std::size_t hash) const noexcept { // compares against the hash converted to truncated_hash_type
-    return m_hash == truncated_hash_type(hash);
-  }
-
-  truncated_hash_type truncated_hash() const noexcept { return m_hash; }
-
-protected:
-  void set_hash(truncated_hash_type hash) noexcept {
-    m_hash = truncated_hash_type(hash);
-  }
-
-private:
-  truncated_hash_type m_hash; // not initialized by a constructor; written through set_hash
-};
-
-/**
- * Each bucket entry has:
- * - A value of type `ValueType`.
- * - An integer to store how far the value of the bucket, if any, is from its
- * ideal bucket (ex: if the current bucket 5 has the value 'foo' and
- * `hash('foo') % nb_buckets` == 3, `dist_from_ideal_bucket()` will return 2 as
- * the current value of the bucket is two buckets away from its ideal bucket) If
- * there is no value in the bucket (i.e. `empty()` is true)
- * `dist_from_ideal_bucket()` will be < 0.
- * - A marker which tells us if the bucket is the last bucket of the bucket
- * array (useful for the iterator of the hash table).
- * - If `StoreHash` is true, 32 bits of the hash of the value, if any, are also
- * stored in the bucket. If the size of the hash is more than 32 bits, it is
- * truncated. We don't store the full hash as storing the hash is a potential
- * opportunity to use the unused space due to the alignement of the bucket_entry
- * structure. We can thus potentially store the hash without any extra space
- *   (which would not be possible with 64 bits of the hash).
- */
-template <typename ValueType, bool StoreHash>
-class bucket_entry : public bucket_entry_hash<StoreHash> {
-  using bucket_hash = bucket_entry_hash<StoreHash>;
-
-public:
-  using value_type = ValueType;
-  using distance_type = std::int_least16_t;
-
-  bucket_entry() noexcept
-      : bucket_hash(),
-        m_dist_from_ideal_bucket(EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET),
-        m_last_bucket(false) {
-    tsl_rh_assert(empty());
-  }
-
-  bucket_entry(bool last_bucket) noexcept
-      : bucket_hash(),
-        m_dist_from_ideal_bucket(EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET),
-        m_last_bucket(last_bucket) {
-    tsl_rh_assert(empty());
-  }
-
-  bucket_entry(const bucket_entry &other) noexcept(
-      std::is_nothrow_copy_constructible<value_type>::value)
-      : bucket_hash(other),
-        m_dist_from_ideal_bucket(EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET),
-        m_last_bucket(other.m_last_bucket) {
-    if (!other.empty()) {
-      ::new (static_cast<void *>(std::addressof(m_value)))
-          value_type(other.value());
-      m_dist_from_ideal_bucket = other.m_dist_from_ideal_bucket;
-    }
-  }
-
-  /**
-   * Never really used, but still necessary as we must call resize on an empty
-   * `std::vector<bucket_entry>`. and we need to support move-only types. See
-   * robin_hash constructor for details.
-   */
-  bucket_entry(bucket_entry &&other) noexcept(
-      std::is_nothrow_move_constructible<value_type>::value)
-      : bucket_hash(std::move(other)),
-        m_dist_from_ideal_bucket(EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET),
-        m_last_bucket(other.m_last_bucket) {
-    if (!other.empty()) {
-      ::new (static_cast<void *>(std::addressof(m_value)))
-          value_type(std::move(other.value()));
-      m_dist_from_ideal_bucket = other.m_dist_from_ideal_bucket;
-    }
-  }
-
-  bucket_entry &operator=(const bucket_entry &other) noexcept(
-      std::is_nothrow_copy_constructible<value_type>::value) {
-    if (this != &other) {
-      clear();
-
-      bucket_hash::operator=(other);
-      if (!other.empty()) {
-        ::new (static_cast<void *>(std::addressof(m_value)))
-            value_type(other.value());
-      }
-
-      m_dist_from_ideal_bucket = other.m_dist_from_ideal_bucket;
-      m_last_bucket = other.m_last_bucket;
-    }
-
-    return *this;
-  }
-
-  bucket_entry &operator=(bucket_entry &&) = delete;
-
-  ~bucket_entry() noexcept { clear(); }
-
-  void clear() noexcept {
-    if (!empty()) {
-      destroy_value();
-      m_dist_from_ideal_bucket = EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET;
-    }
-  }
-
-  bool empty() const noexcept {
-    return m_dist_from_ideal_bucket == EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET;
-  }
-
-  value_type &value() noexcept {
-    tsl_rh_assert(!empty());
-    return *reinterpret_cast<value_type *>(std::addressof(m_value));
-  }
-
-  const value_type &value() const noexcept {
-    tsl_rh_assert(!empty());
-    return *reinterpret_cast<const value_type *>(std::addressof(m_value));
-  }
-
-  distance_type dist_from_ideal_bucket() const noexcept {
-    return m_dist_from_ideal_bucket;
-  }
-
-  bool last_bucket() const noexcept { return m_last_bucket; }
-
-  void set_as_last_bucket() noexcept { m_last_bucket = true; }
-
-  template <typename... Args>
-  void set_value_of_empty_bucket(distance_type dist_from_ideal_bucket,
-                                 truncated_hash_type hash,
-                                 Args &&... value_type_args) {
-    tsl_rh_assert(dist_from_ideal_bucket >= 0);
-    tsl_rh_assert(empty());
-
-    ::new (static_cast<void *>(std::addressof(m_value)))
-        value_type(std::forward<Args>(value_type_args)...);
-    this->set_hash(hash);
-    m_dist_from_ideal_bucket = dist_from_ideal_bucket;
-
-    tsl_rh_assert(!empty());
-  }
-
-  void swap_with_value_in_bucket(distance_type &dist_from_ideal_bucket,
-                                 truncated_hash_type &hash, value_type &value) {
-    tsl_rh_assert(!empty());
-
-    using std::swap;
-    swap(value, this->value());
-    swap(dist_from_ideal_bucket, m_dist_from_ideal_bucket);
-
-    // Avoid warning of unused variable if StoreHash is false
-    (void)hash;
-    if (StoreHash) {
-      const truncated_hash_type tmp_hash = this->truncated_hash();
-      this->set_hash(hash);
-      hash = tmp_hash;
-    }
-  }
-
-  static truncated_hash_type truncate_hash(std::size_t hash) noexcept {
-    return truncated_hash_type(hash);
-  }
-
-private:
-  /**
-   * Run the destructor of the stored value. Does not touch the distance
-   * field; the bucket must not be empty (asserted).
-   */
-  void destroy_value() noexcept {
-    tsl_rh_assert(!empty());
-    value_type &stored = value();
-    stored.~value_type();
-  }
-
-private:
-  // Raw storage with the size and alignment of one value_type. The value is
-  // constructed in place by set_value_of_empty_bucket() and destroyed by
-  // destroy_value().
-  using storage = typename std::aligned_storage<sizeof(value_type),
-                                                alignof(value_type)>::type;
-
-  // Distance value marking a bucket as empty (compared against in empty()).
-  static const distance_type EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET = -1;
-
-  // Distance of the stored value from its ideal bucket, or the empty marker.
-  distance_type m_dist_from_ideal_bucket;
-  // Set by set_as_last_bucket(), read by last_bucket().
-  bool m_last_bucket;
-  // Holds the value while the bucket is not empty.
-  storage m_value;
-};
-
-/**
- * Internal common class used by `robin_map` and `robin_set`.
- *
- * ValueType is what will be stored by `robin_hash` (usually `std::pair<Key, T>`
- * for map and `Key` for set).
- *
- * `KeySelect` should be a `FunctionObject` which takes a `ValueType` in
- * parameter and returns a reference to the key.
- *
- * `ValueSelect` should be a `FunctionObject` which takes a `ValueType` in
- * parameter and returns a reference to the value. `ValueSelect` should be void
- * if there is no value (in a set for example).
- *
- * The strong exception guarantee only holds if the expression
- * `std::is_nothrow_swappable<ValueType>::value &&
- * std::is_nothrow_move_constructible<ValueType>::value` is true.
- *
- * Behaviour is undefined if the destructor of `ValueType` throws.
- */
-template <class ValueType, class KeySelect, class ValueSelect, class Hash,
-          class KeyEqual, class Allocator, bool StoreHash, class GrowthPolicy>
-class robin_hash : private Hash, private KeyEqual, private GrowthPolicy {
-private:
-  // True for any U other than `void`; used to enable the mapped-value API
-  // (value(), at(), operator[]) only when ValueSelect is not void.
-  template <typename U>
-  using has_mapped_type =
-      typename std::integral_constant<bool, !std::is_same<U, void>::value>;
-
-  // Compile-time checks: the two GrowthPolicy members below must be declared
-  // noexcept.
-  static_assert(
-      noexcept(std::declval<GrowthPolicy>().bucket_for_hash(std::size_t(0))),
-      "GrowthPolicy::bucket_for_hash must be noexcept.");
-  static_assert(noexcept(std::declval<GrowthPolicy>().clear()),
-                "GrowthPolicy::clear must be noexcept.");
-
-public:
-  // Forward declaration; the iterator class is defined further down.
-  template <bool IsConst> class robin_iterator;
-
-  // Container-style member types. The key type comes from KeySelect, the
-  // value type is the ValueType template parameter.
-  using key_type = typename KeySelect::key_type;
-  using value_type = ValueType;
-  using size_type = std::size_t;
-  using difference_type = std::ptrdiff_t;
-  using hasher = Hash;
-  using key_equal = KeyEqual;
-  using allocator_type = Allocator;
-  using reference = value_type &;
-  using const_reference = const value_type &;
-  using pointer = value_type *;
-  using const_pointer = const value_type *;
-  // Mutable and const flavours of the same iterator template.
-  using iterator = robin_iterator<false>;
-  using const_iterator = robin_iterator<true>;
-
-private:
-  /**
-   * Either store the hash because we are asked by the `StoreHash` template
-   * parameter or store the hash because it doesn't cost us anything in size and
-   * can be used to speed up rehash.
-   */
-  static constexpr bool STORE_HASH =
-      // Explicit request from the user.
-      StoreHash ||
-      // Otherwise: storing the hash must not make the bucket any bigger...
-      ((sizeof(tsl::detail_robin_hash::bucket_entry<value_type, true>) ==
-        sizeof(tsl::detail_robin_hash::bucket_entry<value_type, false>)) &&
-       // ...and the hash type must have the stored type's size, or the
-       // growth policy must be a power-of-two policy...
-       (sizeof(std::size_t) == sizeof(truncated_hash_type) ||
-        is_power_of_two_policy<GrowthPolicy>::value) &&
-       // Don't store the hash for primitive types with default hash.
-       (!std::is_arithmetic<key_type>::value ||
-        !std::is_same<Hash, std::hash<key_type>>::value));
-
-  /**
-   * Only use the stored hash on lookup if we are explicitly asked. We are not
-   * sure how slow the KeyEqual operation is. An extra comparison may slow
-   * things down with a fast KeyEqual.
-   */
-  static constexpr bool USE_STORED_HASH_ON_LOOKUP = StoreHash;
-
-  /**
-   * We can only use the hash on rehash if the size of the hash type is the same
-   * as the stored one or if we use a power of two modulo. In the case of the
-   * power of two modulo, we just mask the least significant bytes, so we only
-   * have to check that the truncated_hash_type didn't truncate more bytes.
-   */
-  static bool USE_STORED_HASH_ON_REHASH(size_type bucket_count) {
-    // `bucket_count` is only read on the power-of-two path below.
-    (void)bucket_count;
-
-    // Nothing is stored, so nothing can be reused.
-    if (!STORE_HASH) {
-      return false;
-    }
-
-    // The stored hash has the same size as the full hash: always usable.
-    if (sizeof(std::size_t) == sizeof(truncated_hash_type)) {
-      return true;
-    }
-
-    // Power-of-two policy: usable as long as the largest bucket index fits
-    // in the truncated hash type.
-    if (is_power_of_two_policy<GrowthPolicy>::value) {
-      tsl_rh_assert(bucket_count > 0);
-      return (bucket_count - 1) <=
-             std::numeric_limits<truncated_hash_type>::max();
-    }
-
-    return false;
-  }
-
-  // Bucket type actually used: stores the hash whenever STORE_HASH is true.
-  using bucket_entry =
-      tsl::detail_robin_hash::bucket_entry<value_type, STORE_HASH>;
-  using distance_type = typename bucket_entry::distance_type;
-
-  // The user's allocator rebound to bucket_entry, and the vector of buckets
-  // built on top of it.
-  using buckets_allocator = typename std::allocator_traits<
-      allocator_type>::template rebind_alloc<bucket_entry>;
-  using buckets_container_type = std::vector<bucket_entry, buckets_allocator>;
-
-public:
-  /**
-   * The 'operator*()' and 'operator->()' methods return a const reference and
-   * const pointer respectively to the stored value type.
-   *
-   * In case of a map, to get a mutable reference to the value associated to a
-   * key (the '.second' in the stored pair), you have to call 'value()'.
-   *
-   * The main reason for this is that if we returned a `std::pair<Key, T>&`
-   * instead of a `const std::pair<Key, T>&`, the user may modify the key which
-   * will put the map in a undefined state.
-   */
-  template <bool IsConst> class robin_iterator {
-    friend class robin_hash;
-
-  private:
-    // Pointer to a bucket, const-qualified for the const iterator.
-    using bucket_entry_ptr =
-        typename std::conditional<IsConst, const bucket_entry *,
-                                  bucket_entry *>::type;
-
-    // Only robin_hash (a friend) builds iterators from a bucket pointer.
-    robin_iterator(bucket_entry_ptr bucket) noexcept : m_bucket(bucket) {}
-
-  public:
-    using iterator_category = std::forward_iterator_tag;
-    using value_type = const typename robin_hash::value_type;
-    using difference_type = std::ptrdiff_t;
-    using reference = value_type &;
-    using pointer = value_type *;
-
-    // A default-constructed iterator holds a null bucket pointer so that two
-    // such iterators can be compared without reading an indeterminate value.
-    robin_iterator() noexcept : m_bucket(nullptr) {}
-
-    // Copy constructor from iterator to const_iterator.
-    template <bool TIsConst = IsConst,
-              typename std::enable_if<TIsConst>::type * = nullptr>
-    robin_iterator(const robin_iterator<!TIsConst> &other) noexcept
-        : m_bucket(other.m_bucket) {}
-
-    robin_iterator(const robin_iterator &other) = default;
-    robin_iterator(robin_iterator &&other) = default;
-    robin_iterator &operator=(const robin_iterator &other) = default;
-    robin_iterator &operator=(robin_iterator &&other) = default;
-
-    // Key of the pointed-to value, extracted with KeySelect.
-    const typename robin_hash::key_type &key() const {
-      return KeySelect()(m_bucket->value());
-    }
-
-    // Mapped value, read-only flavour (const iterator, ValueSelect not void).
-    template <class U = ValueSelect,
-              typename std::enable_if<has_mapped_type<U>::value &&
-                                      IsConst>::type * = nullptr>
-    const typename U::value_type &value() const {
-      return U()(m_bucket->value());
-    }
-
-    // Mapped value, mutable flavour (non-const iterator, ValueSelect not
-    // void).
-    template <class U = ValueSelect,
-              typename std::enable_if<has_mapped_type<U>::value &&
-                                      !IsConst>::type * = nullptr>
-    typename U::value_type &value() {
-      return U()(m_bucket->value());
-    }
-
-    reference operator*() const { return m_bucket->value(); }
-
-    pointer operator->() const { return std::addressof(m_bucket->value()); }
-
-    // Advance to the next non-empty bucket, or to one past the bucket flagged
-    // as last when there is none.
-    robin_iterator &operator++() {
-      while (true) {
-        // The last-bucket flag is checked before stepping, so the bucket
-        // past the flagged one is never inspected.
-        if (m_bucket->last_bucket()) {
-          ++m_bucket;
-          return *this;
-        }
-
-        ++m_bucket;
-        if (!m_bucket->empty()) {
-          return *this;
-        }
-      }
-    }
-
-    robin_iterator operator++(int) {
-      robin_iterator tmp(*this);
-      ++*this;
-
-      return tmp;
-    }
-
-    // Iterators are equal when they point to the same bucket.
-    friend bool operator==(const robin_iterator &lhs,
-                           const robin_iterator &rhs) {
-      return lhs.m_bucket == rhs.m_bucket;
-    }
-
-    friend bool operator!=(const robin_iterator &lhs,
-                           const robin_iterator &rhs) {
-      return !(lhs == rhs);
-    }
-
-  private:
-    bucket_entry_ptr m_bucket;
-  };
-
-public:
-#if defined(__cplusplus) && __cplusplus >= 201402L
-  /**
-   * Build a table with `bucket_count` default-constructed buckets (C++14 and
-   * later variant). Reports std::length_error through
-   * TSL_RH_THROW_OR_TERMINATE when `bucket_count` exceeds max_bucket_count().
-   * The load factors go through the setters.
-   */
-  robin_hash(size_type bucket_count, const Hash &hash, const KeyEqual &equal,
-             const Allocator &alloc,
-             float min_load_factor = DEFAULT_MIN_LOAD_FACTOR,
-             float max_load_factor = DEFAULT_MAX_LOAD_FACTOR)
-      : Hash(hash), KeyEqual(equal), GrowthPolicy(bucket_count),
-        m_buckets_data(
-            // The lambda runs the size check before the vector allocates.
-            [&]() {
-              if (bucket_count > max_bucket_count()) {
-                TSL_RH_THROW_OR_TERMINATE(
-                    std::length_error,
-                    "The map exceeds its maximum bucket count.");
-              }
-
-              return bucket_count;
-            }(),
-            alloc),
-        // With no bucket, point at the shared static empty bucket instead of
-        // the (empty) vector's data.
-        m_buckets(m_buckets_data.empty() ? static_empty_bucket_ptr()
-                                         : m_buckets_data.data()),
-        m_bucket_count(bucket_count), m_nb_elements(0),
-        m_grow_on_next_insert(false), m_try_skrink_on_next_insert(false) {
-    if (m_bucket_count > 0) {
-      tsl_rh_assert(!m_buckets_data.empty());
-      // Iterator increment relies on this flag to stop at the end.
-      m_buckets_data.back().set_as_last_bucket();
-    }
-
-    // The setters clamp the factors; max_load_factor() also computes
-    // m_load_threshold.
-    this->min_load_factor(min_load_factor);
-    this->max_load_factor(max_load_factor);
-  }
-#else
-  /**
-   * C++11 doesn't support the creation of a std::vector with a custom allocator
-   * and 'count' default-inserted elements. The needed constructor `explicit
-   * vector(size_type count, const Allocator& alloc = Allocator());` is only
-   * available in C++14 and later. We thus must resize after using the
-   * `vector(const Allocator& alloc)` constructor.
-   *
-   * We can't use `vector(size_type count, const T& value, const Allocator&
-   * alloc)` as it requires the value T to be copyable.
-   */
-  robin_hash(size_type bucket_count, const Hash &hash, const KeyEqual &equal,
-             const Allocator &alloc,
-             float min_load_factor = DEFAULT_MIN_LOAD_FACTOR,
-             float max_load_factor = DEFAULT_MAX_LOAD_FACTOR)
-      : Hash(hash), KeyEqual(equal), GrowthPolicy(bucket_count),
-        m_buckets_data(alloc), m_buckets(static_empty_bucket_ptr()),
-        m_bucket_count(bucket_count), m_nb_elements(0),
-        m_grow_on_next_insert(false), m_try_skrink_on_next_insert(false) {
-    // Reject oversized requests before any bucket is allocated. The message
-    // matches the one reported by the C++14 constructor.
-    if (bucket_count > max_bucket_count()) {
-      TSL_RH_THROW_OR_TERMINATE(std::length_error,
-                                "The map exceeds its maximum bucket count.");
-    }
-
-    if (m_bucket_count > 0) {
-      // Buckets are created by resize() because the vector was constructed
-      // from the allocator alone.
-      m_buckets_data.resize(m_bucket_count);
-      m_buckets = m_buckets_data.data();
-
-      tsl_rh_assert(!m_buckets_data.empty());
-      // Iterator increment relies on this flag to stop at the end.
-      m_buckets_data.back().set_as_last_bucket();
-    }
-
-    // The setters clamp the factors; max_load_factor() also computes
-    // m_load_threshold.
-    this->min_load_factor(min_load_factor);
-    this->max_load_factor(max_load_factor);
-  }
-#endif
-
-  /**
-   * Copy constructor: copies the policies, the bucket vector and every
-   * bookkeeping field of `other`.
-   */
-  robin_hash(const robin_hash &other)
-      : Hash(other), KeyEqual(other), GrowthPolicy(other),
-        m_buckets_data(other.m_buckets_data),
-        // Point at our own copy of the buckets (or at the shared static
-        // empty bucket), never at `other`'s storage.
-        m_buckets(m_buckets_data.empty() ? static_empty_bucket_ptr()
-                                         : m_buckets_data.data()),
-        m_bucket_count(other.m_bucket_count),
-        m_nb_elements(other.m_nb_elements),
-        m_load_threshold(other.m_load_threshold),
-        m_max_load_factor(other.m_max_load_factor),
-        m_grow_on_next_insert(other.m_grow_on_next_insert),
-        m_min_load_factor(other.m_min_load_factor),
-        m_try_skrink_on_next_insert(other.m_try_skrink_on_next_insert) {}
-
-  /**
-   * Move constructor: takes over the policies, the buckets and the
-   * bookkeeping of `other`, then resets `other` to an empty table.
-   * noexcept exactly when all four moved sub-objects are nothrow
-   * move-constructible.
-   */
-  robin_hash(robin_hash &&other) noexcept(
-      std::is_nothrow_move_constructible<Hash>::value &&
-      std::is_nothrow_move_constructible<KeyEqual>::value &&
-      std::is_nothrow_move_constructible<GrowthPolicy>::value &&
-      std::is_nothrow_move_constructible<buckets_container_type>::value)
-      : Hash(std::move(static_cast<Hash &>(other))),
-        KeyEqual(std::move(static_cast<KeyEqual &>(other))),
-        GrowthPolicy(std::move(static_cast<GrowthPolicy &>(other))),
-        m_buckets_data(std::move(other.m_buckets_data)),
-        m_buckets(m_buckets_data.empty() ? static_empty_bucket_ptr()
-                                         : m_buckets_data.data()),
-        m_bucket_count(other.m_bucket_count),
-        m_nb_elements(other.m_nb_elements),
-        m_load_threshold(other.m_load_threshold),
-        m_max_load_factor(other.m_max_load_factor),
-        m_grow_on_next_insert(other.m_grow_on_next_insert),
-        m_min_load_factor(other.m_min_load_factor),
-        m_try_skrink_on_next_insert(other.m_try_skrink_on_next_insert) {
-    // Reset the policy and the storage of the moved-from table...
-    other.GrowthPolicy::clear();
-    other.m_buckets_data.clear();
-    other.m_buckets = static_empty_bucket_ptr();
-
-    // ...then its counters and pending-rehash flags.
-    other.m_bucket_count = 0;
-    other.m_nb_elements = 0;
-    other.m_load_threshold = 0;
-    other.m_grow_on_next_insert = false;
-    other.m_try_skrink_on_next_insert = false;
-  }
-
-  /**
-   * Copy assignment. Self-assignment is a no-op; otherwise the policies,
-   * the buckets and all bookkeeping fields are copied from `other`.
-   */
-  robin_hash &operator=(const robin_hash &other) {
-    if (this == &other) {
-      return *this;
-    }
-
-    // Base sub-objects first.
-    Hash::operator=(other);
-    KeyEqual::operator=(other);
-    GrowthPolicy::operator=(other);
-
-    // Storage, then the pointer into our own copy of it.
-    m_buckets_data = other.m_buckets_data;
-    if (m_buckets_data.empty()) {
-      m_buckets = static_empty_bucket_ptr();
-    } else {
-      m_buckets = m_buckets_data.data();
-    }
-    m_bucket_count = other.m_bucket_count;
-    m_nb_elements = other.m_nb_elements;
-
-    // Load-factor bookkeeping.
-    m_load_threshold = other.m_load_threshold;
-    m_max_load_factor = other.m_max_load_factor;
-    m_grow_on_next_insert = other.m_grow_on_next_insert;
-
-    m_min_load_factor = other.m_min_load_factor;
-    m_try_skrink_on_next_insert = other.m_try_skrink_on_next_insert;
-
-    return *this;
-  }
-
-  /**
-   * Move assignment: swaps the two tables, then clears `other`, which by
-   * then holds the previous content of this table.
-   */
-  robin_hash &operator=(robin_hash &&other) {
-    robin_hash &self = *this;
-    other.swap(self);
-    other.clear();
-
-    return self;
-  }
-
-  /** Allocator of the bucket vector, converted to `allocator_type`. */
-  allocator_type get_allocator() const {
-    allocator_type user_allocator = m_buckets_data.get_allocator();
-    return user_allocator;
-  }
-
-  /*
-   * Iterators
-   */
-  /**
-   * Iterator to the first non-empty bucket, or to the end position when
-   * every bucket is empty.
-   */
-  iterator begin() noexcept {
-    std::size_t first_used = 0;
-    for (; first_used < m_bucket_count; ++first_used) {
-      if (!m_buckets[first_used].empty()) {
-        break;
-      }
-    }
-
-    return iterator(m_buckets + first_used);
-  }
-
-  /** Const overload; same position as cbegin(). */
-  const_iterator begin() const noexcept {
-    return cbegin();
-  }
-
-  /**
-   * Const iterator to the first non-empty bucket, or to the end position
-   * when every bucket is empty.
-   */
-  const_iterator cbegin() const noexcept {
-    std::size_t first_used = 0;
-    for (; first_used < m_bucket_count; ++first_used) {
-      if (!m_buckets[first_used].empty()) {
-        break;
-      }
-    }
-
-    return const_iterator(m_buckets + first_used);
-  }
-
-  /** Iterator one past the last bucket. */
-  iterator end() noexcept {
-    return iterator(m_buckets + m_bucket_count);
-  }
-
-  /** Const overload; same position as cend(). */
-  const_iterator end() const noexcept {
-    return cend();
-  }
-
-  /** Const iterator one past the last bucket. */
-  const_iterator cend() const noexcept {
-    const std::size_t past_last = m_bucket_count;
-    return const_iterator(m_buckets + past_last);
-  }
-
-  /*
-   * Capacity
-   */
-  /** True when the table holds no element. */
-  bool empty() const noexcept {
-    return m_nb_elements == 0;
-  }
-
-  /** Number of elements currently stored. */
-  size_type size() const noexcept {
-    return m_nb_elements;
-  }
-
-  /** Largest size the underlying bucket vector reports it can reach. */
-  size_type max_size() const noexcept {
-    return m_buckets_data.max_size();
-  }
-
-  /*
-   * Modifiers
-   */
-  /**
-   * Clear every bucket and reset the element count and the grow-on-next-insert
-   * flag. The bucket vector itself keeps its size.
-   */
-  void clear() noexcept {
-    const auto buckets_end = m_buckets_data.end();
-    for (auto it = m_buckets_data.begin(); it != buckets_end; ++it) {
-      it->clear();
-    }
-
-    m_nb_elements = 0;
-    m_grow_on_next_insert = false;
-  }
-
-  /**
-   * Insert `value` unless an element with the same key is already present.
-   * Returns the iterator to the element and whether an insertion happened.
-   */
-  template <typename P> std::pair<iterator, bool> insert(P &&value) {
-    const auto &key = KeySelect()(value);
-    return insert_impl(key, std::forward<P>(value));
-  }
-
-  /**
-   * Insert with a hint: when `hint` already designates an element whose key
-   * equals the key of `value`, return it; otherwise do a regular insert.
-   */
-  template <typename P> iterator insert_hint(const_iterator hint, P &&value) {
-    const bool hint_matches =
-        hint != cend() && compare_keys(KeySelect()(*hint), KeySelect()(value));
-    if (hint_matches) {
-      return mutable_iterator(hint);
-    }
-
-    return insert(std::forward<P>(value)).first;
-  }
-
-  /**
-   * Insert every element of [first, last). For forward (or stronger)
-   * iterators the range length is measured first, and room is reserved when
-   * it exceeds what the current load threshold leaves free.
-   */
-  template <class InputIt> void insert(InputIt first, InputIt last) {
-    using category =
-        typename std::iterator_traits<InputIt>::iterator_category;
-
-    if (std::is_base_of<std::forward_iterator_tag, category>::value) {
-      const auto nb_elements_insert = std::distance(first, last);
-      const size_type nb_free_buckets = m_load_threshold - size();
-      tsl_rh_assert(m_load_threshold >= size());
-
-      const bool needs_more_room =
-          nb_elements_insert > 0 &&
-          nb_free_buckets < size_type(nb_elements_insert);
-      if (needs_more_room) {
-        reserve(size() + size_type(nb_elements_insert));
-      }
-    }
-
-    while (first != last) {
-      insert(*first);
-      ++first;
-    }
-  }
-
-  /**
-   * Insert (key, obj) when the key is absent; otherwise assign `obj` to the
-   * mapped value of the existing element. The returned bool tells whether an
-   * insertion happened.
-   */
-  template <class K, class M>
-  std::pair<iterator, bool> insert_or_assign(K &&key, M &&obj) {
-    auto result = try_emplace(std::forward<K>(key), std::forward<M>(obj));
-    const bool inserted = result.second;
-    if (!inserted) {
-      result.first.value() = std::forward<M>(obj);
-    }
-
-    return result;
-  }
-
-  /**
-   * Hinted insert_or_assign: when `hint` designates an element with an equal
-   * key, assign through it; otherwise fall back to the unhinted overload.
-   */
-  template <class K, class M>
-  iterator insert_or_assign(const_iterator hint, K &&key, M &&obj) {
-    const bool hint_matches =
-        hint != cend() && compare_keys(KeySelect()(*hint), key);
-    if (!hint_matches) {
-      return insert_or_assign(std::forward<K>(key), std::forward<M>(obj))
-          .first;
-    }
-
-    auto target = mutable_iterator(hint);
-    target.value() = std::forward<M>(obj);
-
-    return target;
-  }
-
-  /**
-   * Build a value_type from `args`, then insert it. The value is constructed
-   * even when the key turns out to be present already.
-   */
-  template <class... Args> std::pair<iterator, bool> emplace(Args &&... args) {
-    value_type built(std::forward<Args>(args)...);
-    return insert(std::move(built));
-  }
-
-  /** Build a value_type from `args`, then insert it using `hint`. */
-  template <class... Args>
-  iterator emplace_hint(const_iterator hint, Args &&... args) {
-    value_type built(std::forward<Args>(args)...);
-    return insert_hint(hint, std::move(built));
-  }
-
-  /**
-   * Insert an element built piecewise from `key` and `args` unless the key
-   * is already present. The arguments are only forwarded as tuples of
-   * references here; insert_impl() decides whether a value is constructed.
-   */
-  template <class K, class... Args>
-  std::pair<iterator, bool> try_emplace(K &&key, Args &&... args) {
-    return insert_impl(key, std::piecewise_construct,
-                       std::forward_as_tuple(std::forward<K>(key)),
-                       std::forward_as_tuple(std::forward<Args>(args)...));
-  }
-
-  /**
-   * Hinted try_emplace: return `hint` when it already designates an element
-   * with an equal key, otherwise defer to try_emplace().
-   */
-  template <class K, class... Args>
-  iterator try_emplace_hint(const_iterator hint, K &&key, Args &&... args) {
-    const bool hint_matches =
-        hint != cend() && compare_keys(KeySelect()(*hint), key);
-    if (hint_matches) {
-      return mutable_iterator(hint);
-    }
-
-    return try_emplace(std::forward<K>(key), std::forward<Args>(args)...).first;
-  }
-
-  /**
-   * Here to avoid `template<class K> size_type erase(const K& key)` being used
-   * when we use an `iterator` instead of a `const_iterator`.
-   */
-  iterator erase(iterator pos) {
-    erase_from_bucket(pos);
-    m_try_skrink_on_next_insert = true;
-
-    /**
-     * erase_from_bucket() shifts the following values backward, so the
-     * bucket may already hold the next value. Only when it was left empty do
-     * we have to advance to the next non-empty bucket.
-     */
-    const bool bucket_left_empty = pos.m_bucket->empty();
-    if (bucket_left_empty) {
-      ++pos;
-    }
-
-    return pos;
-  }
-
-  /** Const-iterator overload; forwards to erase(iterator). */
-  iterator erase(const_iterator pos) {
-    return erase(mutable_iterator(pos));
-  }
-
-  /**
-   * Erase every element in [first, last) and return an iterator to the
-   * element that followed the erased range (or end()). Any erasure arms the
-   * shrink check for the next insert, like the other erase overloads.
-   */
-  iterator erase(const_iterator first, const_iterator last) {
-    // Empty range: nothing erased, nothing to arm.
-    if (first == last) {
-      return mutable_iterator(first);
-    }
-
-    auto first_mutable = mutable_iterator(first);
-    auto last_mutable = mutable_iterator(last);
-    for (auto it = first_mutable.m_bucket; it != last_mutable.m_bucket; ++it) {
-      if (!it->empty()) {
-        it->clear();
-        m_nb_elements--;
-      }
-    }
-
-    // Set on every path that erased something, including the early return
-    // just below.
-    m_try_skrink_on_next_insert = true;
-
-    if (last_mutable == end()) {
-      return end();
-    }
-
-    /*
-     * Backward shift on the values which come after the deleted values.
-     * We try to move the values closer to their ideal bucket.
-     */
-    std::size_t icloser_bucket =
-        static_cast<std::size_t>(first_mutable.m_bucket - m_buckets);
-    std::size_t ito_move_closer_value =
-        static_cast<std::size_t>(last_mutable.m_bucket - m_buckets);
-    tsl_rh_assert(ito_move_closer_value > icloser_bucket);
-
-    // Where the element at `last` ends up after the shift: it moves back by
-    // at most its own distance from its ideal bucket.
-    const std::size_t ireturn_bucket =
-        ito_move_closer_value -
-        std::min(
-            ito_move_closer_value - icloser_bucket,
-            std::size_t(
-                m_buckets[ito_move_closer_value].dist_from_ideal_bucket()));
-
-    // Stops at the first bucket that is empty or already in its ideal place
-    // (distance <= 0), or at the end of the table.
-    while (ito_move_closer_value < m_bucket_count &&
-           m_buckets[ito_move_closer_value].dist_from_ideal_bucket() > 0) {
-      icloser_bucket =
-          ito_move_closer_value -
-          std::min(
-              ito_move_closer_value - icloser_bucket,
-              std::size_t(
-                  m_buckets[ito_move_closer_value].dist_from_ideal_bucket()));
-
-      tsl_rh_assert(m_buckets[icloser_bucket].empty());
-      const distance_type new_distance = distance_type(
-          m_buckets[ito_move_closer_value].dist_from_ideal_bucket() -
-          (ito_move_closer_value - icloser_bucket));
-      m_buckets[icloser_bucket].set_value_of_empty_bucket(
-          new_distance, m_buckets[ito_move_closer_value].truncated_hash(),
-          std::move(m_buckets[ito_move_closer_value].value()));
-      m_buckets[ito_move_closer_value].clear();
-
-      ++icloser_bucket;
-      ++ito_move_closer_value;
-    }
-
-    return iterator(m_buckets + ireturn_bucket);
-  }
-
-  /** Erase the element with the given key; returns 1 if found, else 0. */
-  template <class K> size_type erase(const K &key) {
-    const std::size_t hash = hash_key(key);
-    return erase(key, hash);
-  }
-
-  /**
-   * Erase the element with the given key, using the precomputed `hash`.
-   * Returns 1 when an element was removed, 0 otherwise.
-   */
-  template <class K> size_type erase(const K &key, std::size_t hash) {
-    auto found = find(key, hash);
-    if (found == end()) {
-      return 0;
-    }
-
-    erase_from_bucket(found);
-    m_try_skrink_on_next_insert = true;
-
-    return 1;
-  }
-
-  /**
-   * Exchange the whole state of the two tables: base sub-objects, bucket
-   * storage, bucket pointer and every bookkeeping field.
-   */
-  void swap(robin_hash &other) {
-    // Unqualified calls so that ADL can pick a user-provided swap.
-    using std::swap;
-
-    // Base sub-objects.
-    swap(static_cast<Hash &>(*this), static_cast<Hash &>(other));
-    swap(static_cast<KeyEqual &>(*this), static_cast<KeyEqual &>(other));
-    swap(static_cast<GrowthPolicy &>(*this),
-         static_cast<GrowthPolicy &>(other));
-    // Storage, and the pointers into it.
-    swap(m_buckets_data, other.m_buckets_data);
-    swap(m_buckets, other.m_buckets);
-    // Counters and load-factor bookkeeping.
-    swap(m_bucket_count, other.m_bucket_count);
-    swap(m_nb_elements, other.m_nb_elements);
-    swap(m_load_threshold, other.m_load_threshold);
-    swap(m_max_load_factor, other.m_max_load_factor);
-    swap(m_grow_on_next_insert, other.m_grow_on_next_insert);
-    swap(m_min_load_factor, other.m_min_load_factor);
-    swap(m_try_skrink_on_next_insert, other.m_try_skrink_on_next_insert);
-  }
-
-  /*
-   * Lookup
-   */
-  /** Mapped value for `key`; hashes the key and defers to at(key, hash). */
-  template <
-      class K, class U = ValueSelect,
-      typename std::enable_if<has_mapped_type<U>::value>::type * = nullptr>
-  typename U::value_type &at(const K &key) {
-    const std::size_t hash = hash_key(key);
-    return at(key, hash);
-  }
-
-  /**
-   * Mutable at(): reuses the const overload and strips the const from the
-   * returned reference.
-   */
-  template <
-      class K, class U = ValueSelect,
-      typename std::enable_if<has_mapped_type<U>::value>::type * = nullptr>
-  typename U::value_type &at(const K &key, std::size_t hash) {
-    const robin_hash *const self = this;
-    return const_cast<typename U::value_type &>(self->at(key, hash));
-  }
-
-  /** Const at(): hashes the key and defers to at(key, hash) const. */
-  template <
-      class K, class U = ValueSelect,
-      typename std::enable_if<has_mapped_type<U>::value>::type * = nullptr>
-  const typename U::value_type &at(const K &key) const {
-    const std::size_t hash = hash_key(key);
-    return at(key, hash);
-  }
-
-  /**
-   * Mapped value for `key` (precomputed `hash`). A missing key is reported
-   * as std::out_of_range through TSL_RH_THROW_OR_TERMINATE.
-   */
-  template <
-      class K, class U = ValueSelect,
-      typename std::enable_if<has_mapped_type<U>::value>::type * = nullptr>
-  const typename U::value_type &at(const K &key, std::size_t hash) const {
-    auto found = find(key, hash);
-    if (found != cend()) {
-      return found.value();
-    }
-
-    TSL_RH_THROW_OR_TERMINATE(std::out_of_range, "Couldn't find key.");
-  }
-
-  /**
-   * Mapped value for `key`, inserting an element through try_emplace()
-   * (no mapped-value arguments) when the key is absent.
-   */
-  template <
-      class K, class U = ValueSelect,
-      typename std::enable_if<has_mapped_type<U>::value>::type * = nullptr>
-  typename U::value_type &operator[](K &&key) {
-    auto result = try_emplace(std::forward<K>(key));
-    return result.first.value();
-  }
-
-  /** Number of elements with the given key: 0 or 1. */
-  template <class K> size_type count(const K &key) const {
-    const std::size_t hash = hash_key(key);
-    return count(key, hash);
-  }
-
-  /** Number of elements with the given key (precomputed hash): 0 or 1. */
-  template <class K> size_type count(const K &key, std::size_t hash) const {
-    const bool present = find(key, hash) != cend();
-    return present ? 1 : 0;
-  }
-
-  /** Iterator to the element with the given key, or end(). */
-  template <class K> iterator find(const K &key) {
-    const std::size_t hash = hash_key(key);
-    return find_impl(key, hash);
-  }
-
-  /** Same lookup with a hash already computed by the caller. */
-  template <class K> iterator find(const K &key, std::size_t hash) {
-    iterator found = find_impl(key, hash);
-    return found;
-  }
-
-  /** Const iterator to the element with the given key, or cend(). */
-  template <class K> const_iterator find(const K &key) const {
-    const std::size_t hash = hash_key(key);
-    return find_impl(key, hash);
-  }
-
-  /** Const lookup with a hash already computed by the caller. */
-  template <class K> const_iterator find(const K &key, std::size_t hash) const {
-    const_iterator found = find_impl(key, hash);
-    return found;
-  }
-
-  /** Range of elements with the given key: empty or one element long. */
-  template <class K> std::pair<iterator, iterator> equal_range(const K &key) {
-    const std::size_t hash = hash_key(key);
-    return equal_range(key, hash);
-  }
-
-  /**
-   * Range of elements with the given key (precomputed hash): (end, end)
-   * when absent, otherwise (it, next(it)).
-   */
-  template <class K>
-  std::pair<iterator, iterator> equal_range(const K &key, std::size_t hash) {
-    iterator range_begin = find(key, hash);
-    iterator range_end = range_begin;
-    if (range_begin != end()) {
-      range_end = std::next(range_begin);
-    }
-
-    return std::make_pair(range_begin, range_end);
-  }
-
-  /** Const range of elements with the given key. */
-  template <class K>
-  std::pair<const_iterator, const_iterator> equal_range(const K &key) const {
-    const std::size_t hash = hash_key(key);
-    return equal_range(key, hash);
-  }
-
-  /**
-   * Const range of elements with the given key (precomputed hash):
-   * (cend, cend) when absent, otherwise (it, next(it)).
-   */
-  template <class K>
-  std::pair<const_iterator, const_iterator>
-  equal_range(const K &key, std::size_t hash) const {
-    const_iterator range_begin = find(key, hash);
-    const_iterator range_end = range_begin;
-    if (range_begin != cend()) {
-      range_end = std::next(range_begin);
-    }
-
-    return std::make_pair(range_begin, range_end);
-  }
-
-  /*
-   * Bucket interface
-   */
-  /** Number of buckets in the table. */
-  size_type bucket_count() const {
-    return m_bucket_count;
-  }
-
-  /**
-   * Largest supported bucket count: the smaller of the growth policy's limit
-   * and the bucket vector's max_size().
-   */
-  size_type max_bucket_count() const {
-    const auto policy_limit = GrowthPolicy::max_bucket_count();
-    const auto storage_limit = m_buckets_data.max_size();
-    return std::min(policy_limit, storage_limit);
-  }
-
-  /*
-   * Hash policy
-   */
-  /**
-   * Ratio of stored elements to buckets, or 0 for a table without buckets
-   * (avoids dividing by zero).
-   */
-  float load_factor() const {
-    const size_type nb_buckets = bucket_count();
-    if (nb_buckets == 0) {
-      return 0;
-    }
-
-    return float(m_nb_elements) / float(nb_buckets);
-  }
-
-  /** Current minimum load factor. */
-  float min_load_factor() const {
-    return m_min_load_factor;
-  }
-
-  /** Current maximum load factor. */
-  float max_load_factor() const {
-    return m_max_load_factor;
-  }
-
-  /**
-   * Set the minimum load factor, clamped to
-   * [MINIMUM_MIN_LOAD_FACTOR, MAXIMUM_MIN_LOAD_FACTOR].
-   */
-  void min_load_factor(float ml) {
-    const float lowest = float(MINIMUM_MIN_LOAD_FACTOR);
-    const float highest = float(MAXIMUM_MIN_LOAD_FACTOR);
-    m_min_load_factor = clamp(ml, lowest, highest);
-  }
-
-  /**
-   * Set the maximum load factor, clamped to
-   * [MINIMUM_MAX_LOAD_FACTOR, MAXIMUM_MAX_LOAD_FACTOR], and recompute the
-   * load threshold from the current bucket count.
-   */
-  void max_load_factor(float ml) {
-    const float lowest = float(MINIMUM_MAX_LOAD_FACTOR);
-    const float highest = float(MAXIMUM_MAX_LOAD_FACTOR);
-    m_max_load_factor = clamp(ml, lowest, highest);
-
-    const float nb_buckets = float(bucket_count());
-    m_load_threshold = size_type(nb_buckets * m_max_load_factor);
-  }
-
-  /**
-   * Rebuild the table with at least `count` buckets, and never fewer than
-   * the current size needs under the maximum load factor.
-   */
-  void rehash(size_type count) {
-    const size_type min_for_current_size =
-        size_type(std::ceil(float(size()) / max_load_factor()));
-    count = std::max(count, min_for_current_size);
-    rehash_impl(count);
-  }
-
-  /**
-   * Make room for `count` elements: converts the element count into a bucket
-   * count using the maximum load factor, then rehashes.
-   */
-  void reserve(size_type count) {
-    const size_type nb_buckets_needed =
-        size_type(std::ceil(float(count) / max_load_factor()));
-    rehash(nb_buckets_needed);
-  }
-
-  /*
-   * Observers
-   */
-  /** Copy of the hash functor (the Hash base sub-object). */
-  hasher hash_function() const {
-    return static_cast<const Hash &>(*this);
-  }
-
-  /** Copy of the key-equality functor (the KeyEqual base sub-object). */
-  key_equal key_eq() const {
-    return static_cast<const KeyEqual &>(*this);
-  }
-
-  /*
-   * Other
-   */
-  /** Turn a const_iterator into an iterator on the same bucket. */
-  iterator mutable_iterator(const_iterator pos) {
-    bucket_entry *const bucket = const_cast<bucket_entry *>(pos.m_bucket);
-    return iterator(bucket);
-  }
-
-private:
-  /** Hash `key` with the Hash base sub-object. */
-  template <class K> std::size_t hash_key(const K &key) const {
-    const std::size_t hash = Hash::operator()(key);
-    return hash;
-  }
-
-  /** Compare two keys with the KeyEqual base sub-object. */
-  template <class K1, class K2>
-  bool compare_keys(const K1 &key1, const K2 &key2) const {
-    const bool same_key = KeyEqual::operator()(key1, key2);
-    return same_key;
-  }
-
-  /**
-   * Index of the ideal bucket for `hash`, as computed by the growth policy.
-   * Asserts the index is in range; an empty table is expected to yield 0.
-   */
-  std::size_t bucket_for_hash(std::size_t hash) const {
-    const std::size_t ideal_index = GrowthPolicy::bucket_for_hash(hash);
-    tsl_rh_assert(ideal_index < m_bucket_count ||
-                  (ideal_index == 0 && m_bucket_count == 0));
-
-    return ideal_index;
-  }
-
-  /**
-   * Index following `index`, wrapping around. Power-of-two policies: the
-   * wrap is done by masking with the policy's m_mask.
-   */
-  template <class U = GrowthPolicy,
-            typename std::enable_if<is_power_of_two_policy<U>::value>::type * =
-                nullptr>
-  std::size_t next_bucket(std::size_t index) const noexcept {
-    tsl_rh_assert(index < bucket_count());
-
-    const std::size_t following = index + 1;
-    return following & this->m_mask;
-  }
-
-  /**
-   * Index following `index`, wrapping around. Other policies: go back to 0
-   * once the bucket count is reached.
-   */
-  template <class U = GrowthPolicy,
-            typename std::enable_if<!is_power_of_two_policy<U>::value>::type * =
-                nullptr>
-  std::size_t next_bucket(std::size_t index) const noexcept {
-    tsl_rh_assert(index < bucket_count());
-
-    const std::size_t following = index + 1;
-    if (following == bucket_count()) {
-      return 0;
-    }
-
-    return following;
-  }
-
-  /**
-   * Mutable lookup: runs the const find() and converts the resulting
-   * const_iterator.
-   */
-  template <class K> iterator find_impl(const K &key, std::size_t hash) {
-    const robin_hash *const self = this;
-    const const_iterator found = self->find(key, hash);
-    return mutable_iterator(found);
-  }
-
-  /**
-   * Probe from the ideal bucket of `hash`. The search stops as soon as a
-   * bucket is closer to its own ideal bucket than the probe is to ours
-   * (which includes empty buckets, whose distance is the empty marker).
-   */
-  template <class K>
-  const_iterator find_impl(const K &key, std::size_t hash) const {
-    std::size_t ibucket = bucket_for_hash(hash);
-
-    for (distance_type probe_distance = 0;
-         probe_distance <= m_buckets[ibucket].dist_from_ideal_bucket();
-         probe_distance++) {
-      // The stored hash is only consulted when the user asked for StoreHash.
-      if (TSL_RH_LIKELY(
-              (!USE_STORED_HASH_ON_LOOKUP ||
-               m_buckets[ibucket].bucket_hash_equal(hash)) &&
-              compare_keys(KeySelect()(m_buckets[ibucket].value()), key))) {
-        return const_iterator(m_buckets + ibucket);
-      }
-
-      ibucket = next_bucket(ibucket);
-    }
-
-    return cend();
-  }
-
-  /**
-   * Remove the value designated by `pos` and close the gap it leaves.
-   * Does not touch m_try_skrink_on_next_insert; the public erase overloads
-   * set it themselves.
-   */
-  void erase_from_bucket(iterator pos) {
-    pos.m_bucket->clear();
-    m_nb_elements--;
-
-    /**
-     * Backward shift, swap the empty bucket, previous_ibucket, with the values
-     * on its right, ibucket, until we cross another empty bucket or if the
-     * other bucket has a distance_from_ideal_bucket == 0.
-     *
-     * We try to move the values closer to their ideal bucket.
-     */
-    std::size_t previous_ibucket =
-        static_cast<std::size_t>(pos.m_bucket - m_buckets);
-    std::size_t ibucket = next_bucket(previous_ibucket);
-
-    // An empty bucket carries a negative distance, so it ends the loop too.
-    while (m_buckets[ibucket].dist_from_ideal_bucket() > 0) {
-      tsl_rh_assert(m_buckets[previous_ibucket].empty());
-
-      // Moving one slot back brings the value one step closer to its ideal
-      // bucket.
-      const distance_type new_distance =
-          distance_type(m_buckets[ibucket].dist_from_ideal_bucket() - 1);
-      m_buckets[previous_ibucket].set_value_of_empty_bucket(
-          new_distance, m_buckets[ibucket].truncated_hash(),
-          std::move(m_buckets[ibucket].value()));
-      // Destroy the moved-from value and mark its bucket empty.
-      m_buckets[ibucket].clear();
-
-      previous_ibucket = ibucket;
-      ibucket = next_bucket(ibucket);
-    }
-  }
-
-  /**
-   * Core insertion routine. Looks `key` up first; when it is already present
-   * returns (iterator to it, false) without constructing anything. Otherwise
-   * builds a value_type from `value_type_args` in the table and returns
-   * (iterator to it, true).
-   */
-  template <class K, class... Args>
-  std::pair<iterator, bool> insert_impl(const K &key,
-                                        Args &&... value_type_args) {
-    const std::size_t hash = hash_key(key);
-
-    std::size_t ibucket = bucket_for_hash(hash);
-    distance_type dist_from_ideal_bucket = 0;
-
-    // Same probing as find_impl(): stop once a bucket is closer to its ideal
-    // position than we are to ours.
-    while (dist_from_ideal_bucket <=
-           m_buckets[ibucket].dist_from_ideal_bucket()) {
-      if ((!USE_STORED_HASH_ON_LOOKUP ||
-           m_buckets[ibucket].bucket_hash_equal(hash)) &&
-          compare_keys(KeySelect()(m_buckets[ibucket].value()), key)) {
-        return std::make_pair(iterator(m_buckets + ibucket), false);
-      }
-
-      ibucket = next_bucket(ibucket);
-      dist_from_ideal_bucket++;
-    }
-
-    // When the call returns true, the insertion position found above is
-    // recomputed from scratch (no key comparison needed: the key is known to
-    // be absent).
-    if (rehash_on_extreme_load()) {
-      ibucket = bucket_for_hash(hash);
-      dist_from_ideal_bucket = 0;
-
-      while (dist_from_ideal_bucket <=
-             m_buckets[ibucket].dist_from_ideal_bucket()) {
-        ibucket = next_bucket(ibucket);
-        dist_from_ideal_bucket++;
-      }
-    }
-
-    if (m_buckets[ibucket].empty()) {
-      // Free slot: construct the value directly in it.
-      m_buckets[ibucket].set_value_of_empty_bucket(
-          dist_from_ideal_bucket, bucket_entry::truncate_hash(hash),
-          std::forward<Args>(value_type_args)...);
-    } else {
-      // Occupied slot: take it over and push the displaced values further.
-      insert_value(ibucket, dist_from_ideal_bucket,
-                   bucket_entry::truncate_hash(hash),
-                   std::forward<Args>(value_type_args)...);
-    }
-
-    m_nb_elements++;
-    /*
-     * The value will be inserted in ibucket in any case, either because it was
-     * empty or by stealing the bucket (robin hood).
-     */
-    return std::make_pair(iterator(m_buckets + ibucket), true);
-  }
-
-  /**
-   * Build the value from `value_type_args` in a local, then hand it to
-   * insert_value_impl(), which moves it into the table.
-   */
-  template <class... Args>
-  void insert_value(std::size_t ibucket, distance_type dist_from_ideal_bucket,
-                    truncated_hash_type hash, Args &&... value_type_args) {
-    value_type built(std::forward<Args>(value_type_args)...);
-    insert_value_impl(ibucket, dist_from_ideal_bucket, hash, built);
-  }
-
-  /**
-   * Overload for a ready-made value: passed on as an lvalue, since
-   * insert_value_impl() takes a reference and moves from it itself.
-   */
-  void insert_value(std::size_t ibucket, distance_type dist_from_ideal_bucket,
-                    truncated_hash_type hash, value_type &&value) {
-    value_type &to_insert = value;
-    insert_value_impl(ibucket, dist_from_ideal_bucket, hash, to_insert);
-  }
-
-  /*
-   * We don't use `value_type&& value` as last argument due to a bug in MSVC
-   * when `value_type` is a pointer, The compiler is not able to see the
-   * difference between `std::string*` and `std::string*&&` resulting in compile
-   * error.
-   *
-   * The `value` will be in a moved state at the end of the function.
-   */
-  void insert_value_impl(std::size_t ibucket,
-                         distance_type dist_from_ideal_bucket,
-                         truncated_hash_type hash, value_type &value) {
-    // Take over the first bucket: after the swap, `value`, `hash` and the
-    // distance describe the displaced entry, which must be re-homed.
-    m_buckets[ibucket].swap_with_value_in_bucket(dist_from_ideal_bucket, hash,
-                                                 value);
-    ibucket = next_bucket(ibucket);
-    dist_from_ideal_bucket++;
-
-    // Walk forward until a free bucket is found.
-    while (!m_buckets[ibucket].empty()) {
-      // The carried entry is further from its ideal bucket than the resident
-      // one: it takes the slot and the resident becomes the carried entry.
-      if (dist_from_ideal_bucket >
-          m_buckets[ibucket].dist_from_ideal_bucket()) {
-        if (dist_from_ideal_bucket >= REHASH_ON_HIGH_NB_PROBES__NPROBES &&
-            load_factor() >= REHASH_ON_HIGH_NB_PROBES__MIN_LOAD_FACTOR) {
-          /**
-           * The number of probes is really high, rehash the map on the next
-           * insert. Difficult to do now as rehash may throw an exception.
-           */
-          m_grow_on_next_insert = true;
-        }
-
-        m_buckets[ibucket].swap_with_value_in_bucket(dist_from_ideal_bucket,
-                                                     hash, value);
-      }
-
-      ibucket = next_bucket(ibucket);
-      dist_from_ideal_bucket++;
-    }
-
-    // Free bucket reached: the carried entry is moved into it.
-    m_buckets[ibucket].set_value_of_empty_bucket(dist_from_ideal_bucket, hash,
-                                                 std::move(value));
-  }
-
-  void rehash_impl(size_type count) { // Builds a new table from `count`, moves every stored value into it, then swaps it with *this.
-    robin_hash new_table(count, static_cast<Hash &>(*this),
-                         static_cast<KeyEqual &>(*this), get_allocator(),
-                         m_min_load_factor, m_max_load_factor);
-
-    const bool use_stored_hash =
-        USE_STORED_HASH_ON_REHASH(new_table.bucket_count()); // decided once for the whole rehash, from the new bucket count
-    for (auto &bucket : m_buckets_data) {
-      if (bucket.empty()) {
-        continue;
-      }
-
-      const std::size_t hash =
-          use_stored_hash ? bucket.truncated_hash()
-                          : new_table.hash_key(KeySelect()(bucket.value())); // otherwise the key is hashed again
-
-      new_table.insert_value_on_rehash(new_table.bucket_for_hash(hash), 0,
-                                       bucket_entry::truncate_hash(hash),
-                                       std::move(bucket.value())); // values are moved out of the old buckets, not copied
-    }
-
-    new_table.m_nb_elements = m_nb_elements; // insert_value_on_rehash does not touch the element count, so it is copied here
-    new_table.swap(*this);
-  }
-
-  void insert_value_on_rehash(std::size_t ibucket, // Insertion used by rehash_impl: no element-count update and no grow flag.
-                              distance_type dist_from_ideal_bucket,
-                              truncated_hash_type hash, value_type &&value) {
-    while (true) { // the only exit is the return taken when an empty bucket is found
-      if (dist_from_ideal_bucket >
-          m_buckets[ibucket].dist_from_ideal_bucket()) { // NOTE(review): the empty check sits inside this branch, so an empty bucket presumably reports a distance below any carried distance - confirm in bucket_entry
-        if (m_buckets[ibucket].empty()) {
-          m_buckets[ibucket].set_value_of_empty_bucket(dist_from_ideal_bucket,
-                                                       hash, std::move(value));
-          return;
-        } else {
-          m_buckets[ibucket].swap_with_value_in_bucket(dist_from_ideal_bucket,
-                                                       hash, value); // keep probing with whatever the swap left in (distance, hash, value)
-        }
-      }
-
-      dist_from_ideal_bucket++;
-      ibucket = next_bucket(ibucket);
-    }
-  }
-
-  /**
-   * Grow the table if m_grow_on_next_insert is true or size() reached
-   * m_load_threshold. Otherwise, shrink the table if
-   * m_try_skrink_on_next_insert is true (an erase occurred), a non-zero
-   * min_load_factor is set and load_factor() is below it.
-   *
-   * Return true if the table has been rehashed.
-   */
-  bool rehash_on_extreme_load() {
-    if (m_grow_on_next_insert || size() >= m_load_threshold) {
-      rehash_impl(GrowthPolicy::next_bucket_count());
-      m_grow_on_next_insert = false;
-
-      return true;
-    }
-
-    if (m_try_skrink_on_next_insert) {
-      m_try_skrink_on_next_insert = false; // the flag is cleared whether or not a shrink actually happens
-      if (m_min_load_factor != 0.0f && load_factor() < m_min_load_factor) { // a min_load_factor of 0 disables shrinking
-        reserve(size() + 1); // NOTE(review): reserve() is defined outside this view; presumably rehashes to a bucket count fitting size() + 1 elements
-
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-public:
-  static const size_type DEFAULT_INIT_BUCKETS_SIZE = 0; // bucket count passed by robin_map's default constructors
-
-  static constexpr float DEFAULT_MAX_LOAD_FACTOR = 0.5f;
-  static constexpr float MINIMUM_MAX_LOAD_FACTOR = 0.2f; // NOTE(review): presumably the lower/upper bounds accepted for max_load_factor - the setter is outside this view
-  static constexpr float MAXIMUM_MAX_LOAD_FACTOR = 0.95f;
-
-  static constexpr float DEFAULT_MIN_LOAD_FACTOR = 0.0f; // 0 means rehash_on_extreme_load never shrinks
-  static constexpr float MINIMUM_MIN_LOAD_FACTOR = 0.0f;
-  static constexpr float MAXIMUM_MIN_LOAD_FACTOR = 0.15f;
-
-  static_assert(MINIMUM_MAX_LOAD_FACTOR < MAXIMUM_MAX_LOAD_FACTOR, // both ranges must be non-empty...
-                "MINIMUM_MAX_LOAD_FACTOR should be < MAXIMUM_MAX_LOAD_FACTOR");
-  static_assert(MINIMUM_MIN_LOAD_FACTOR < MAXIMUM_MIN_LOAD_FACTOR,
-                "MINIMUM_MIN_LOAD_FACTOR should be < MAXIMUM_MIN_LOAD_FACTOR");
-  static_assert(MAXIMUM_MIN_LOAD_FACTOR < MINIMUM_MAX_LOAD_FACTOR, // ...and the min range must lie strictly below the max range
-                "MAXIMUM_MIN_LOAD_FACTOR should be < MINIMUM_MAX_LOAD_FACTOR");
-
-private:
-  static const distance_type REHASH_ON_HIGH_NB_PROBES__NPROBES = 128; // probe distance from which insert_value_impl may set m_grow_on_next_insert
-  static constexpr float REHASH_ON_HIGH_NB_PROBES__MIN_LOAD_FACTOR = 0.15f; // ...but only when load_factor() is at least this value
-
-  /**
-   * Return an always valid pointer to a static empty bucket_entry with
-   * last_bucket() == true. The entry is a function-local static, so every
-   * call returns the same address.
-   */
-  bucket_entry *static_empty_bucket_ptr() {
-    static bucket_entry empty_bucket(true);
-    return &empty_bucket;
-  }
-
-private:
-  buckets_container_type m_buckets_data;
-
-  /**
-   * Points to m_buckets_data.data() if !m_buckets_data.empty() otherwise points
-   * to static_empty_bucket_ptr. This variable is useful to avoid the cost of
-   * checking if m_buckets_data is empty when trying to find an element.
-   *
-   * TODO Remove m_buckets_data and only use a pointer instead of a
-   * pointer+vector to save some space in the robin_hash object. Manage the
-   * Allocator manually.
-   */
-  bucket_entry *m_buckets;
-
-  /**
-   * Used a lot in find, avoid the call to m_buckets_data.size() which is a bit
-   * slower.
-   */
-  size_type m_bucket_count;
-
-  size_type m_nb_elements;
-
-  size_type m_load_threshold;
-  float m_max_load_factor;
-
-  bool m_grow_on_next_insert;
-
-  float m_min_load_factor;
-
-  /**
-   * We can't shrink down the map on erase operations as the erase methods need
-   * to return the next iterator. Shrinking the map would invalidate all the
-   * iterators and we could not return the next iterator in a meaningful way.
-   * On erase, we thus only record that we should try to shrink the hash table
-   * on the next insert if we go below the min_load_factor.
-   */
-  bool m_try_skrink_on_next_insert;
-};
-
-} // namespace detail_robin_hash
-
-} // namespace tsl
-
-#endif
--- a/include/tsl/robin_map.h
+++ b/include/tsl/robin_map.h
-/**
- * MIT License
- *
- * Copyright (c) 2017 Tessil
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef TSL_ROBIN_MAP_H
-#define TSL_ROBIN_MAP_H
-
-#include "robin_hash.h"
-#include <cstddef>
-#include <functional>
-#include <initializer_list>
-#include <memory>
-#include <type_traits>
-#include <utility>
-
-namespace tsl {
-
-/**
- * Implementation of a hash map using open-addressing and the robin hood hashing
- * algorithm with backward shift deletion.
- *
- * For operations modifying the hash map (insert, erase, rehash, ...), the
- * strong exception guarantee is only guaranteed when the expression
- * `std::is_nothrow_swappable<std::pair<Key, T>>::value &&
- * std::is_nothrow_move_constructible<std::pair<Key, T>>::value` is true,
- * otherwise if an exception is thrown during the swap or the move, the hash map
- * may end up in an undefined state. Per the standard a `Key` or `T` with a
- * noexcept copy constructor and no move constructor also satisfies the
- * `std::is_nothrow_move_constructible<std::pair<Key, T>>::value` criterion (and
- * will thus guarantee the strong exception for the map).
- *
- * When `StoreHash` is true, 32 bits of the hash are stored alongside the
- * values. It can improve the performance during lookups if the `KeyEqual`
- * function takes time (if it engenders a cache-miss for example) as we then
- * compare the stored hashes before comparing the keys. When
- * `tsl::rh::power_of_two_growth_policy` is used as `GrowthPolicy`, it may also
- * speed-up the rehash process as we can avoid to recalculate the hash. When it
- * is detected that storing the hash will not incur any memory penalty due to
- * alignment (i.e. `sizeof(tsl::detail_robin_hash::bucket_entry<ValueType,
- * true>) == sizeof(tsl::detail_robin_hash::bucket_entry<ValueType, false>)`)
- * and `tsl::rh::power_of_two_growth_policy` is used, the hash will be stored
- * even if `StoreHash` is false so that we can speed-up the rehash (but it will
- * not be used on lookups unless `StoreHash` is true).
- *
- * `GrowthPolicy` defines how the map grows and consequently how a hash value is
- * mapped to a bucket. By default the map uses
- * `tsl::rh::power_of_two_growth_policy`. This policy keeps the number of
- * buckets to a power of two and uses a mask to map the hash to a bucket instead
- * of the slow modulo. Other growth policies are available and you may define
- * your own growth policy, check `tsl::rh::power_of_two_growth_policy` for the
- * interface.
- *
- * `std::pair<Key, T>` must be swappable.
- *
- * `Key` and `T` must be copy and/or move constructible.
- *
- * If the destructor of `Key` or `T` throws an exception, the behaviour of the
- * class is undefined.
- *
- * Iterators invalidation:
- *  - clear, operator=, reserve, rehash: always invalidate the iterators.
- *  - insert, emplace, emplace_hint, operator[]: if there is an effective
- * insert, invalidate the iterators.
- *  - erase: always invalidate the iterators.
- */
-template <class Key, class T, class Hash = std::hash<Key>,
-          class KeyEqual = std::equal_to<Key>,
-          class Allocator = std::allocator<std::pair<Key, T>>,
-          bool StoreHash = false,
-          class GrowthPolicy = tsl::rh::power_of_two_growth_policy<2>>
-class robin_map {
-private:
-  template <typename U>
-  using has_is_transparent = tsl::detail_robin_hash::has_is_transparent<U>;
-
-  class KeySelect { // functor given to robin_hash: returns the key (pair.first) of a stored pair
-  public:
-    using key_type = Key;
-
-    const key_type &
-    operator()(const std::pair<Key, T> &key_value) const noexcept {
-      return key_value.first;
-    }
-
-    key_type &operator()(std::pair<Key, T> &key_value) noexcept {
-      return key_value.first;
-    }
-  };
-
-  class ValueSelect { // functor given to robin_hash: returns the mapped value (pair.second) of a stored pair
-  public:
-    using value_type = T;
-
-    const value_type &
-    operator()(const std::pair<Key, T> &key_value) const noexcept {
-      return key_value.second;
-    }
-
-    value_type &operator()(std::pair<Key, T> &key_value) noexcept {
-      return key_value.second;
-    }
-  };
-
-  using ht = detail_robin_hash::robin_hash<std::pair<Key, T>, KeySelect,
-                                           ValueSelect, Hash, KeyEqual,
-                                           Allocator, StoreHash, GrowthPolicy>;
-
-public:
-  using key_type = typename ht::key_type;
-  using mapped_type = T;
-  using value_type = typename ht::value_type;
-  using size_type = typename ht::size_type;
-  using difference_type = typename ht::difference_type;
-  using hasher = typename ht::hasher;
-  using key_equal = typename ht::key_equal;
-  using allocator_type = typename ht::allocator_type;
-  using reference = typename ht::reference;
-  using const_reference = typename ht::const_reference;
-  using pointer = typename ht::pointer;
-  using const_pointer = typename ht::const_pointer;
-  using iterator = typename ht::iterator;
-  using const_iterator = typename ht::const_iterator;
-
-public:
-  /*
-   * Constructors
-   */
-  robin_map() : robin_map(ht::DEFAULT_INIT_BUCKETS_SIZE) {}
-
-  explicit robin_map(size_type bucket_count, const Hash &hash = Hash(),
-                     const KeyEqual &equal = KeyEqual(),
-                     const Allocator &alloc = Allocator())
-      : m_ht(bucket_count, hash, equal, alloc) {}
-
-  robin_map(size_type bucket_count, const Allocator &alloc)
-      : robin_map(bucket_count, Hash(), KeyEqual(), alloc) {}
-
-  robin_map(size_type bucket_count, const Hash &hash, const Allocator &alloc)
-      : robin_map(bucket_count, hash, KeyEqual(), alloc) {}
-
-  explicit robin_map(const Allocator &alloc)
-      : robin_map(ht::DEFAULT_INIT_BUCKETS_SIZE, alloc) {}
-
-  template <class InputIt>
-  robin_map(InputIt first, InputIt last,
-            size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
-            const Hash &hash = Hash(), const KeyEqual &equal = KeyEqual(),
-            const Allocator &alloc = Allocator())
-      : robin_map(bucket_count, hash, equal, alloc) {
-    insert(first, last);
-  }
-
-  template <class InputIt>
-  robin_map(InputIt first, InputIt last, size_type bucket_count,
-            const Allocator &alloc)
-      : robin_map(first, last, bucket_count, Hash(), KeyEqual(), alloc) {}
-
-  template <class InputIt>
-  robin_map(InputIt first, InputIt last, size_type bucket_count,
-            const Hash &hash, const Allocator &alloc)
-      : robin_map(first, last, bucket_count, hash, KeyEqual(), alloc) {}
-
-  robin_map(std::initializer_list<value_type> init,
-            size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
-            const Hash &hash = Hash(), const KeyEqual &equal = KeyEqual(),
-            const Allocator &alloc = Allocator())
-      : robin_map(init.begin(), init.end(), bucket_count, hash, equal, alloc) {}
-
-  robin_map(std::initializer_list<value_type> init, size_type bucket_count,
-            const Allocator &alloc)
-      : robin_map(init.begin(), init.end(), bucket_count, Hash(), KeyEqual(),
-                  alloc) {}
-
-  robin_map(std::initializer_list<value_type> init, size_type bucket_count,
-            const Hash &hash, const Allocator &alloc)
-      : robin_map(init.begin(), init.end(), bucket_count, hash, KeyEqual(),
-                  alloc) {}
-
-  robin_map &operator=(std::initializer_list<value_type> ilist) {
-    m_ht.clear(); // the previous content is discarded before the list is inserted
-
-    m_ht.reserve(ilist.size());
-    m_ht.insert(ilist.begin(), ilist.end());
-
-    return *this;
-  }
-
-  allocator_type get_allocator() const { return m_ht.get_allocator(); }
-
-  /*
-   * Iterators
-   */
-  iterator begin() noexcept { return m_ht.begin(); }
-  const_iterator begin() const noexcept { return m_ht.begin(); }
-  const_iterator cbegin() const noexcept { return m_ht.cbegin(); }
-
-  iterator end() noexcept { return m_ht.end(); }
-  const_iterator end() const noexcept { return m_ht.end(); }
-  const_iterator cend() const noexcept { return m_ht.cend(); }
-
-  /*
-   * Capacity
-   */
-  bool empty() const noexcept { return m_ht.empty(); }
-  size_type size() const noexcept { return m_ht.size(); }
-  size_type max_size() const noexcept { return m_ht.max_size(); }
-
-  /*
-   * Modifiers
-   */
-  void clear() noexcept { m_ht.clear(); }
-
-  std::pair<iterator, bool> insert(const value_type &value) {
-    return m_ht.insert(value);
-  }
-
-  template <class P, typename std::enable_if<std::is_constructible<
-                         value_type, P &&>::value>::type * = nullptr>
-  std::pair<iterator, bool> insert(P &&value) {
-    return m_ht.emplace(std::forward<P>(value));
-  }
-
-  std::pair<iterator, bool> insert(value_type &&value) {
-    return m_ht.insert(std::move(value));
-  }
-
-  iterator insert(const_iterator hint, const value_type &value) {
-    return m_ht.insert_hint(hint, value);
-  }
-
-  template <class P, typename std::enable_if<std::is_constructible<
-                         value_type, P &&>::value>::type * = nullptr>
-  iterator insert(const_iterator hint, P &&value) {
-    return m_ht.emplace_hint(hint, std::forward<P>(value));
-  }
-
-  iterator insert(const_iterator hint, value_type &&value) {
-    return m_ht.insert_hint(hint, std::move(value));
-  }
-
-  template <class InputIt> void insert(InputIt first, InputIt last) {
-    m_ht.insert(first, last);
-  }
-
-  void insert(std::initializer_list<value_type> ilist) {
-    m_ht.insert(ilist.begin(), ilist.end());
-  }
-
-  template <class M>
-  std::pair<iterator, bool> insert_or_assign(const key_type &k, M &&obj) {
-    return m_ht.insert_or_assign(k, std::forward<M>(obj));
-  }
-
-  template <class M>
-  std::pair<iterator, bool> insert_or_assign(key_type &&k, M &&obj) {
-    return m_ht.insert_or_assign(std::move(k), std::forward<M>(obj));
-  }
-
-  template <class M>
-  iterator insert_or_assign(const_iterator hint, const key_type &k, M &&obj) {
-    return m_ht.insert_or_assign(hint, k, std::forward<M>(obj));
-  }
-
-  template <class M>
-  iterator insert_or_assign(const_iterator hint, key_type &&k, M &&obj) {
-    return m_ht.insert_or_assign(hint, std::move(k), std::forward<M>(obj));
-  }
-
-  /**
-   * Due to the way elements are stored, emplace will need to move or copy the
-   * key-value once. The method is equivalent to
-   * insert(value_type(std::forward<Args>(args)...));
-   *
-   * Mainly here for compatibility with the std::unordered_map interface.
-   */
-  template <class... Args> std::pair<iterator, bool> emplace(Args &&... args) {
-    return m_ht.emplace(std::forward<Args>(args)...);
-  }
-
-  /**
-   * Due to the way elements are stored, emplace_hint will need to move or copy
-   * the key-value once. The method is equivalent to insert(hint,
-   * value_type(std::forward<Args>(args)...));
-   *
-   * Mainly here for compatibility with the std::unordered_map interface.
-   */
-  template <class... Args>
-  iterator emplace_hint(const_iterator hint, Args &&... args) {
-    return m_ht.emplace_hint(hint, std::forward<Args>(args)...);
-  }
-
-  template <class... Args>
-  std::pair<iterator, bool> try_emplace(const key_type &k, Args &&... args) {
-    return m_ht.try_emplace(k, std::forward<Args>(args)...);
-  }
-
-  template <class... Args>
-  std::pair<iterator, bool> try_emplace(key_type &&k, Args &&... args) {
-    return m_ht.try_emplace(std::move(k), std::forward<Args>(args)...);
-  }
-
-  template <class... Args>
-  iterator try_emplace(const_iterator hint, const key_type &k,
-                       Args &&... args) {
-    return m_ht.try_emplace_hint(hint, k, std::forward<Args>(args)...);
-  }
-
-  template <class... Args>
-  iterator try_emplace(const_iterator hint, key_type &&k, Args &&... args) {
-    return m_ht.try_emplace_hint(hint, std::move(k),
-                                 std::forward<Args>(args)...);
-  }
-
-  iterator erase(iterator pos) { return m_ht.erase(pos); }
-  iterator erase(const_iterator pos) { return m_ht.erase(pos); }
-  iterator erase(const_iterator first, const_iterator last) {
-    return m_ht.erase(first, last);
-  }
-  size_type erase(const key_type &key) { return m_ht.erase(key); }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup to the value if you already have the hash.
-   */
-  size_type erase(const key_type &key, std::size_t precalculated_hash) {
-    return m_ht.erase(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  size_type erase(const K &key) {
-    return m_ht.erase(key);
-  }
-
-  /**
-   * @copydoc erase(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup to the value if you already have the hash.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  size_type erase(const K &key, std::size_t precalculated_hash) {
-    return m_ht.erase(key, precalculated_hash);
-  }
-
-  void swap(robin_map &other) { other.m_ht.swap(m_ht); }
-
-  /*
-   * Lookup
-   */
-  T &at(const Key &key) { return m_ht.at(key); }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  T &at(const Key &key, std::size_t precalculated_hash) {
-    return m_ht.at(key, precalculated_hash);
-  }
-
-  const T &at(const Key &key) const { return m_ht.at(key); }
-
-  /**
-   * @copydoc at(const Key& key, std::size_t precalculated_hash)
-   */
-  const T &at(const Key &key, std::size_t precalculated_hash) const {
-    return m_ht.at(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  T &at(const K &key) {
-    return m_ht.at(key);
-  }
-
-  /**
-   * @copydoc at(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  T &at(const K &key, std::size_t precalculated_hash) {
-    return m_ht.at(key, precalculated_hash);
-  }
-
-  /**
-   * @copydoc at(const K& key)
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  const T &at(const K &key) const {
-    return m_ht.at(key);
-  }
-
-  /**
-   * @copydoc at(const K& key, std::size_t precalculated_hash)
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  const T &at(const K &key, std::size_t precalculated_hash) const {
-    return m_ht.at(key, precalculated_hash);
-  }
-
-  T &operator[](const Key &key) { return m_ht[key]; }
-  T &operator[](Key &&key) { return m_ht[std::move(key)]; }
-
-  size_type count(const Key &key) const { return m_ht.count(key); }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  size_type count(const Key &key, std::size_t precalculated_hash) const {
-    return m_ht.count(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  size_type count(const K &key) const {
-    return m_ht.count(key);
-  }
-
-  /**
-   * @copydoc count(const K& key) const
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  size_type count(const K &key, std::size_t precalculated_hash) const {
-    return m_ht.count(key, precalculated_hash);
-  }
-
-  iterator find(const Key &key) { return m_ht.find(key); }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  iterator find(const Key &key, std::size_t precalculated_hash) {
-    return m_ht.find(key, precalculated_hash);
-  }
-
-  const_iterator find(const Key &key) const { return m_ht.find(key); }
-
-  /**
-   * @copydoc find(const Key& key, std::size_t precalculated_hash)
-   */
-  const_iterator find(const Key &key, std::size_t precalculated_hash) const {
-    return m_ht.find(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  iterator find(const K &key) {
-    return m_ht.find(key);
-  }
-
-  /**
-   * @copydoc find(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  iterator find(const K &key, std::size_t precalculated_hash) {
-    return m_ht.find(key, precalculated_hash);
-  }
-
-  /**
-   * @copydoc find(const K& key)
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  const_iterator find(const K &key) const {
-    return m_ht.find(key);
-  }
-
-  /**
-   * @copydoc find(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  const_iterator find(const K &key, std::size_t precalculated_hash) const {
-    return m_ht.find(key, precalculated_hash);
-  }
-
-  std::pair<iterator, iterator> equal_range(const Key &key) {
-    return m_ht.equal_range(key);
-  }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  std::pair<iterator, iterator> equal_range(const Key &key,
-                                            std::size_t precalculated_hash) {
-    return m_ht.equal_range(key, precalculated_hash);
-  }
-
-  std::pair<const_iterator, const_iterator> equal_range(const Key &key) const {
-    return m_ht.equal_range(key);
-  }
-
-  /**
-   * @copydoc equal_range(const Key& key, std::size_t precalculated_hash)
-   */
-  std::pair<const_iterator, const_iterator>
-  equal_range(const Key &key, std::size_t precalculated_hash) const {
-    return m_ht.equal_range(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  std::pair<iterator, iterator> equal_range(const K &key) {
-    return m_ht.equal_range(key);
-  }
-
-  /**
-   * @copydoc equal_range(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  std::pair<iterator, iterator> equal_range(const K &key,
-                                            std::size_t precalculated_hash) {
-    return m_ht.equal_range(key, precalculated_hash);
-  }
-
-  /**
-   * @copydoc equal_range(const K& key)
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  std::pair<const_iterator, const_iterator> equal_range(const K &key) const {
-    return m_ht.equal_range(key);
-  }
-
-  /**
-   * @copydoc equal_range(const K& key, std::size_t precalculated_hash)
-   */
-  template <
-      class K, class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type * = nullptr>
-  std::pair<const_iterator, const_iterator>
-  equal_range(const K &key, std::size_t precalculated_hash) const {
-    return m_ht.equal_range(key, precalculated_hash);
-  }
-
-  /*
-   * Bucket interface
-   */
-  size_type bucket_count() const { return m_ht.bucket_count(); }
-  size_type max_bucket_count() const { return m_ht.max_bucket_count(); }
-
-  /*
-   *  Hash policy
-   */
-  float load_factor() const { return m_ht.load_factor(); }
-
-  float min_load_factor() const { return m_ht.min_load_factor(); }
-  float max_load_factor() const { return m_ht.max_load_factor(); }
-
-  /**
-   * Set the `min_load_factor` to `ml`. When the `load_factor` of the map goes
-   * below `min_load_factor` after some erase operations, the map will be
-   * shrunk when an insertion occurs. The erase method itself never shrinks
-   * the map.
-   *
-   * The default value of `min_load_factor` is 0.0f, the map never shrinks by
-   * default.
-   */
-  void min_load_factor(float ml) { m_ht.min_load_factor(ml); }
-  void max_load_factor(float ml) { m_ht.max_load_factor(ml); }
-
-  void rehash(size_type count) { m_ht.rehash(count); }
-  void reserve(size_type count) { m_ht.reserve(count); }
-
-  /*
-   * Observers
-   */
-  hasher hash_function() const { return m_ht.hash_function(); }
-  key_equal key_eq() const { return m_ht.key_eq(); }
-
-  /*
-   * Other
-   */
-
-  /**
-   * Convert a const_iterator to an iterator.
-   */
-  iterator mutable_iterator(const_iterator pos) {
-    return m_ht.mutable_iterator(pos);
-  }
-
-  friend bool operator==(const robin_map &lhs, const robin_map &rhs) {
-    if (lhs.size() != rhs.size()) {
-      return false;
-    }
-
-    for (const auto &element_lhs : lhs) { // order-independent: each lhs element is looked up in rhs by key
-      const auto it_element_rhs = rhs.find(element_lhs.first);
-      if (it_element_rhs == rhs.cend() ||
-          element_lhs.second != it_element_rhs->second) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
-  friend bool operator!=(const robin_map &lhs, const robin_map &rhs) {
-    return !operator==(lhs, rhs);
-  }
-
-  friend void swap(robin_map &lhs, robin_map &rhs) { lhs.swap(rhs); }
-
-private:
-  ht m_ht; // underlying robin_hash table; the member functions above forward to it
-};
-
-/**
- * Same as `tsl::robin_map<Key, T, Hash, KeyEqual, Allocator, StoreHash,
- * tsl::rh::prime_growth_policy>`, i.e. a `robin_map` whose `GrowthPolicy` is
- * fixed to `tsl::rh::prime_growth_policy`; the other template parameters keep
- * the same defaults as `robin_map`.
- */
-template <class Key, class T, class Hash = std::hash<Key>,
-          class KeyEqual = std::equal_to<Key>,
-          class Allocator = std::allocator<std::pair<Key, T>>,
-          bool StoreHash = false>
-using robin_pg_map = robin_map<Key, T, Hash, KeyEqual, Allocator, StoreHash,
-                               tsl::rh::prime_growth_policy>;
-
-} // end namespace tsl
-
-#endif
--- a/include/utility/timer.h
+++ b/include/utility/timer.h
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <chrono>
-#ifdef TV_CUDA
-#include <cuda_runtime_api.h>
-#endif
-#include <iostream>
-
-namespace spconv {
-
-#ifdef TV_CUDA
-template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
-  CudaContextTimer() {
-    cudaDeviceSynchronize();
-    mCurTime = std::chrono::steady_clock::now();
-  }
-  typename TimeT::rep report() {
-    cudaDeviceSynchronize();
-    auto duration = std::chrono::duration_cast<TimeT>(
-        std::chrono::steady_clock::now() - mCurTime);
-    auto res = duration.count();
-    mCurTime = std::chrono::steady_clock::now();
-    return res;
-  }
-
-private:
-  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
-};
-#endif
-
-template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
-  CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }
-  typename TimeT::rep report() {
-    auto duration = std::chrono::duration_cast<TimeT>(
-        std::chrono::steady_clock::now() - mCurTime);
-    auto res = duration.count();
-    mCurTime = std::chrono::steady_clock::now();
-    return res;
-  }
-
-private:
-  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
-};
-
-} // namespace spconv
--- a/pyproject.toml
+++ b/pyproject.toml
+[build-system]
+requires = ["setuptools>=41.0", "wheel", "pccm>=0.2.5", "cumm>=0.1.3"]
+build-backend = "setuptools.build_meta"
--- a/setup.py
+++ b/setup.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Note: To use the 'upload' functionality of this file, you must:
+#   $ pip install twine
+
+import io
 import os
-import platform
-import re
-import subprocess
+import shutil
 import sys
-from distutils.version import LooseVersion
 from pathlib import Path
+from shutil import rmtree
+from typing import List
+
+import pccm
+from pccm.extension import ExtCallback, PCCMBuild, PCCMExtension
+from setuptools import Command, find_packages, setup
+from setuptools.extension import Extension
+
+# Package meta-data.
+# NAME is the import/package directory name; RELEASE_NAME is the distribution
+# name passed to setup() and gains a "-cuXYZ" suffix for CUDA-specific builds.
+NAME = 'spconv'
+RELEASE_NAME = NAME
+deps = ["cumm"]
+# NOTE(review): the variable is spelled "CUMM_CUDA_VERSON" (no second "I").
+# Confirm this matches what the build environment actually exports.
+cuda_ver = os.environ.get("CUMM_CUDA_VERSON", "")
+if cuda_ver:
+    cuda_ver = cuda_ver.replace(".", "") # e.g. "10.2" -> "102"
+    RELEASE_NAME += "-cu{}".format(cuda_ver)
+    # Depend on the cumm distribution carrying the same CUDA suffix.
+    deps = ["cumm-cu{}".format(cuda_ver)]
+DESCRIPTION = 'spatial sparse convolution'
+URL = 'https://github.com/traveller59/spconv'
+EMAIL = 'yanyan.sub@outlook.com'
+AUTHOR = 'Yan Yan'
+REQUIRES_PYTHON = '>=3.7'
+# When VERSION is falsy the version string is read from version.txt below.
+VERSION = None
+
+# What packages are required for this module to be executed?
+REQUIRED = ["pccm>=0.2.5", "pybind11>=2.6.0", "fire", "numpy", *deps]
+
+# What packages are optional?
+EXTRAS = {
+    # 'fancy feature': ['django'],
+}
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change the Trove Classifier for that!
+
+here = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(str(Path(__file__).parent))
+
+# Import the README and use it as the long-description.
+# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
+try:
+    with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
+        long_description = '\n' + f.read()
+except FileNotFoundError:
+    long_description = DESCRIPTION
+
+# Load the package's __version__.py module as a dictionary.
+about = {}
+if not VERSION:
+    with open('version.txt', 'r') as f:
+        version = f.read().strip()
+else:
+    version = VERSION
+cwd = os.path.dirname(os.path.abspath(__file__))
+

-import torch
-from setuptools import Extension, find_packages, setup
-from setuptools.command.build_ext import build_ext
+def _convert_build_number(build_number):
+    parts = build_number.split(".")
+    if len(parts) == 2:
+        return "{}{:03d}".format(int(parts[0]), int(parts[1]))
+    elif len(parts) == 1:
+        return build_number
+    else:
+        raise NotImplementedError

-# if 'LIBTORCH_ROOT' not in os.environ:
-#     raise ValueError("You must set LIBTORCH_ROOT to your torch c++ library.")

-LIBTORCH_ROOT = str(Path(torch.__file__).parent)
+# An optional SPCONV_VERSION_SUFFIX turns the release into a ".devN" version,
+# where N is the suffix normalised by _convert_build_number.
+env_suffix = os.environ.get("SPCONV_VERSION_SUFFIX", "")
+if env_suffix != "":
+    version += ".dev{}".format(_convert_build_number(env_suffix))
+# Location of the generated <package>/__version__.py file.
+version_path = os.path.join(cwd, NAME, '__version__.py')
+about['__version__'] = version

-SPCONV_FORCE_BUILD_CUDA = os.getenv("SPCONV_FORCE_BUILD_CUDA")
+# Regenerate __version__.py on every run of this script so the package
+# carries the version computed above.
+with open(version_path, 'w') as f:
+    f.write("__version__ = '{}'\n".format(version))

-PYTHON_VERSION = "{}.{}".format(sys.version_info.major, sys.version_info.minor)
+class UploadCommand(Command):
+    """Support setup.py upload."""

-remove_plus = torch.__version__.find("+")
-PYTORCH_VERSION = torch.__version__
-if remove_plus != -1:
-    PYTORCH_VERSION = torch.__version__[:remove_plus]
-PYTORCH_VERSION = list(map(int, PYTORCH_VERSION.split(".")))
-PYTORCH_VERSION_NUMBER = PYTORCH_VERSION[0] * 10000 + PYTORCH_VERSION[1] * 100 + PYTORCH_VERSION[2]
+    description = 'Build and publish the package.'
+    user_options = []

-class CMakeExtension(Extension):
-    def __init__(self, name, sourcedir='', library_dirs=[]):
-        Extension.__init__(self, name, sources=[], library_dirs=library_dirs)
-        self.sourcedir = os.path.abspath(sourcedir)
+    @staticmethod
+    def status(s):
+        """Prints things in bold."""
+        print('\033[1m{0}\033[0m'.format(s))

+    def initialize_options(self):
+        """Command hook; nothing to set up because ``user_options`` is empty."""
+        pass
+
+    def finalize_options(self):
+        """Command hook; nothing to validate because ``user_options`` is empty."""
+        pass

-class CMakeBuild(build_ext):
    def run(self):
        try:
-            out = subprocess.check_output(['cmake', '--version'])
+            self.status('Removing previous builds...')
+            rmtree(os.path.join(here, 'dist'))
        except OSError:
-            raise RuntimeError("CMake must be installed to build the following extensions: " +
-                               ", ".join(e.name for e in self.extensions))
-
-        if platform.system() == "Windows":
-            cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
-            if cmake_version < '3.13.0':
-                raise RuntimeError("CMake >= 3.13.0 is required on Windows")
-
-        for ext in self.extensions:
-            self.build_extension(ext)
-
-    def build_extension(self, ext):
-        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
-        cmake_args = [# '-G "Visual Studio 15 2017 Win64"',
-                      '-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT),
-                      '-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
-                      '-DSPCONV_BuildTests=OFF',
-                      '-DPYTORCH_VERSION={}'.format(PYTORCH_VERSION_NUMBER),
-                      ] #  -arch=sm_61
-        if not torch.cuda.is_available() and SPCONV_FORCE_BUILD_CUDA is None:
-            cmake_args += ['-DSPCONV_BuildCUDA=OFF']
-        else:
-            cuda_flags = ["\"--expt-relaxed-constexpr\""]
-            # must add following flags to use at::Half
-            # but will remove raw half operators.
-            cuda_flags += ["-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__"]
-            # cuda_flags += ["-D__CUDA_NO_HALF2_OPERATORS__"] 
-            cmake_args += ['-DCMAKE_CUDA_FLAGS=' + " ".join(cuda_flags)]
-        cfg = 'Debug' if self.debug else 'Release'
-        assert cfg == "Release", "pytorch ops don't support debug build."
-        build_args = ['--config', cfg]
-        print(cfg)
-        if platform.system() == "Windows":
-            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
-            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), str(Path(extdir) / "spconv"))]
-            # cmake_args += ['-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), str(Path(extdir) / "spconv"))]
-            cmake_args += ['-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), str(Path(extdir) / "spconv"))]
-            cmake_args += ["-DCMAKE_WINDOWS_EXPORT_ALL_SYMBOLS=TRUE"]
-            if sys.maxsize > 2**32:
-                cmake_args += ['-A', 'x64']
-            build_args += ['--', '/m']
-        else:
-            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(str(Path(extdir) / "spconv"))]
-            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
-            build_args += ['--', '-j4']
-
-        env = os.environ.copy()
-        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
-                                                              self.distribution.get_version())
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-        print("|||||CMAKE ARGS|||||", cmake_args)
-        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
-        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
-
-
-packages = find_packages(exclude=('tools', 'tools.*'))
+            pass
+
+        self.status('Building Source and Wheel (universal) distribution...')
+        os.system('{0} setup.py sdist bdist_wheel --universal'.format(
+            sys.executable))
+
+        self.status('Uploading the package to PyPI via Twine...')
+        os.system('twine upload dist/*')
+
+        self.status('Pushing git tags...')
+        os.system('git tag v{0}'.format(about['__version__']))
+        os.system('git push --tags')
+
+        sys.exit()
+
+
+
+# SPCONV_DISABLE_JIT=1 selects the branch that registers a PCCM extension and
+# the PCCMBuild build_ext command; any other value (or unset) ships no
+# extension modules from this script.
+disable_jit = os.getenv("SPCONV_DISABLE_JIT", None)
+
+if disable_jit is not None and disable_jit == "1":
+    cmdclass = {
+        'upload': UploadCommand,
+        'build_ext': PCCMBuild,
+    }
+    # Imported lazily: these modules are only needed when the extension is built.
+    from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
+    from spconv.csrc.sparse.all import SpconvOps
+    from cumm.gemm.gather import GatherAll
+    # One GEMM code unit built from the concatenated SIMT, Volta and Turing
+    # shuffle parameter lists.
+    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
+
+    # spconv/algo.py imports this unit as spconv.core_cc.cumm.gemm.main.
+    cu.namespace = "cumm.gemm.main"
+    ext_modules: List[Extension] = [
+        PCCMExtension([cu, SpconvOps(), GatherAll()],
+                      "spconv/core_cc",
+                      Path(__file__).resolve().parent / "spconv")
+    ]
+else:
+    cmdclass = {
+        'upload': UploadCommand,
+    }
+    ext_modules = []
+
+# Where the magic happens:
 setup(
-    name='spconv',
-    version='1.2.1',
-    author='Yan Yan',
-    author_email='scrin@foxmail.com',
-    description='spatial sparse convolution for pytorch',
-    long_description='',
-    setup_requires = ['torch>=1.3.0'],
-    packages=packages,
-    package_dir = {'spconv': 'spconv'},
-    ext_modules=[CMakeExtension('spconv', library_dirs=[])],
-    cmdclass=dict(build_ext=CMakeBuild),
-    zip_safe=False,
+    name=RELEASE_NAME,
+    version=about['__version__'],
+    description=DESCRIPTION,
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author=AUTHOR,
+    author_email=EMAIL,
+    python_requires=REQUIRES_PYTHON,
+    url=URL,
+    packages=find_packages(exclude=('tests', )),
+    # If your package is a single module, use this instead of 'packages':
+    # py_modules=['mypackage'],
+    entry_points={
+        'console_scripts': [],
+    },
+    install_requires=REQUIRED,
+    extras_require=EXTRAS,
+    include_package_data=True,
+    license='MIT',
+    classifiers=[
+        # Trove classifiers
+        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: Implementation :: CPython',
+        'Programming Language :: Python :: Implementation :: PyPy'
+    ],
+    # $ setup.py publish support.
+    cmdclass=cmdclass,
+    ext_modules=ext_modules,
 )
--- a/spconv/__init__.py
+++ b/spconv/__init__.py
-# Copyright 2019-2020 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import platform
-from pathlib import Path
+from . import build as _build

-import numpy as np
-import torch
-
-from spconv import ops, utils
-from spconv.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
-                         SparseConvTranspose3d, SparseInverseConv2d,
-                         SparseInverseConv3d, SubMConv2d, SubMConv3d)
-from spconv.core import SparseConvTensor
-from spconv.identity import Identity
-from spconv.modules import SparseModule, SparseSequential
-from spconv.ops import ConvAlgo
-from spconv.pool import SparseMaxPool2d, SparseMaxPool3d
-from spconv.tables import AddTable, ConcatTable, JoinTable
-
-_LIB_FILE_NAME = "libspconv.so"
-if platform.system() == "Windows":
-    _LIB_FILE_NAME = "spconv.dll"
-_LIB_PATH = str(Path(__file__).parent / _LIB_FILE_NAME)
-torch.ops.load_library(_LIB_PATH)
-
-
-class ToDense(SparseModule):
-    """convert SparseConvTensor to NCHW dense tensor.
-    """
-    def forward(self, x: SparseConvTensor):
-        return x.dense()
-
-
-class RemoveGrid(SparseModule):
-    """remove pre-allocated grid buffer.
-    """
-    def forward(self, x: SparseConvTensor):
-        x.grid = None
-        return x
+from .algo import ConvAlgo
+from . import utils, constants
\ No newline at end of file
--- a/spconv/algo.py
+++ b/spconv/algo.py
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from cumm import tensorview as tv
+from typing import Dict, List, Set, Tuple
+from spconv.core_cc.cumm.gemm.main import GemmAlgoDesp, GemmMainUnitTest, GemmParams
+# from spconv.core_cc.cumm.gemm.gather import GatherAll, ScatterAll
+from cumm.gemm.algospec.core import ShuffleStrideType, get_min_arch_of_algo_str, get_available_algo_str_from_arch
+from cumm.gemm.codeops import group_by, div_up
+from typing import Optional
+import time
+
+import numpy as np
+
+
+class ConvAlgo(Enum):
+    """Identifiers of the convolution algorithms; each value equals its name."""
+    Native = "Native"
+    MaskImplicitGemm = "MaskImplicitGemm"
+    MaskSplitImplicitGemm = "MaskSplitImplicitGemm"
+
+
+class AlgoHint(Enum):
+    """Bit flags describing which pass a GEMM is selected or profiled for.
+
+    Callers in this file pass the integer ``.value`` and test it with ``&``,
+    so every non-zero member occupies its own bit.
+    """
+    NoHint = 0b000
+    # NOTE(review): presumably a misspelling of "Forward". The name is
+    # referenced as-is elsewhere in this file, so it is kept unchanged.
+    Fowrard = 0b001
+    BackwardInput = 0b010
+    BackwardWeight = 0b100
+
+
+# Every algorithm descriptor reported by the GEMM extension, fetched at import.
+ALL_ALGO_DESPS = GemmMainUnitTest.get_all_algo_desp()
+
+# Key layout used by SimpleGemm.get_static_key:
+# (trans_a, trans_b, trans_c, dtype_a, dtype_b, dtype_c, shuffle_type, algo).
+_GEMM_STATIC_KEY = Tuple[bool, bool, bool, int, int, int, str, str]
+
+# GATHER = GatherAll()
+# SCATTER = ScatterAll()
+
+
+class SimpleGemmAlgoMeta:
+    def __init__(self, tile_ms: List[int], tile_ns: List[int],
+                 tile_ks: List[int],
+                 tile_shape_to_algos: Dict[int, List[int]]) -> None:
+        self.tile_shape_to_algos = tile_shape_to_algos
+        self.tile_ms = tile_ms
+        self.tile_ns = tile_ns
+        self.tile_ks = tile_ks
+
+
+class BestAlgoByProfile:
+    def __init__(self,
+                 algo_desp: GemmAlgoDesp,
+                 external_gather: bool,
+                 external_scatter: bool,
+                 gather_params: Optional[Tuple[int, int, int, int]] = None,
+                 scatter_params: Optional[Tuple[int, int, int, int]] = None,
+                 splitk: int = 1) -> None:
+        self.algo_desp = algo_desp
+        self.external_gather = external_gather
+        self.external_scatter = external_scatter
+        self.gather_params = gather_params
+        self.scatter_params = scatter_params
+        self.splitk = splitk
+
+
+class SimpleGemm:
+    def __init__(self, desps: List[GemmAlgoDesp]) -> None:
+        self.desps = desps
+
+        self.static_key_to_desps = group_by(self.get_static_key, desps)
+        self.static_key_to_meta: Dict[_GEMM_STATIC_KEY,
+                                      SimpleGemmAlgoMeta] = {}
+        for k, static_desps in self.static_key_to_desps.items():
+            tile_shape_to_algos: Dict[int, List[int]] = {}
+            tile_ms: Set[int] = set()
+            tile_ns: Set[int] = set()
+            tile_ks: Set[int] = set()
+            for i, desp in enumerate(static_desps):
+                ts = desp.tile_shape
+                tile_ms.add(ts[0])
+                tile_ns.add(ts[1])
+                tile_ks.add(ts[2])
+                tile_key = ts[0] | (ts[1] << 20) | (ts[2] << 40)
+                if tile_key not in tile_shape_to_algos:
+                    tile_shape_to_algos[tile_key] = []
+                tile_shape_to_algos[tile_key].append(i)
+                tile_ms_list = list(tile_ms)
+                tile_ns_list = list(tile_ns)
+                tile_ks_list = list(tile_ks)
+                tile_ms_list.sort()
+                tile_ns_list.sort()
+                tile_ks_list.sort()
+            self.static_key_to_meta[k] = SimpleGemmAlgoMeta(
+                tile_ms_list, tile_ns_list, tile_ks_list, tile_shape_to_algos)
+
+        self.nk_forward_cache: Dict[Tuple[int, int],
+                            BestAlgoByProfile] = {}  # for forward
+        self.nk_dgrad_cache: Dict[Tuple[int, int],
+                            BestAlgoByProfile] = {}  # for backward weight
+
+        self.mn_cache: Dict[Tuple[int, int],
+                            BestAlgoByProfile] = {}  # for backward weight
+
+    @staticmethod
+    def get_static_key(d: GemmAlgoDesp) -> _GEMM_STATIC_KEY:
+        return (d.trans_a, d.trans_b, d.trans_c, d.dtype_a, d.dtype_b,
+                d.dtype_c, d.shuffle_type, d.algo)
+
+    def device_synchronize(self):
+        """Forward to ``GemmMainUnitTest.device_synchronize`` and return its result."""
+        return GemmMainUnitTest.device_synchronize()
+
+    def get_all_available(
+            self,
+            a: tv.Tensor,
+            b: tv.Tensor,
+            c: tv.Tensor,
+            trans_a: bool,
+            trans_b: bool,
+            trans_c: bool,
+            arch: Tuple[int, int],
+            shuffle_type: ShuffleStrideType = ShuffleStrideType.NoShuffle):
+        if trans_c:
+            trans_a = not trans_a
+            trans_b = not trans_b
+            trans_a, trans_b = trans_b, trans_a
+            a, b = b, a
+            trans_c = False
+        avail_algos = get_available_algo_str_from_arch(arch)
+        finally_algos: List[GemmAlgoDesp] = []
+        for algo in avail_algos:
+            static_key = (trans_a, trans_b, trans_c, a.dtype, b.dtype, c.dtype,
+                          shuffle_type.value, algo)
+            desps = self.static_key_to_desps.get(static_key, None)
+            if desps is None or len(desps) == 0:
+                continue
+            for desp in desps:
+                lda = a.dim(1)
+                ldb = b.dim(1)
+                ldc = c.dim(1)
+                if desp.supported_ldx(lda, ldb, ldc):
+                    finally_algos.append(desp)
+        return finally_algos
+
+    def select(self,
+               a: tv.Tensor,
+               b: tv.Tensor,
+               c: tv.Tensor,
+               trans_a: bool,
+               trans_b: bool,
+               trans_c: bool,
+               arch: Tuple[int, int],
+               shuffle_type: ShuffleStrideType = ShuffleStrideType.NoShuffle,
+               a_inds: tv.Tensor = tv.Tensor(),
+               b_inds: tv.Tensor = tv.Tensor(),
+               c_inds: tv.Tensor = tv.Tensor(),
+               hint: int = AlgoHint.NoHint.value):
+        m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape,
+                                               trans_a, trans_b, trans_c,
+                                               shuffle_type.value,
+                                               a_inds.shape, b_inds.shape,
+                                               c_inds.shape)
+        if trans_c:
+            trans_a = not trans_a
+            trans_b = not trans_b
+            trans_a, trans_b = trans_b, trans_a
+            a, b = b, a
+            trans_c = False
+        avail_algos = get_available_algo_str_from_arch(arch)
+        finally_algos: List[GemmAlgoDesp] = []
+        for algo in avail_algos:
+            static_key = (trans_a, trans_b, trans_c, a.dtype, b.dtype, c.dtype,
+                          shuffle_type.value, algo)
+            desps = self.static_key_to_desps.get(static_key, None)
+            if desps is None or len(desps) == 0:
+                continue
+            meta = self.static_key_to_meta[static_key]
+            # for shuffle stride algos, we need to make channel tile size as large as possible.
+            # so if ShuffleAC, we need to make k largest.
+            selected_algo_desps = GemmMainUnitTest.simple_select_tile_shape(
+                m,
+                n,
+                k,
+                meta.tile_ms,
+                meta.tile_ns,
+                meta.tile_ks,
+                meta.tile_shape_to_algos,
+                large_k_first=shuffle_type == shuffle_type.ShuffleAC)
+            if not selected_algo_desps:
+                candidate = desps
+            else:
+                candidate = [desps[i] for i in selected_algo_desps]
+            # select by hint
+            if hint == 0:
+                return candidate[0]
+            if hint & (AlgoHint.Fowrard.value | AlgoHint.BackwardInput.value):
+                # m may be huge, n and k are small
+                # don't need mixed precision
+                # don't need splitk
+                finally_algos = []
+                if a.dtype == tv.float16:
+                    dacc = tv.float16
+                    dcomp = tv.float16
+                    for can in candidate:
+                        if can.dacc == dacc and can.dcomp == dcomp:
+                            finally_algos.append(can)
+                else:
+                    finally_algos = candidate
+            elif hint & AlgoHint.BackwardWeight.value:
+                # k is huge
+                # don't support i8
+                # if f16, acc and comp must be f32
+                finally_algos = []
+                candidate_filtered: List[GemmAlgoDesp] = list(
+                    filter(lambda x: x.split_k_serial, candidate))
+                if not candidate_filtered:
+                    candidate_filtered = candidate
+                if a.dtype == tv.int8:
+                    continue
+                elif a.dtype == tv.float16:
+                    dacc = tv.float32
+                    dcomp = tv.float32
+                    for can in candidate_filtered:
+                        if can.dacc == dacc and can.dcomp == dcomp:
+                            finally_algos.append(can)
+                else:
+                    finally_algos = candidate_filtered
+            else:
+                return candidate[0]
+        # print(finally_algos)
+        if finally_algos:
+            return finally_algos[0]
+        return None
+
+    def get_profiled_algo(
+            self,
+            a_shape: List[int],
+            b_shape: List[int],
+            c_shape: List[int],
+            trans_a: bool,
+            trans_b: bool,
+            trans_c: bool,
+            arch: Tuple[int, int],
+            shuffle_type: ShuffleStrideType = ShuffleStrideType.NoShuffle,
+            a_inds_shape: Optional[List[int]] = None,
+            b_inds_shape: Optional[List[int]] = None,
+            c_inds_shape: Optional[List[int]] = None,
+            hint: int = AlgoHint.NoHint.value):
+        if a_inds_shape is None:
+            a_inds_shape = []
+        if b_inds_shape is None:
+            b_inds_shape = []
+        if c_inds_shape is None:
+            c_inds_shape = []
+        m, n, k = GemmMainUnitTest.extract_mnk(a_shape, b_shape,
+                                               trans_a, trans_b, trans_c,
+                                               shuffle_type.value,
+                                               a_inds_shape, b_inds_shape,
+                                               c_inds_shape)
+        if hint & AlgoHint.BackwardWeight.value:
+            key = (m, n)
+            return self.mn_cache.get(key, None)
+        elif hint & AlgoHint.BackwardInput.value:
+            key = (n, k)
+            return self.nk_dgrad_cache.get(key, None)
+        elif hint & AlgoHint.Fowrard.value:
+            key = (n, k)
+            return self.nk_forward_cache.get(key, None)
+        raise NotImplementedError
+
+    def extract_mnk(
+            self,
+            a_shape: List[int],
+            b_shape: List[int],
+            trans_a: bool,
+            trans_b: bool,
+            trans_c: bool,
+            arch: Tuple[int, int],
+            shuffle_type: ShuffleStrideType = ShuffleStrideType.NoShuffle,
+            a_inds_shape: Optional[List[int]] = None,
+            b_inds_shape: Optional[List[int]] = None,
+            c_inds_shape: Optional[List[int]] = None,
+            hint: int = AlgoHint.NoHint.value):
+        if a_inds_shape is None:
+            a_inds_shape = []
+        if b_inds_shape is None:
+            b_inds_shape = []
+        if c_inds_shape is None:
+            c_inds_shape = []
+        m, n, k = GemmMainUnitTest.extract_mnk(a_shape, b_shape,
+                                               trans_a, trans_b, trans_c,
+                                               shuffle_type.value,
+                                               a_inds_shape, b_inds_shape,
+                                               c_inds_shape)
+        return m, n, k
+
+    def profile_and_cache(
+            self,
+            a: tv.Tensor,
+            b: tv.Tensor,
+            c: tv.Tensor,
+            trans_a: bool,
+            trans_b: bool,
+            trans_c: bool,
+            arch: Tuple[int, int],
+            shuffle_type: ShuffleStrideType = ShuffleStrideType.NoShuffle,
+            a_inds: tv.Tensor = tv.Tensor(),
+            b_inds: tv.Tensor = tv.Tensor(),
+            c_inds: tv.Tensor = tv.Tensor(),
+            hint: int = AlgoHint.NoHint.value,
+            alpha: float = 1.0,
+            beta: float = 0.0,
+            gather_data: tv.Tensor = tv.Tensor(),
+            scatter_data: tv.Tensor = tv.Tensor(),
+            # mm_func
+            stream: int = 0):
+        m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape,
+                                               trans_a, trans_b, trans_c,
+                                               shuffle_type.value,
+                                               a_inds.shape, b_inds.shape,
+                                               c_inds.shape)
+        if hint & AlgoHint.BackwardWeight.value:
+            key = (m, n)
+        else:
+            key = (n, k)
+
+        avail = self.get_all_available(a, b, c, trans_a, trans_b, trans_c,
+                                       arch, shuffle_type)
+        c_ = c.clone()
+        times: List[float] = []
+        # gather_algos: List[GemmAlgoDesp] = []
+        # find fastest gather algo for this input
+        best_gather_params = (-1, -1, -1, -1)
+        best_scatter_params = (-1, -1, -1, -1)
+        # gather_data_ = tv.Tensor()
+        # if not gather_data.empty(
+        # ) and not hint & AlgoHint.BackwardWeight.value:
+        #     # run gather here
+        #     all_gather_params = GATHER.get_all_gather_params()
+        #     gather_data_ = gather_data.clone()
+        #     gather_times: List[float] = []
+
+        #     for gather_params in all_gather_params:
+        #         if GATHER.supported(gather_params[2], a.dim(1), a.dtype):
+        #             this_times = []
+        #             for j in range(10):
+        #                 GemmMainUnitTest.stream_synchronize(stream)
+        #                 t = time.time()
+        #                 GATHER.gather(gather_data_, a, a_inds, *gather_params)
+        #                 GemmMainUnitTest.stream_synchronize(stream)
+        #                 this_times.append(time.time() - t)
+        #             gather_times.append(np.mean(this_times[5:]))
+
+        #     min_time = 1000
+        #     min_idx = -1
+        #     for i, t in enumerate(gather_times):
+        #         if t < min_time:
+        #             min_time = t
+        #             min_idx = i
+        #     best_gather_params = all_gather_params[min_idx]
+
+        # if not scatter_data.empty(
+        # ) and not hint & AlgoHint.BackwardWeight.value:
+        #     # run gather here
+        #     all_scatter_params = SCATTER.get_all_scatter_params()
+        #     scatter_data_ = scatter_data.clone()
+        #     scatter_times: List[float] = []
+
+        #     for params in all_scatter_params:
+        #         if SCATTER.supported_scatter(*params, a.dim(1), a.dtype):
+        #             this_times = []
+        #             for j in range(10):
+        #                 GemmMainUnitTest.stream_synchronize(stream)
+        #                 t = time.time()
+        #                 SCATTER.scatter(c_, scatter_data_, c_inds, *params)
+        #                 GemmMainUnitTest.stream_synchronize(stream)
+        #                 this_times.append(time.time() - t)
+        #             scatter_times.append(np.mean(this_times[5:]))
+
+        #     min_time = 1000
+        #     min_idx = -1
+        #     for i, t in enumerate(scatter_times):
+        #         if t < min_time:
+        #             min_time = t
+        #             min_idx = i
+        #     best_scatter_params = all_scatter_params[min_idx]
+
+
+        all_profile_res: List[BestAlgoByProfile] = []
+        for desp in avail:
+            c_.zero_()
+            split_k_slices = 1
+            # TODO better splitk selection
+            if desp.split_k_serial and hint & AlgoHint.BackwardWeight.value:
+                split_k_slices = max(min(32, k // 128), 1)
+            params = GemmParams()
+            params.a = a
+            params.b = b
+            params.c = c_
+            params.a_inds = a_inds
+            params.b_inds = b_inds
+            params.c_inds = c_inds
+            params.algo_desp = desp
+            params.alpha = alpha
+            params.beta = beta
+            params.stream = stream
+            if desp.split_k_serial and hint & AlgoHint.BackwardWeight.value:
+                splitk_tests = [1, 2, 4, 8, 16, 32, 64]
+            else:
+                splitk_tests = [1]
+            spk_speeds = []
+            for spk in splitk_tests:
+                this_times = []
+                for j in range(3):
+                    GemmMainUnitTest.stream_synchronize(stream)
+                    t = time.time()
+                    params.split_k_slices = spk
+                    GemmMainUnitTest.matmul2(params)
+                    GemmMainUnitTest.stream_synchronize(stream)
+                    this_times.append(time.time() - t)
+                times.append(np.mean(this_times[1:]))
+                spk_speeds.append(times[-1])
+
+                all_profile_res.append(
+                    BestAlgoByProfile(desp, False, False, best_gather_params, best_scatter_params, splitk=spk))
+            # if desp.split_k_serial:
+            #     print(a.shape, b.shape, spk_speeds)
+            # if not gather_data.empty(
+            # ) and not hint & AlgoHint.BackwardWeight.value:
+            #     # run gather here
+            #     for spk in splitk_tests:
+            #         this_times = []
+            #         for j in range(3):
+
+            #             GemmMainUnitTest.stream_synchronize(stream)
+            #             t = time.time()
+            #             params.a_inds = tv.Tensor()
+            #             params.a = gather_data_
+            #             params.split_k_slices = spk
+            #             GATHER.gather(gather_data_,
+            #                         a,
+            #                         a_inds,
+            #                         *best_gather_params,
+            #                         stream=stream)
+            #             GemmMainUnitTest.matmul2(params)
+            #             GemmMainUnitTest.stream_synchronize(stream)
+            #             this_times.append(time.time() - t)
+
+            #         times.append(np.mean(this_times[1:]))
+            #         # print("G", times[-1], times[-2])
+            #         all_profile_res.append(
+            #             BestAlgoByProfile(desp,
+            #                             True,
+            #                             False,
+            #                             best_gather_params, best_scatter_params,
+            #                             splitk=spk))
+
+        min_time = 1000
+        min_idx = -1
+        for i, t in enumerate(times):
+            if t < min_time:
+                min_time = t
+                min_idx = i
+        res = all_profile_res[min_idx]
+        if hint & AlgoHint.BackwardWeight.value:
+            key = (m, n)
+            self.mn_cache[key] = res
+        elif hint & AlgoHint.BackwardInput.value:
+            key = (n, k)
+            self.nk_dgrad_cache[key] = res
+        elif hint & AlgoHint.Fowrard.value:
+            key = (n, k)
+            self.nk_forward_cache[key] = res
+        else:
+            raise NotImplementedError
+
+        return res, min_time
+
+    def run_profile(
+        self,
+        profile_res: BestAlgoByProfile,
+        a: tv.Tensor,
+        b: tv.Tensor,
+        c: tv.Tensor,
+        trans_a: bool,
+        trans_b: bool,
+        trans_c: bool,
+        arch: Tuple[int, int],
+        stream: int,
+        shuffle_type: ShuffleStrideType = ShuffleStrideType.NoShuffle,
+        a_inds: tv.Tensor = tv.Tensor(),
+        b_inds: tv.Tensor = tv.Tensor(),
+        c_inds: tv.Tensor = tv.Tensor(),
+        hint: int = AlgoHint.NoHint.value,
+        alpha: float = 1.0,
+        beta: float = 0.0,
+        gather_data: tv.Tensor = tv.Tensor(),
+        workspace: tv.Tensor = tv.Tensor()):
+        """Launch one GEMM using the algorithm recorded in ``profile_res``.
+
+        Fills a ``GemmParams`` from the arguments and ``profile_res`` and runs
+        it through ``GemmMainUnitTest.matmul2``. The stream is not
+        synchronized here; both synchronize calls are commented out.
+
+        Args:
+            profile_res: result record whose ``algo_desp`` and ``splitk``
+                attributes are read.
+            a, b, c: tensors stored in ``params.a`` / ``params.b`` /
+                ``params.c``.
+            trans_a, trans_b, trans_c: only forwarded to ``extract_mnk``.
+            arch: not referenced in this method body.
+            stream: stored in ``params.stream``.
+            shuffle_type: its ``.value`` is only forwarded to ``extract_mnk``.
+            a_inds, b_inds, c_inds: stored in ``params``; their shapes are
+                also forwarded to ``extract_mnk``.
+            hint: only referenced by commented-out code.
+            alpha, beta: stored in ``params.alpha`` / ``params.beta``.
+            gather_data: only referenced by commented-out code.
+            workspace: stored in ``params.workspace``.
+
+        Returns:
+            ``profile_res.algo_desp``, the descriptor that was run.
+
+        Raises:
+            AssertionError: if ``profile_res.algo_desp`` is None.
+        """
+        # m, n, k are not used by the live code below (k appears only in the
+        # commented-out split-k heuristic). NOTE(review): the call may also
+        # validate the shapes -- confirm before removing it.
+        m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape,
+                                               trans_a, trans_b, trans_c,
+                                               shuffle_type.value,
+                                               a_inds.shape, b_inds.shape,
+                                               c_inds.shape)
+        # GemmMainUnitTest.stream_synchronize(stream)
+        algo_desp = profile_res.algo_desp
+        assert algo_desp is not None
+        split_k_slices = 1
+        # TODO better splitk selection
+        # if algo_desp.split_k_serial and hint & AlgoHint.BackwardWeight.value:
+        #     split_k_slices = max(min(32, k // 128), 1)
+        # Use the split-k count stored in profile_res when it is above 1;
+        # otherwise keep the default of 1.
+        if profile_res.splitk > 1:
+            split_k_slices = profile_res.splitk
+        # Collect every launch argument in one GemmParams object.
+        params = GemmParams()
+        params.a = a
+        params.b = b
+        params.c = c
+        params.a_inds = a_inds
+        params.b_inds = b_inds
+        params.c_inds = c_inds
+        params.algo_desp = algo_desp
+        params.split_k_slices = split_k_slices
+        params.stream = stream
+        params.alpha = alpha
+        params.beta = beta
+        params.workspace = workspace
+        # Disabled external-gather path, kept for reference.
+        # gather = 0
+        # if profile_res.external_gather and not gather_data.empty():
+        #     GemmMainUnitTest.stream_synchronize(stream)
+        #     tt = time.time()
+        #     assert not gather_data.empty()
+        #     params.a_inds = tv.Tensor()
+        #     params.a = gather_data
+        #     # print(profile_res.gather_params, gather_data.shape, a.shape, a_inds.shape)
+        #     GATHER.gather(gather_data,
+        #                    a,
+        #                    a_inds,
+        #                    *profile_res.gather_params,
+        #                    stream=stream)
+        #     GemmMainUnitTest.stream_synchronize(stream)
+        #     gather = time.time() - tt
+
+        GemmMainUnitTest.matmul2(params)
+        # GemmMainUnitTest.stream_synchronize(stream)
+        return algo_desp
+
+
+# Module-level SimpleGemm instance constructed from ALL_ALGO_DESPS.
+GEMM = SimpleGemm(ALL_ALGO_DESPS)
+
+# Script entry point: prints the number of algorithm descriptors and the
+# first descriptor, then times 100 GEMM.select calls on tensors created with
+# tv.zeros and prints the mean wall-clock seconds per call and the last result.
+if __name__ == "__main__":
+    print(len(ALL_ALGO_DESPS))
+    print(ALL_ALGO_DESPS[0])
+
+    a = tv.zeros([64000, 32], dtype=tv.float16)
+    b = tv.zeros([32, 64], dtype=tv.float16)
+    c = tv.zeros([64000, 64], dtype=tv.float16)
+    a_inds = tv.zeros([64000], dtype=tv.int32)
+    c_inds = tv.zeros([64000], dtype=tv.int32)
+    t = time.time()
+    for i in range(100):
+        # NOTE(review): c is passed before b, and c_inds is passed as b_inds;
+        # presumably deliberate for this ShuffleAB case -- confirm against
+        # the signature of select, which is not visible here.
+        algo = GEMM.select(a,
+                           c,
+                           b,
+                           True,
+                           False,
+                           False, (7, 5),
+                           ShuffleStrideType.ShuffleAB,
+                           a_inds=a_inds,
+                           b_inds=c_inds)
+    # Mean time per select call over the 100 iterations, in seconds.
+    print((time.time() - t) / 100)
+    print(algo)
--- a/spconv/build.py
+++ b/spconv/build.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import pccm
+from pccm.utils import project_is_editable, project_is_installed
+
+from .constants import PACKAGE_NAME, PACKAGE_ROOT
+
+# Runs only when pccm reports the package as both installed and editable:
+# constructs the code generators and builds the pybind extension under
+# PACKAGE_ROOT / "core_cc".
+if project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME):
+    # These imports sit inside the branch, so they execute only when the
+    # build is going to run.
+    from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
+    from spconv.csrc.sparse.all import SpconvOps
+    # from cumm.gemm.gather import GatherAll, ScatterAll
+    # The GEMM generator receives the concatenation of the three parameter lists.
+    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
+    # NOTE(review): the namespace is overridden to "cumm.gemm.main";
+    # presumably so the generated binding keeps cumm's module path -- confirm.
+    cu.namespace = "cumm.gemm.main"
+    # load_library=False is passed; presumably the extension is built here
+    # without being imported -- confirm against pccm's build_pybind.
+    pccm.builder.build_pybind([cu, SpconvOps()],
+                              PACKAGE_ROOT / "core_cc",
+                              namespace_root=PACKAGE_ROOT,
+                              load_library=False)
--- a/spconv/constants.py
+++ b/spconv/constants.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pathlib import Path
+from typing import List
+from pccm.utils import project_is_editable, project_is_installed
+
+# Name passed to pccm's installed/editable checks below.
+PACKAGE_NAME = "spconv"
+# Resolved path of the directory that contains this file.
+PACKAGE_ROOT = Path(__file__).parent.resolve()
+
+# True only when pccm reports the package as both installed and editable.
+EDITABLE_INSTALLED = project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME)
+
+
+# FILTER_HWIO is True only when the SPCONV_FILTER_HWIO environment variable is
+# exactly "1"; when the variable is unset the default "0" yields False.
+# NOTE(review): the name suggests a filter-layout switch -- confirm at the
+# use sites, which are not visible here.
+_filter_hwio_env = os.getenv("SPCONV_FILTER_HWIO", "0")
+FILTER_HWIO = _filter_hwio_env == "1"
\ No newline at end of file
--- a/spconv/core_cc/__init__.pyi
+++ b/spconv/core_cc/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/core_cc/csrc/__init__.pyi
+++ b/spconv/core_cc/csrc/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/core_cc/csrc/sparse/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/core_cc/csrc/sparse/all/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/__init__.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class SpconvOps:
+    # Typing stub for the SpconvOps class: every member below is a
+    # @staticmethod whose body is elided with `...`.
+    # NOTE(review): the empty "Args:" entries suggest this file is
+    # auto-generated -- confirm before editing it by hand.
+    @staticmethod
+    def generate_conv_inds(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int]) -> int: 
+        """
+        Args:
+            indices: 
+            hashdata: 
+            indice_pairs: 
+            indice_pairs_uniq: 
+            out_inds: 
+            indice_num_per_loc: 
+            batch_size: 
+            output_dims: 
+            input_dims: 
+            ksize: 
+            stride: 
+            padding: 
+            dilation: 
+        """
+        ...
+    @staticmethod
+    def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int: 
+        """
+        Args:
+            indices: 
+            indice_pairs: 
+            indice_pairs_uniq: 
+            indice_num_per_loc: 
+            batch_size: 
+            output_dims: 
+            input_dims: 
+            ksize: 
+            stride: 
+            padding: 
+            dilation: 
+            stream_int: 
+        """
+        ...
+    @staticmethod
+    def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int: 
+        """
+        Args:
+            indices: 
+            hashdata: 
+            indice_pairs: 
+            indice_pairs_uniq: 
+            out_inds: 
+            num_out_act: 
+            batch_size: 
+            output_dims: 
+            input_dims: 
+            ksize: 
+            stride: 
+            padding: 
+            dilation: 
+            stream_int: 
+        """
+        ...
+    @staticmethod
+    def generate_subm_conv_inds(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, batch_size: int, input_dims: List[int], ksize: List[int], dilation: List[int], indice_pair_mask: Tensor =  Tensor(), backward: bool = False, stream_int: int =  0) -> int: 
+        """
+        Args:
+            indices: 
+            hashdata: 
+            indice_pairs: 
+            out_inds: 
+            indice_num_per_loc: 
+            batch_size: 
+            input_dims: 
+            ksize: 
+            dilation: 
+            indice_pair_mask: 
+            backward: 
+            stream_int: 
+        """
+        ...
+    @staticmethod
+    def maxpool_forward(out: Tensor, inp: Tensor, out_inds: Tensor, in_inds: Tensor, stream: int = 0) -> None: 
+        """
+        Args:
+            out: 
+            inp: 
+            out_inds: 
+            in_inds: 
+            stream: 
+        """
+        ...
+    @staticmethod
+    def maxpool_backward(out: Tensor, inp: Tensor, dout: Tensor, dinp: Tensor, out_inds: Tensor, in_inds: Tensor, stream: int = 0) -> None: 
+        """
+        Args:
+            out: 
+            inp: 
+            dout: 
+            dinp: 
+            out_inds: 
+            in_inds: 
+            stream: 
+        """
+        ...
+    @staticmethod
+    def sort_1d_by_key(data: Tensor) -> Tensor: 
+        """
+        Args:
+            data: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops1d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops1d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2Voxel:
+    # Typing stub for Point2Voxel as declared in ops1d.pyi: five Tensor
+    # attributes, a grid_size property, a constructor and
+    # point_to_voxel_hash, which is annotated to return a 3-tuple of Tensors.
+    # NOTE(review): the empty "Args:" entries suggest this file is
+    # auto-generated -- confirm before editing it by hand.
+    hashdata: Tensor
+    point_indice_data: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Args:
+            vsize_xyz: 
+            coors_range_xyz: 
+            num_point_features: 
+            max_num_voxels: 
+            max_num_points_per_voxel: 
+        """
+        ...
+    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Args:
+            points: 
+            clear_voxels: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops2d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops2d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2Voxel:
+    # Typing stub for Point2Voxel as declared in ops2d.pyi: five Tensor
+    # attributes, a grid_size property, a constructor and
+    # point_to_voxel_hash, which is annotated to return a 3-tuple of Tensors.
+    # NOTE(review): the empty "Args:" entries suggest this file is
+    # auto-generated -- confirm before editing it by hand.
+    hashdata: Tensor
+    point_indice_data: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Args:
+            vsize_xyz: 
+            coors_range_xyz: 
+            num_point_features: 
+            max_num_voxels: 
+            max_num_points_per_voxel: 
+        """
+        ...
+    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Args:
+            points: 
+            clear_voxels: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops3d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops3d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2Voxel:
+    # Typing stub for Point2Voxel as declared in ops3d.pyi: the members
+    # visible here are five Tensor attributes, a grid_size property, a
+    # constructor and point_to_voxel_hash, which is annotated to return a
+    # 3-tuple of Tensors.
+    # NOTE(review): the empty "Args:" entries suggest this file is
+    # auto-generated -- confirm before editing it by hand.
+    hashdata: Tensor
+    point_indice_data: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Args:
+            vsize_xyz: 
+            coors_range_xyz: 
+            num_point_features: 
+            max_num_voxels: 
+            max_num_points_per_voxel: 
+        """
+        ...
+    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Args:
+            points: 
+            clear_voxels: 
+        """
+        ...