Commit 19fd8251 authored by limm

support v0.6.16

parent 9ccee9c0
......@@ -37,8 +37,8 @@
// ---------------------------------------------------------------------------
#define PHMAP_VERSION_MAJOR 1
#define PHMAP_VERSION_MINOR 0
#define PHMAP_VERSION_PATCH 0
#define PHMAP_VERSION_MINOR 3
#define PHMAP_VERSION_PATCH 12
// Included for the __GLIBC__ macro (or similar macros on other systems).
#include <limits.h>
......@@ -102,7 +102,7 @@
#endif
#if CHAR_BIT != 8
#error "phmap assumes CHAR_BIT == 8."
#warning "phmap assumes CHAR_BIT == 8."
#endif
// phmap currently assumes that an int is 4 bytes.
......@@ -122,7 +122,8 @@
#define PHMAP_HAVE_BUILTIN(x) 0
#endif
#if (defined(_MSVC_LANG) && _MSVC_LANG >= 201703) || __cplusplus >= 201703
#if (!defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5) && \
((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
#define PHMAP_HAVE_CC17 1
#else
#define PHMAP_HAVE_CC17 0
......@@ -150,40 +151,13 @@
#define PHMAP_INTERNAL_HAVE_MIN_CLANG_VERSION(x, y) 0
#endif
// ----------------------------------------------------------------
// Checks whether `std::is_trivially_destructible<T>` is supported.
// ----------------------------------------------------------------
#ifdef PHMAP_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE
#error PHMAP_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE cannot be directly set
#elif defined(_LIBCPP_VERSION) || defined(_MSC_VER) || \
(!defined(__clang__) && defined(__GNUC__) && defined(__GLIBCXX__) && PHMAP_INTERNAL_HAVE_MIN_GNUC_VERSION(4, 8))
#define PHMAP_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE 1
#endif
// --------------------------------------------------------------
// Checks whether `std::is_trivially_default_constructible<T>` is
// supported.
// --------------------------------------------------------------
#if defined(PHMAP_HAVE_STD_IS_TRIVIALLY_CONSTRUCTIBLE)
#error PHMAP_HAVE_STD_IS_TRIVIALLY_CONSTRUCTIBLE cannot be directly set
#elif defined(PHMAP_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE)
#error PHMAP_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE cannot be directly set
#elif (defined(__clang__) && defined(_LIBCPP_VERSION)) || \
(!defined(__clang__) && defined(__GNUC__) && \
PHMAP_INTERNAL_HAVE_MIN_GNUC_VERSION(5, 1) && \
(defined(_LIBCPP_VERSION) || defined(__GLIBCXX__))) || \
(defined(_MSC_VER) && !defined(__NVCC__))
#define PHMAP_HAVE_STD_IS_TRIVIALLY_CONSTRUCTIBLE 1
#define PHMAP_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE 1
#endif
// -------------------------------------------------------------------
// Checks whether C++11's `thread_local` storage duration specifier is
// supported.
// -------------------------------------------------------------------
#ifdef PHMAP_HAVE_THREAD_LOCAL
#error PHMAP_HAVE_THREAD_LOCAL cannot be directly set
#elif defined(__APPLE__)
#elif defined(__APPLE__) && defined(__clang__)
#if __has_feature(cxx_thread_local) && \
!(TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0)
#define PHMAP_HAVE_THREAD_LOCAL 1
......@@ -343,7 +317,11 @@
#endif
#if PHMAP_HAVE_CC17
#define PHMAP_HAVE_SHARED_MUTEX 1
#ifdef __has_include
#if __has_include(<shared_mutex>)
#define PHMAP_HAVE_SHARED_MUTEX 1
#endif
#endif
#endif
#ifndef PHMAP_HAVE_STD_STRING_VIEW
......@@ -674,6 +652,15 @@
#define PHMAP_IF_CONSTEXPR(expr) if ((expr))
#endif
// ----------------------------------------------------------------------
// builtin unreachable
// ----------------------------------------------------------------------
#if PHMAP_HAVE_BUILTIN(__builtin_unreachable)
#define PHMAP_BUILTIN_UNREACHABLE() __builtin_unreachable()
#else
#define PHMAP_BUILTIN_UNREACHABLE() (void)0
#endif
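Where this macro typically lands (an illustrative sketch, not code from this diff): a switch that covers every enumerator, where the trailing macro tells the optimizer the fall-through is impossible; with the (void)0 fallback it compiles to nothing.
enum class Dir { Left, Right };
inline int step(Dir d) {
    switch (d) {
        case Dir::Left:  return -1;
        case Dir::Right: return +1;
    }
    PHMAP_BUILTIN_UNREACHABLE();  // both enumerators handled above
}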
// ----------------------------------------------------------------------
// base/macros.h
// ----------------------------------------------------------------------
......
......@@ -44,6 +44,8 @@ namespace priv {
#if !defined(PHMAP_NON_DETERMINISTIC) && !defined(PHMAP_DISABLE_DUMP)
static constexpr size_t s_version_base = std::numeric_limits<size_t>::max() - 10;
static constexpr size_t s_version = s_version_base;
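// Any realistic container size is far below s_version_base, so a leading word
// >= s_version_base unambiguously marks the new versioned dump format; phmap_load
// below relies on this to remain compatible with dumps written before versioning.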
// ------------------------------------------------------------------------
// dump/load for raw_hash_set
// ------------------------------------------------------------------------
......@@ -53,12 +55,14 @@ bool raw_hash_set<Policy, Hash, Eq, Alloc>::phmap_dump(OutputArchive& ar) const
static_assert(type_traits_internal::IsTriviallyCopyable<value_type>::value,
"value_type should be trivially copyable");
ar.saveBinary(&s_version, sizeof(size_t));
ar.saveBinary(&size_, sizeof(size_t));
ar.saveBinary(&capacity_, sizeof(size_t));
if (size_ == 0)
return true;
ar.saveBinary(ctrl_, sizeof(ctrl_t) * (capacity_ + Group::kWidth + 1));
ar.saveBinary(slots_, sizeof(slot_type) * capacity_);
ar.saveBinary(&growth_left(), sizeof(size_t));
return true;
}
......@@ -68,7 +72,15 @@ bool raw_hash_set<Policy, Hash, Eq, Alloc>::phmap_load(InputArchive& ar) {
static_assert(type_traits_internal::IsTriviallyCopyable<value_type>::value,
"value_type should be trivially copyable");
raw_hash_set<Policy, Hash, Eq, Alloc>().swap(*this); // clear any existing content
ar.loadBinary(&size_, sizeof(size_t));
size_t version = 0;
ar.loadBinary(&version, sizeof(size_t));
if (version < s_version_base) {
        // old dump format: no version was stored, so the word just read is actually the size
size_ = version;
} else {
ar.loadBinary(&size_, sizeof(size_t));
}
ar.loadBinary(&capacity_, sizeof(size_t));
if (capacity_) {
......@@ -79,6 +91,10 @@ bool raw_hash_set<Policy, Hash, Eq, Alloc>::phmap_load(InputArchive& ar) {
return true;
ar.loadBinary(ctrl_, sizeof(ctrl_t) * (capacity_ + Group::kWidth + 1));
ar.loadBinary(slots_, sizeof(slot_type) * capacity_);
if (version >= s_version_base) {
// growth_left should be restored after calling initialize_slots() which resets it.
ar.loadBinary(&growth_left(), sizeof(size_t));
}
return true;
}
......@@ -153,11 +169,28 @@ public:
ofs_.open(file_path, std::ofstream::out | std::ofstream::trunc | std::ofstream::binary);
}
~BinaryOutputArchive() = default;
BinaryOutputArchive(const BinaryOutputArchive&) = delete;
BinaryOutputArchive& operator=(const BinaryOutputArchive&) = delete;
bool saveBinary(const void *p, size_t sz) {
ofs_.write(reinterpret_cast<const char*>(p), sz);
ofs_.write(reinterpret_cast<const char*>(p), (std::streamsize)sz);
return true;
}
template<typename V>
typename std::enable_if<type_traits_internal::IsTriviallyCopyable<V>::value, bool>::type
saveBinary(const V& v) {
ofs_.write(reinterpret_cast<const char *>(&v), sizeof(V));
return true;
}
template<typename Map>
auto saveBinary(const Map& v) -> decltype(v.phmap_dump(*this), bool())
{
return v.phmap_dump(*this);
}
private:
std::ofstream ofs_;
};
......@@ -168,12 +201,29 @@ public:
BinaryInputArchive(const char * file_path) {
ifs_.open(file_path, std::ofstream::in | std::ofstream::binary);
}
~BinaryInputArchive() = default;
BinaryInputArchive(const BinaryInputArchive&) = delete;
BinaryInputArchive& operator=(const BinaryInputArchive&) = delete;
bool loadBinary(void* p, size_t sz) {
ifs_.read(reinterpret_cast<char*>(p), sz);
ifs_.read(reinterpret_cast<char*>(p), (std::streamsize)sz);
return true;
}
template<typename V>
typename std::enable_if<type_traits_internal::IsTriviallyCopyable<V>::value, bool>::type
loadBinary(V* v) {
ifs_.read(reinterpret_cast<char *>(v), sizeof(V));
return true;
}
template<typename Map>
auto loadBinary(Map* v) -> decltype(v->phmap_load(*this), bool())
{
return v->phmap_load(*this);
}
private:
std::ifstream ifs_;
};
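A minimal usage sketch of the archives and the new templated Map overloads (assuming a trivially copyable value_type; the file path is illustrative):
phmap::flat_hash_map<uint64_t, uint32_t> m = {{1, 2}, {3, 4}};
{
    phmap::BinaryOutputArchive ar_out("/tmp/dump.bin");
    ar_out.saveBinary(m);              // dispatches to m.phmap_dump(ar_out)
}
phmap::flat_hash_map<uint64_t, uint32_t> m2;
{
    phmap::BinaryInputArchive ar_in("/tmp/dump.bin");
    ar_in.loadBinary(&m2);             // dispatches to m2.phmap_load(ar_in)
}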
......
......@@ -20,6 +20,7 @@
#include <memory>
#include <utility>
#include <mutex>
#if defined(PHMAP_USE_ABSL_HASH) && !defined(ABSL_HASH_HASH_H_)
namespace absl { template <class T> struct Hash; };
......@@ -127,6 +128,37 @@ namespace phmap {
class Mutex = phmap::NullMutex> // use std::mutex to enable internal locks
class parallel_node_hash_map;
// -----------------------------------------------------------------------------
// phmap::parallel_*_hash_* using std::mutex by default
// -----------------------------------------------------------------------------
template <class T,
class Hash = phmap::priv::hash_default_hash<T>,
class Eq = phmap::priv::hash_default_eq<T>,
class Alloc = phmap::priv::Allocator<T>,
size_t N = 4>
using parallel_flat_hash_set_m = parallel_flat_hash_set<T, Hash, Eq, Alloc, N, std::mutex>;
template <class K, class V,
class Hash = phmap::priv::hash_default_hash<K>,
class Eq = phmap::priv::hash_default_eq<K>,
class Alloc = phmap::priv::Allocator<phmap::priv::Pair<const K, V>>,
size_t N = 4>
using parallel_flat_hash_map_m = parallel_flat_hash_map<K, V, Hash, Eq, Alloc, N, std::mutex>;
template <class T,
class Hash = phmap::priv::hash_default_hash<T>,
class Eq = phmap::priv::hash_default_eq<T>,
class Alloc = phmap::priv::Allocator<T>,
size_t N = 4>
using parallel_node_hash_set_m = parallel_node_hash_set<T, Hash, Eq, Alloc, N, std::mutex>;
template <class K, class V,
class Hash = phmap::priv::hash_default_hash<K>,
class Eq = phmap::priv::hash_default_eq<K>,
class Alloc = phmap::priv::Allocator<phmap::priv::Pair<const K, V>>,
size_t N = 4>
using parallel_node_hash_map_m = parallel_node_hash_map<K, V, Hash, Eq, Alloc, N, std::mutex>;
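For illustration, the new _m aliases simply fix the Mutex parameter to std::mutex, enabling the containers' internal per-submap locks:
phmap::parallel_flat_hash_map_m<std::string, int> counters;  // parallel_flat_hash_map with std::mutex
counters.try_emplace("hits", 0);   // submap is locked internally: safe from multiple threads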
// ------------- forward declarations for btree containers ----------------------------------
template <typename Key, typename Compare = phmap::Less<Key>,
typename Alloc = phmap::Allocator<Key>>
......
......@@ -137,6 +137,7 @@ void SwapAlloc(AllocType& lhs, AllocType& rhs,
using std::swap;
swap(lhs, rhs);
}
template <typename AllocType>
void SwapAlloc(AllocType& /*lhs*/, AllocType& /*rhs*/,
std::false_type /* propagate_on_container_swap */) {}
......@@ -194,27 +195,36 @@ struct IsDecomposable<
// TODO(alkis): Switch to std::is_nothrow_swappable when gcc/clang supports it.
// --------------------------------------------------------------------------
template <class T>
constexpr bool IsNoThrowSwappable() {
constexpr bool IsNoThrowSwappable(std::true_type = {} /* is_swappable */) {
using std::swap;
return noexcept(swap(std::declval<T&>(), std::declval<T&>()));
}
template <class T>
constexpr bool IsNoThrowSwappable(std::false_type /* is_swappable */) {
return false;
}
// --------------------------------------------------------------------------
template <typename T>
int TrailingZeros(T x) {
uint32_t TrailingZeros(T x) {
uint32_t res;
PHMAP_IF_CONSTEXPR(sizeof(T) == 8)
return base_internal::CountTrailingZerosNonZero64(static_cast<uint64_t>(x));
res = base_internal::CountTrailingZerosNonZero64(static_cast<uint64_t>(x));
else
return base_internal::CountTrailingZerosNonZero32(static_cast<uint32_t>(x));
res = base_internal::CountTrailingZerosNonZero32(static_cast<uint32_t>(x));
return res;
}
// --------------------------------------------------------------------------
template <typename T>
int LeadingZeros(T x) {
uint32_t LeadingZeros(T x) {
uint32_t res;
PHMAP_IF_CONSTEXPR(sizeof(T) == 8)
return base_internal::CountLeadingZeros64(static_cast<uint64_t>(x));
res = base_internal::CountLeadingZeros64(static_cast<uint64_t>(x));
else
return base_internal::CountLeadingZeros32(static_cast<uint32_t>(x));
res = base_internal::CountLeadingZeros32(static_cast<uint32_t>(x));
return res;
}
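// e.g. x = uint32_t{8} (binary 1000): TrailingZeros(x) == 3, LeadingZeros(x) == 28.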
// --------------------------------------------------------------------------
......@@ -353,7 +363,7 @@ inline size_t H1(size_t hashval, const ctrl_t* ) {
#endif
inline h2_t H2(size_t hashval) { return (ctrl_t)(hashval & 0x7F); }
inline ctrl_t H2(size_t hashval) { return (ctrl_t)(hashval & 0x7F); }
inline bool IsEmpty(ctrl_t c) { return c == kEmpty; }
inline bool IsFull(ctrl_t c) { return c >= static_cast<ctrl_t>(0); }
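// e.g. hashval = 0x12345 yields H2 = 0x12345 & 0x7F = 0x45. H2 always has the top
// bit clear, so a full slot's ctrl byte is >= 0 and can never collide with the
// negative kEmpty / kDeleted / kSentinel control values.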
......@@ -420,14 +430,10 @@ struct GroupSse2Impl
#endif
}
#ifdef __INTEL_COMPILER
#pragma warning push
#pragma warning disable 68
#endif
// Returns a bitmask representing the positions of empty or deleted slots.
// -----------------------------------------------------------------------
BitMask<uint32_t, kWidth> MatchEmptyOrDeleted() const {
auto special = _mm_set1_epi8(static_cast<uint8_t>(kSentinel));
auto special = _mm_set1_epi8(static_cast<char>(kSentinel));
return BitMask<uint32_t, kWidth>(
static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8_fixed(special, ctrl))));
}
......@@ -435,13 +441,10 @@ struct GroupSse2Impl
// Returns the number of trailing empty or deleted elements in the group.
// ----------------------------------------------------------------------
uint32_t CountLeadingEmptyOrDeleted() const {
auto special = _mm_set1_epi8(static_cast<uint8_t>(kSentinel));
auto special = _mm_set1_epi8(static_cast<char>(kSentinel));
return TrailingZeros(
static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8_fixed(special, ctrl)) + 1));
}
#ifdef __INTEL_COMPILER
#pragma warning pop
#endif
// ----------------------------------------------------------------------
void ConvertSpecialToEmptyAndFullToDeleted(ctrl_t* dst) const {
......@@ -577,8 +580,7 @@ inline size_t CapacityToGrowth(size_t capacity)
assert(IsValidCapacity(capacity));
// `capacity*7/8`
PHMAP_IF_CONSTEXPR (Group::kWidth == 8) {
if (capacity == 7)
{
if (capacity == 7) {
// x-x/8 does not work when x==7.
return 6;
}
......@@ -594,8 +596,7 @@ inline size_t GrowthToLowerboundCapacity(size_t growth)
{
// `growth*8/7`
PHMAP_IF_CONSTEXPR (Group::kWidth == 8) {
if (growth == 7)
{
if (growth == 7) {
// x+(x-1)/7 does not work when x==7.
return 8;
}
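    // worked example (Group::kWidth == 16): CapacityToGrowth(15) = 15 - 15/8 = 14,
    // and GrowthToLowerboundCapacity(14) = 14 + 13/7 = 15, so the two round-trip;
    // the `== 7` special cases patch the one width-8 value where
    // x - x/8 and x + (x-1)/7 fail to invert each other.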
......@@ -959,7 +960,7 @@ public:
return tmp;
}
#if PHMAP_BIDIRECTIONAL
#if 0 // PHMAP_BIDIRECTIONAL
// PRECONDITION: not a begin() iterator.
iterator& operator--() {
assert(ctrl_);
......@@ -1186,10 +1187,10 @@ public:
size_(phmap::exchange(that.size_, 0)),
capacity_(phmap::exchange(that.capacity_, 0)),
infoz_(phmap::exchange(that.infoz_, HashtablezInfoHandle())),
// Hash, equality and allocator are copied instead of moved because
// `that` must be left valid. If Hash is std::function<Key>, moving it
// would create a nullptr functor that cannot be called.
settings_(that.settings_) {
// Hash, equality and allocator are copied instead of moved because
// `that` must be left valid. If Hash is std::function<Key>, moving it
// would create a nullptr functor that cannot be called.
settings_(std::move(that.settings_)) {
// growth_left was copied above, reset the one from `that`.
that.growth_left() = 0;
}
......@@ -1244,7 +1245,7 @@ public:
}
iterator end()
{
#if PHMAP_BIDIRECTIONAL
#if 0 // PHMAP_BIDIRECTIONAL
return iterator_at(capacity_);
#else
return {ctrl_ + capacity_};
......@@ -1264,21 +1265,16 @@ public:
size_t max_size() const { return (std::numeric_limits<size_t>::max)(); }
PHMAP_ATTRIBUTE_REINITIALIZES void clear() {
// Iterating over this container is O(bucket_count()). When bucket_count()
// is much greater than size(), iteration becomes prohibitively expensive.
// For clear() it is more important to reuse the allocated array when the
// container is small because allocation takes comparatively long time
// compared to destruction of the elements of the container. So we pick the
// largest bucket_count() threshold for which iteration is still fast and
// past that we simply deallocate the array.
if (empty())
return;
if (capacity_ > 127) {
destroy_slots();
} else if (capacity_) {
for (size_t i = 0; i != capacity_; ++i) {
if (IsFull(ctrl_[i])) {
PolicyTraits::destroy(&alloc_ref(), slots_ + i);
if (capacity_) {
PHMAP_IF_CONSTEXPR((!std::is_trivially_destructible<typename PolicyTraits::value_type>::value ||
std::is_same<typename Policy::is_flat, std::false_type>::value)) {
// node map or not trivially destructible... we need to iterate and destroy values one by one
for (size_t i = 0; i != capacity_; ++i) {
if (IsFull(ctrl_[i])) {
PolicyTraits::destroy(&alloc_ref(), slots_ + i);
}
}
}
size_ = 0;
......@@ -1449,10 +1445,9 @@ public:
// This overload kicks in if we cannot deduce the key from args. It constructs
// value_type unconditionally and then either moves it into the table or
// destroys.
template <class... Args, typename std::enable_if<
!IsDecomposable<Args...>::value, int>::type = 0>
template <class... Args, typename std::enable_if<!IsDecomposable<Args...>::value, int>::type = 0>
std::pair<iterator, bool> emplace(Args&&... args) {
typename std::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type
typename phmap::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type
raw;
slot_type* slot = reinterpret_cast<slot_type*>(&raw);
......@@ -1463,7 +1458,7 @@ public:
template <class... Args, typename std::enable_if<!IsDecomposable<Args...>::value, int>::type = 0>
std::pair<iterator, bool> emplace_with_hash(size_t hashval, Args&&... args) {
typename std::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type raw;
typename phmap::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type raw;
slot_type* slot = reinterpret_cast<slot_type*>(&raw);
PolicyTraits::construct(&alloc_ref(), slot, std::forward<Args>(args)...);
......@@ -1508,6 +1503,10 @@ public:
friend class raw_hash_set;
public:
slot_type* slot() const {
return *slot_;
}
template <class... Args>
void operator()(Args&&... args) const {
assert(*slot_);
......@@ -1522,22 +1521,39 @@ public:
slot_type** slot_;
};
// Extension API: support for lazy emplace.
// Looks up key in the table. If found, returns the iterator to the element.
// Otherwise calls f with one argument of type raw_hash_set::constructor. f
// MUST call raw_hash_set::constructor with arguments as if a
// raw_hash_set::value_type is constructed, otherwise the behavior is
// undefined.
//
// For example:
//
    // std::unordered_set<ArenaString> s;
    // // Makes ArenaString even if "abc" is in the map.
    // s.insert(ArenaString(&arena, "abc"));
    //
    // flat_hash_set<ArenaString> s;
    // // Makes ArenaString only if "abc" is not in the map.
// s.lazy_emplace("abc", [&](const constructor& ctor) {
// ctor(&arena, "abc");
// });
// -----------------------------------------------------
template <class K = key_type, class F>
iterator lazy_emplace(const key_arg<K>& key, F&& f) {
auto res = find_or_prepare_insert(key);
if (res.second) {
lazy_emplace_at(res.first, std::forward<F>(f));
}
return iterator_at(res.first);
return lazy_emplace_with_hash(key, this->hash(key), std::forward<F>(f));
}
template <class K = key_type, class F>
iterator lazy_emplace_with_hash(const key_arg<K>& key, size_t hashval, F&& f) {
auto res = find_or_prepare_insert(key, hashval);
if (res.second) {
lazy_emplace_at(res.first, std::forward<F>(f));
size_t offset = _find_key(key, hashval);
if (offset == (size_t)-1) {
offset = prepare_insert(hashval);
lazy_emplace_at(offset, std::forward<F>(f));
this->set_ctrl(offset, H2(hashval));
}
return iterator_at(res.first);
return iterator_at(offset);
}
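All insertion paths in this commit follow the same refactored protocol, sketched below: look the key up first, and publish the new slot's control byte only after the value is constructed (construct_value_at stands in for whichever construction call the specific member uses):
size_t offset = _find_key(key, hashval);   // (size_t)-1 means not found
if (offset == (size_t)-1) {
    offset = prepare_insert(hashval);      // reserves a slot; no longer sets ctrl
    construct_value_at(offset);            // e.g. lazy_emplace_at / emplace_at
    set_ctrl(offset, H2(hashval));         // publish: slot now visible as FULL
}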
template <class K = key_type, class F>
......@@ -1549,11 +1565,13 @@ public:
template <class K = key_type, class F>
void emplace_single_with_hash(const key_arg<K>& key, size_t hashval, F&& f) {
auto res = find_or_prepare_insert(key, hashval);
if (res.second)
lazy_emplace_at(res.first, std::forward<F>(f));
else
_erase(iterator_at(res.first));
size_t offset = _find_key(key, hashval);
if (offset == (size_t)-1) {
offset = prepare_insert(hashval);
lazy_emplace_at(offset, std::forward<F>(f));
this->set_ctrl(offset, H2(hashval));
} else
_erase(iterator_at(offset));
}
......@@ -1649,7 +1667,7 @@ public:
void swap(raw_hash_set& that) noexcept(
IsNoThrowSwappable<hasher>() && IsNoThrowSwappable<key_equal>() &&
(!AllocTraits::propagate_on_container_swap::value ||
IsNoThrowSwappable<allocator_type>())) {
IsNoThrowSwappable<allocator_type>(typename AllocTraits::propagate_on_container_swap{}))) {
using std::swap;
swap(ctrl_, that.ctrl_);
swap(slots_, that.slots_);
......@@ -1659,12 +1677,7 @@ public:
swap(hash_ref(), that.hash_ref());
swap(eq_ref(), that.eq_ref());
swap(infoz_, that.infoz_);
if (AllocTraits::propagate_on_container_swap::value) {
swap(alloc_ref(), that.alloc_ref());
} else {
// If the allocators do not compare equal it is officially undefined
// behavior. We choose to do nothing.
}
SwapAlloc(alloc_ref(), that.alloc_ref(), typename AllocTraits::propagate_on_container_swap{});
}
#if !defined(PHMAP_NON_DETERMINISTIC)
......@@ -1795,7 +1808,7 @@ public:
size_t bucket_count() const { return capacity_; }
float load_factor() const {
return capacity_ ? static_cast<double>(size()) / capacity_ : 0.0;
return capacity_ ? static_cast<float>(static_cast<double>(size()) / capacity_) : 0.0f;
}
float max_load_factor() const { return 1.0f; }
void max_load_factor(float) {
......@@ -1886,11 +1899,14 @@ private:
std::pair<iterator, bool> emplace_decomposable(const K& key, size_t hashval,
Args&&... args)
{
auto res = find_or_prepare_insert(key, hashval);
if (res.second) {
emplace_at(res.first, std::forward<Args>(args)...);
size_t offset = _find_key(key, hashval);
if (offset == (size_t)-1) {
offset = prepare_insert(hashval);
emplace_at(offset, std::forward<Args>(args)...);
this->set_ctrl(offset, H2(hashval));
return {iterator_at(offset), true};
}
return {iterator_at(res.first), res.second};
return {iterator_at(offset), false};
}
struct EmplaceDecomposable
......@@ -1916,9 +1932,11 @@ private:
{
template <class K, class... Args>
std::pair<iterator, bool> operator()(const K& key, Args&&...) && {
auto res = s.find_or_prepare_insert(key);
size_t hashval = s.hash(key);
auto res = s.find_or_prepare_insert(key, hashval);
if (res.second) {
PolicyTraits::transfer(&s.alloc_ref(), s.slots_ + res.first, &slot);
s.set_ctrl(res.first, H2(hashval));
} else if (do_destroy) {
PolicyTraits::destroy(&s.alloc_ref(), &slot);
}
......@@ -1937,6 +1955,7 @@ private:
auto res = s.find_or_prepare_insert(key, hashval);
if (res.second) {
PolicyTraits::transfer(&s.alloc_ref(), s.slots_ + res.first, &slot);
s.set_ctrl(res.first, H2(hashval));
} else if (do_destroy) {
PolicyTraits::destroy(&s.alloc_ref(), &slot);
}
......@@ -1991,12 +2010,19 @@ private:
}
void destroy_slots() {
if (!capacity_) return;
for (size_t i = 0; i != capacity_; ++i) {
if (IsFull(ctrl_[i])) {
PolicyTraits::destroy(&alloc_ref(), slots_ + i);
if (!capacity_)
return;
PHMAP_IF_CONSTEXPR((!std::is_trivially_destructible<typename PolicyTraits::value_type>::value ||
std::is_same<typename Policy::is_flat, std::false_type>::value)) {
// node map, or not trivially destructible... we need to iterate and destroy values one by one
for (size_t i = 0; i != capacity_; ++i) {
if (IsFull(ctrl_[i])) {
PolicyTraits::destroy(&alloc_ref(), slots_ + i);
}
}
}
}
auto layout = MakeLayout(capacity_);
// Unpoison before returning the memory to the allocator.
SanitizerUnpoisonMemoryRegion(slots_, sizeof(slot_type) * capacity_);
......@@ -2055,7 +2081,7 @@ private:
// mark target as FULL
// repeat procedure for current slot with moved from element (target)
ConvertDeletedToEmptyAndFullToDeleted(ctrl_, capacity_);
typename std::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type
typename phmap::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type
raw;
slot_type* slot = reinterpret_cast<slot_type*>(&raw);
for (size_t i = 0; i != capacity_; ++i) {
......@@ -2172,7 +2198,7 @@ private:
protected:
template <class K>
std::pair<size_t, bool> find_or_prepare_insert(const K& key, size_t hashval) {
size_t _find_key(const K& key, size_t hashval) {
auto seq = probe(hashval);
while (true) {
Group g{ctrl_ + seq.offset()};
......@@ -2180,17 +2206,20 @@ protected:
if (PHMAP_PREDICT_TRUE(PolicyTraits::apply(
EqualElement<K>{key, eq_ref()},
PolicyTraits::element(slots_ + seq.offset((size_t)i)))))
return {seq.offset((size_t)i), false};
return seq.offset((size_t)i);
}
if (PHMAP_PREDICT_TRUE(g.MatchEmpty())) break;
seq.next();
}
return {prepare_insert(hashval), true};
return (size_t)-1;
}
template <class K>
std::pair<size_t, bool> find_or_prepare_insert(const K& key) {
return find_or_prepare_insert(key, this->hash(key));
std::pair<size_t, bool> find_or_prepare_insert(const K& key, size_t hashval) {
size_t offset = _find_key(key, hashval);
if (offset == (size_t)-1)
return {prepare_insert(hashval), true};
return {offset, false};
}
size_t prepare_insert(size_t hashval) PHMAP_ATTRIBUTE_NOINLINE {
......@@ -2202,7 +2231,7 @@ protected:
}
++size_;
growth_left() -= IsEmpty(ctrl_[target.offset]);
set_ctrl(target.offset, H2(hashval));
        // set_ctrl is intentionally no longer called here: the caller publishes the
        // slot by calling set_ctrl(offset, H2(hashval)) after the value is constructed
infoz_.RecordInsert(hashval, target.probe_length);
return target.offset;
}
......@@ -2219,33 +2248,19 @@ protected:
void emplace_at(size_t i, Args&&... args) {
PolicyTraits::construct(&alloc_ref(), slots_ + i,
std::forward<Args>(args)...);
#ifdef PHMAP_CHECK_CONSTRUCTED_VALUE
// this check can be costly, so do it only when requested
assert(PolicyTraits::apply(FindElement{*this}, *iterator_at(i)) ==
iterator_at(i) &&
"constructed value does not match the lookup key");
#endif
}
iterator iterator_at(size_t i) { return {ctrl_ + i, slots_ + i}; }
const_iterator iterator_at(size_t i) const { return {ctrl_ + i, slots_ + i}; }
private:
friend struct RawHashSetTestOnlyAccess;
probe_seq<Group::kWidth> probe(size_t hashval) const {
return probe_seq<Group::kWidth>(H1(hashval, ctrl_), capacity_);
}
// Reset all ctrl bytes back to kEmpty, except the sentinel.
void reset_ctrl(size_t capacity) {
std::memset(ctrl_, kEmpty, capacity + Group::kWidth);
ctrl_[capacity] = kSentinel;
SanitizerPoisonMemoryRegion(slots_, sizeof(slot_type) * capacity);
}
void reset_growth_left(size_t capacity) {
growth_left() = CapacityToGrowth(capacity) - size_;
}
protected:
// Sets the control byte, and if `i < Group::kWidth`, set the cloned byte at
// the end too.
void set_ctrl(size_t i, ctrl_t h) {
......@@ -2262,7 +2277,27 @@ private:
((Group::kWidth - 1) & capacity_)] = h;
}
size_t& growth_left() { return settings_.template get<0>(); }
private:
friend struct RawHashSetTestOnlyAccess;
probe_seq<Group::kWidth> probe(size_t hashval) const {
return probe_seq<Group::kWidth>(H1(hashval, ctrl_), capacity_);
}
// Reset all ctrl bytes back to kEmpty, except the sentinel.
void reset_ctrl(size_t new_capacity) {
std::memset(ctrl_, kEmpty, new_capacity + Group::kWidth);
ctrl_[new_capacity] = kSentinel;
SanitizerPoisonMemoryRegion(slots_, sizeof(slot_type) * new_capacity);
}
void reset_growth_left(size_t new_capacity) {
growth_left() = CapacityToGrowth(new_capacity) - size_;
}
size_t& growth_left() { return std::get<0>(settings_); }
const size_t& growth_left() const { return std::get<0>(settings_); }
template <size_t N,
template <class, class, class, class> class RefSet,
......@@ -2290,13 +2325,13 @@ private:
// small tables.
bool is_small() const { return capacity_ < Group::kWidth - 1; }
hasher& hash_ref() { return settings_.template get<1>(); }
const hasher& hash_ref() const { return settings_.template get<1>(); }
key_equal& eq_ref() { return settings_.template get<2>(); }
const key_equal& eq_ref() const { return settings_.template get<2>(); }
allocator_type& alloc_ref() { return settings_.template get<3>(); }
hasher& hash_ref() { return std::get<1>(settings_); }
const hasher& hash_ref() const { return std::get<1>(settings_); }
key_equal& eq_ref() { return std::get<2>(settings_); }
const key_equal& eq_ref() const { return std::get<2>(settings_); }
allocator_type& alloc_ref() { return std::get<3>(settings_); }
const allocator_type& alloc_ref() const {
return settings_.template get<3>();
return std::get<3>(settings_);
}
// TODO(alkis): Investigate removing some of these fields:
......@@ -2307,9 +2342,8 @@ private:
size_t size_ = 0; // number of full slots
size_t capacity_ = 0; // total number of slots
HashtablezInfoHandle infoz_;
phmap::priv::CompressedTuple<size_t /* growth_left */, hasher,
key_equal, allocator_type>
settings_{0, hasher{}, key_equal{}, allocator_type{}};
std::tuple<size_t /* growth_left */, hasher, key_equal, allocator_type>
settings_{0, hasher{}, key_equal{}, allocator_type{}};
};
......@@ -2456,22 +2490,31 @@ public:
private:
template <class K, class V>
std::pair<iterator, bool> insert_or_assign_impl(K&& k, V&& v) {
auto res = this->find_or_prepare_insert(k);
if (res.second)
this->emplace_at(res.first, std::forward<K>(k), std::forward<V>(v));
else
Policy::value(&*this->iterator_at(res.first)) = std::forward<V>(v);
return {this->iterator_at(res.first), res.second};
size_t hashval = this->hash(k);
size_t offset = this->_find_key(k, hashval);
if (offset == (size_t)-1) {
offset = this->prepare_insert(hashval);
this->emplace_at(offset, std::forward<K>(k), std::forward<V>(v));
this->set_ctrl(offset, H2(hashval));
return {this->iterator_at(offset), true};
}
Policy::value(&*this->iterator_at(offset)) = std::forward<V>(v);
return {this->iterator_at(offset), false};
}
template <class K = key_type, class... Args>
std::pair<iterator, bool> try_emplace_impl(K&& k, Args&&... args) {
auto res = this->find_or_prepare_insert(k);
if (res.second)
this->emplace_at(res.first, std::piecewise_construct,
size_t hashval = this->hash(k);
size_t offset = this->_find_key(k, hashval);
if (offset == (size_t)-1) {
offset = this->prepare_insert(hashval);
this->emplace_at(offset, std::piecewise_construct,
std::forward_as_tuple(std::forward<K>(k)),
std::forward_as_tuple(std::forward<Args>(args)...));
return {this->iterator_at(res.first), res.second};
this->set_ctrl(offset, H2(hashval));
return {this->iterator_at(offset), true};
}
return {this->iterator_at(offset), false};
}
};
......@@ -2537,7 +2580,11 @@ public:
using key_arg = typename KeyArgImpl::template type<K, key_type>;
protected:
using Lockable = phmap::LockableImpl<Mtx_>;
using Lockable = phmap::LockableImpl<Mtx_>;
using UniqueLock = typename Lockable::UniqueLock;
using SharedLock = typename Lockable::SharedLock;
using ReadWriteLock = typename Lockable::ReadWriteLock;
// --------------------------------------------------------------------
struct Inner : public Lockable
......@@ -2588,9 +2635,7 @@ private:
// --------------------------------------------------------------------
template <class T>
using RequiresInsertable = typename std::enable_if<
phmap::disjunction<std::is_convertible<T, init_type>,
SameAsElementReference<T>>::value,
int>::type;
phmap::disjunction<std::is_convertible<T, init_type>, SameAsElementReference<T>>::value, int>::type;
// RequiresNotInit is a workaround for gcc prior to 7.1.
// See https://godbolt.org/g/Y4xsUh.
......@@ -2917,7 +2962,7 @@ public:
PHMAP_ATTRIBUTE_REINITIALIZES void clear() {
for (auto& inner : sets_)
{
typename Lockable::UniqueLock m(inner);
UniqueLock m(inner);
inner.set_.clear();
}
}
......@@ -2926,7 +2971,7 @@ public:
// ----------------------------------------
void clear(std::size_t submap_index) {
Inner& inner = sets_[submap_index];
typename Lockable::UniqueLock m(inner);
UniqueLock m(inner);
inner.set_.clear();
}
......@@ -3019,11 +3064,11 @@ public:
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UniqueLock m(inner);
UniqueLock m(inner);
auto res = set.insert(std::move(node), hashval);
return { make_iterator(&inner, res.position),
res.inserted,
res.inserted ? node_type() : std::move(res.node) };
res.inserted,
res.inserted ? node_type() : std::move(res.node) };
}
iterator insert(const_iterator, node_type&& node) {
......@@ -3043,15 +3088,6 @@ public:
// ----------------------------------
// same as emplace, but hashval is provided
// --------------------------------------------------------------------
template <class K, class... Args>
std::pair<iterator, bool> emplace_decomposable_with_hash(const K& key, size_t hashval, Args&&... args)
{
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UniqueLock m(inner);
return make_rv(&inner, set.emplace_decomposable(key, hashval, std::forward<Args>(args)...));
}
struct EmplaceDecomposableHashval
{
template <class K, class... Args>
......@@ -3072,8 +3108,7 @@ public:
// // Creates no std::string copies and makes no heap allocations.
// m.emplace("abc", "xyz");
// --------------------------------------------------------------------
template <class... Args, typename std::enable_if<
IsDecomposable<Args...>::value, int>::type = 0>
template <class... Args, typename std::enable_if<IsDecomposable<Args...>::value, int>::type = 0>
std::pair<iterator, bool> emplace_with_hash(size_t hashval, Args&&... args) {
return PolicyTraits::apply(EmplaceDecomposableHashval{*this, hashval},
std::forward<Args>(args)...);
......@@ -3083,19 +3118,17 @@ public:
// value_type unconditionally and then either moves it into the table or
// destroys.
// --------------------------------------------------------------------
template <class... Args, typename std::enable_if<
!IsDecomposable<Args...>::value, int>::type = 0>
template <class... Args, typename std::enable_if<!IsDecomposable<Args...>::value, int>::type = 0>
std::pair<iterator, bool> emplace_with_hash(size_t hashval, Args&&... args) {
typename std::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type raw;
typename phmap::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type raw;
slot_type* slot = reinterpret_cast<slot_type*>(&raw);
PolicyTraits::construct(&alloc_ref(), slot, std::forward<Args>(args)...);
const auto& elem = PolicyTraits::element(slot);
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UniqueLock m(inner);
typename EmbeddedSet::template InsertSlotWithHash<true> f {
inner, std::move(*slot), hashval};
UniqueLock m(inner);
typename EmbeddedSet::template InsertSlotWithHash<true> f { inner, std::move(*slot), hashval };
return make_rv(PolicyTraits::apply(f, elem));
}
......@@ -3104,26 +3137,36 @@ public:
return emplace_with_hash(hashval, std::forward<Args>(args)...).first;
}
template <class K = key_type, class F>
iterator lazy_emplace_with_hash(const key_arg<K>& key, size_t hashval, F&& f) {
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UniqueLock m(inner);
return make_iterator(&inner, set.lazy_emplace_with_hash(key, hashval, std::forward<F>(f)));
}
// --------------------------------------------------------------------
// end of phmap extension
// --------------------------------------------------------------------
template <class K, class... Args>
std::pair<iterator, bool> emplace_decomposable(const K& key, Args&&... args)
std::pair<iterator, bool> emplace_decomposable_with_hash(const K& key, size_t hashval, Args&&... args)
{
size_t hashval = this->hash(key);
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UniqueLock m(inner);
return make_rv(&inner, set.emplace_decomposable(key, hashval, std::forward<Args>(args)...));
ReadWriteLock m(inner);
size_t offset = set._find_key(key, hashval);
if (offset == (size_t)-1 && m.switch_to_unique()) {
// we did an unlock/lock, and another thread could have inserted the same key, so we need to
// do a find() again.
offset = set._find_key(key, hashval);
}
if (offset == (size_t)-1) {
offset = set.prepare_insert(hashval);
set.emplace_at(offset, std::forward<Args>(args)...);
set.set_ctrl(offset, H2(hashval));
return make_rv(&inner, {set.iterator_at(offset), true});
}
return make_rv(&inner, {set.iterator_at(offset), false});
}
template <class K, class... Args>
std::pair<iterator, bool> emplace_decomposable(const K& key, Args&&... args)
{
return emplace_decomposable_with_hash(key, this->hash(key), std::forward<Args>(args)...);
}
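The parallel containers combine that protocol with a read-then-upgrade lock, as in the hunk above: the submap is locked shared for the lookup, and switch_to_unique() upgrades to exclusive only when an insert is actually needed. Judging by the inline comments, switch_to_unique() returns true when the shared lock had to be released before the exclusive lock was acquired, so the lookup must be repeated to catch a key inserted by another thread in that window:
ReadWriteLock m(inner);                          // starts as a shared (read) lock
size_t offset = set._find_key(key, hashval);
if (offset == (size_t)-1 && m.switch_to_unique())
    offset = set._find_key(key, hashval);        // lock was dropped: re-check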
struct EmplaceDecomposable
......@@ -3145,31 +3188,27 @@ public:
// // Creates no std::string copies and makes no heap allocations.
// m.emplace("abc", "xyz");
// --------------------------------------------------------------------
template <class... Args, typename std::enable_if<
IsDecomposable<Args...>::value, int>::type = 0>
template <class... Args, typename std::enable_if<IsDecomposable<Args...>::value, int>::type = 0>
std::pair<iterator, bool> emplace(Args&&... args) {
return PolicyTraits::apply(EmplaceDecomposable{*this},
std::forward<Args>(args)...);
return PolicyTraits::apply(EmplaceDecomposable{*this}, std::forward<Args>(args)...);
}
// This overload kicks in if we cannot deduce the key from args. It constructs
// value_type unconditionally and then either moves it into the table or
// destroys.
// --------------------------------------------------------------------
template <class... Args, typename std::enable_if<
!IsDecomposable<Args...>::value, int>::type = 0>
template <class... Args, typename std::enable_if<!IsDecomposable<Args...>::value, int>::type = 0>
std::pair<iterator, bool> emplace(Args&&... args) {
typename std::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type raw;
typename phmap::aligned_storage<sizeof(slot_type), alignof(slot_type)>::type raw;
slot_type* slot = reinterpret_cast<slot_type*>(&raw);
size_t hashval = this->hash(PolicyTraits::key(slot));
PolicyTraits::construct(&alloc_ref(), slot, std::forward<Args>(args)...);
const auto& elem = PolicyTraits::element(slot);
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UniqueLock m(inner);
typename EmbeddedSet::template InsertSlotWithHash<true> f {
inner, std::move(*slot), hashval};
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
UniqueLock m(inner);
typename EmbeddedSet::template InsertSlotWithHash<true> f { inner, std::move(*slot), hashval };
return make_rv(PolicyTraits::apply(f, elem));
}
......@@ -3194,12 +3233,27 @@ public:
// lazy_emplace
// ------------
template <class K = key_type, class F>
iterator lazy_emplace(const key_arg<K>& key, F&& f) {
auto hashval = this->hash(key);
iterator lazy_emplace_with_hash(const key_arg<K>& key, size_t hashval, F&& f) {
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UniqueLock m(inner);
return make_iterator(&inner, set.lazy_emplace_with_hash(key, hashval, std::forward<F>(f)));
ReadWriteLock m(inner);
size_t offset = set._find_key(key, hashval);
if (offset == (size_t)-1 && m.switch_to_unique()) {
// we did an unlock/lock, and another thread could have inserted the same key, so we need to
// do a find() again.
offset = set._find_key(key, hashval);
}
if (offset == (size_t)-1) {
offset = set.prepare_insert(hashval);
set.lazy_emplace_at(offset, std::forward<F>(f));
set.set_ctrl(offset, H2(hashval));
}
return make_iterator(&inner, set.iterator_at(offset));
}
template <class K = key_type, class F>
iterator lazy_emplace(const key_arg<K>& key, F&& f) {
return lazy_emplace_with_hash(key, this->hash(key), std::forward<F>(f));
}
// emplace_single
......@@ -3208,14 +3262,13 @@ public:
void emplace_single_with_hash(const key_arg<K>& key, size_t hashval, F&& f) {
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UniqueLock m(inner);
UniqueLock m(inner);
set.emplace_single_with_hash(key, hashval, std::forward<F>(f));
}
template <class K = key_type, class F>
void emplace_single(const key_arg<K>& key, F&& f) {
auto hashval = this->hash(key);
emplace_single_with_hash<K, F>(key, hashval, std::forward<F>(f));
emplace_single_with_hash<K, F>(key, this->hash(key), std::forward<F>(f));
}
// if set contains key, lambda is called with the value_type (under read lock protection),
......@@ -3224,7 +3277,7 @@ public:
template <class K = key_type, class F>
bool if_contains(const key_arg<K>& key, F&& f) const {
return const_cast<parallel_hash_set*>(this)->template
modify_if_impl<K, F, typename Lockable::SharedLock>(key, std::forward<F>(f));
modify_if_impl<K, F, SharedLock>(key, std::forward<F>(f));
}
// if set contains key, lambda is called with the value_type without read lock protection,
......@@ -3242,7 +3295,7 @@ public:
// ----------------------------------------------------------------------------------------------------
template <class K = key_type, class F>
bool modify_if(const key_arg<K>& key, F&& f) {
return modify_if_impl<K, F, typename Lockable::UniqueLock>(key, std::forward<F>(f));
return modify_if_impl<K, F, UniqueLock>(key, std::forward<F>(f));
}
// -----------------------------------------------------------------------------------------
......@@ -3266,23 +3319,33 @@ public:
// ----------------------------------------------------------------------------------------------------
template <class K = key_type, class F>
bool erase_if(const key_arg<K>& key, F&& f) {
return erase_if_impl<K, F, typename Lockable::UniqueLock>(key, std::forward<F>(f));
return !!erase_if_impl<K, F, ReadWriteLock>(key, std::forward<F>(f));
}
template <class K = key_type, class F, class L>
bool erase_if_impl(const key_arg<K>& key, F&& f) {
size_type erase_if_impl(const key_arg<K>& key, F&& f) {
#if __cplusplus >= 201703L
static_assert(std::is_invocable<F, value_type&>::value);
#endif
L m;
auto it = this->template find<K, L>(key, this->hash(key), m);
if (it == this->end()) return false;
auto hashval = this->hash(key);
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
L m(inner);
auto it = set.find(key, hashval);
if (it == set.end())
return 0;
if (m.switch_to_unique()) {
// we did an unlock/lock, need to call `find()` again
it = set.find(key, hashval);
if (it == set.end())
return 0;
}
if (std::forward<F>(f)(const_cast<value_type &>(*it)))
{
this->erase(it);
return true;
set._erase(it);
return 1;
}
return false;
return 0;
}
// if map already contains key, the first lambda is called with the mapped value (under
......@@ -3293,14 +3356,18 @@ public:
// ---------------------------------------------------------------------------------------
template <class K = key_type, class FExists, class FEmplace>
bool lazy_emplace_l(const key_arg<K>& key, FExists&& fExists, FEmplace&& fEmplace) {
typename Lockable::UniqueLock m;
auto res = this->find_or_prepare_insert(key, m);
size_t hashval = this->hash(key);
ReadWriteLock m;
auto res = this->find_or_prepare_insert_with_hash(hashval, key, m);
Inner* inner = std::get<0>(res);
if (std::get<2>(res))
if (std::get<2>(res)) {
// key not found. call fEmplace lambda which should invoke passed constructor
inner->set_.lazy_emplace_at(std::get<1>(res), std::forward<FEmplace>(fEmplace));
else {
inner->set_.set_ctrl(std::get<1>(res), H2(hashval));
} else {
// key found. Call fExists lambda. In case of the set, non "key" part of value_type can be changed
auto it = this->iterator_at(inner, inner->set_.iterator_at(std::get<1>(res)));
std::forward<FExists>(fExists)(const_cast<value_type &>(*it)); // in case of the set, non "key" part of value_type can be changed
std::forward<FExists>(fExists)(const_cast<value_type &>(*it));
}
return std::get<2>(res);
}
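A usage sketch (for a map, value_type is pair<const K, V>, so the exists-lambda may mutate only the mapped part):
phmap::parallel_flat_hash_map_m<int, int> m;
m.lazy_emplace_l(5,
    [](auto& kv) { ++kv.second; },            // key 5 found: update under the submap lock
    [](const auto& ctor) { ctor(5, 1); });    // key 5 absent: construct {5, 1}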
......@@ -3315,19 +3382,70 @@ public:
template <class F>
void for_each(F&& fCallback) const {
for (auto const& inner : sets_) {
typename Lockable::SharedLock m(const_cast<Inner&>(inner));
SharedLock m(const_cast<Inner&>(inner));
std::for_each(inner.set_.begin(), inner.set_.end(), fCallback);
}
}
    // this version allows the values to be modified
void for_each_m(std::function<void (value_type&)> && fCallback) {
template <class F>
void for_each_m(F&& fCallback) {
for (auto& inner : sets_) {
typename Lockable::UniqueLock m(const_cast<Inner&>(inner));
UniqueLock m(inner);
std::for_each(inner.set_.begin(), inner.set_.end(), fCallback);
}
}
#if __cplusplus >= 201703L
template <class ExecutionPolicy, class F>
void for_each(ExecutionPolicy&& policy, F&& fCallback) const {
std::for_each(
std::forward<ExecutionPolicy>(policy), sets_.begin(), sets_.end(),
[&](auto const& inner) {
SharedLock m(const_cast<Inner&>(inner));
std::for_each(inner.set_.begin(), inner.set_.end(), fCallback);
}
);
}
template <class ExecutionPolicy, class F>
void for_each_m(ExecutionPolicy&& policy, F&& fCallback) {
std::for_each(
std::forward<ExecutionPolicy>(policy), sets_.begin(), sets_.end(),
[&](auto& inner) {
UniqueLock m(inner);
std::for_each(inner.set_.begin(), inner.set_.end(), fCallback);
}
);
}
#endif
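A sketch of the parallel overload (assumes a parallel map `m` with an arithmetic mapped type; the callback runs concurrently across submaps, so shared state must be synchronized):
#include <atomic>
#include <execution>

std::atomic<long> sum{0};
m.for_each(std::execution::par,
           [&sum](const auto& kv) { sum += kv.second; });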
// Extension API: access internal submaps by index
// under lock protection
// ex: m.with_submap(i, [&](const Map::EmbeddedSet& set) {
// for (auto& p : set) { ...; }});
// -------------------------------------------------
template <class F>
void with_submap(size_t idx, F&& fCallback) const {
const Inner& inner = sets_[idx];
const auto& set = inner.set_;
SharedLock m(const_cast<Inner&>(inner));
fCallback(set);
}
template <class F>
void with_submap_m(size_t idx, F&& fCallback) {
Inner& inner = sets_[idx];
auto& set = inner.set_;
UniqueLock m(inner);
fCallback(set);
}
// unsafe, for internal use only
Inner& get_inner(size_t idx) {
return sets_[idx];
}
// Extension API: support for heterogeneous keys.
//
// std::unordered_set<std::string> s;
......@@ -3341,17 +3459,8 @@ public:
// --------------------------------------------------------------------
template <class K = key_type>
size_type erase(const key_arg<K>& key) {
auto hashval = this->hash(key);
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::UpgradeLock m(inner);
auto it = set.find(key, hashval);
if (it == set.end())
return 0;
typename Lockable::UpgradeToUnique unique(m);
set._erase(it);
return 1;
auto always_erase = [](const value_type&){ return true; };
return erase_if_impl<K, decltype(always_erase), ReadWriteLock>(key, std::move(always_erase));
}
// --------------------------------------------------------------------
......@@ -3372,11 +3481,11 @@ public:
//
// Do not use erase APIs taking iterators when accessing the map concurrently
// --------------------------------------------------------------------
void _erase(iterator it, bool do_lock = true) {
void _erase(iterator it) {
Inner* inner = it.inner_;
assert(inner != nullptr);
auto& set = inner->set_;
// typename Lockable::UniqueLock m(*inner); // don't lock here
// UniqueLock m(*inner); // don't lock here
set._erase(it.it_);
}
......@@ -3429,15 +3538,20 @@ public:
return it == end() ? node_type() : extract(const_iterator{it});
}
void swap(parallel_hash_set& that) noexcept(
IsNoThrowSwappable<EmbeddedSet>() &&
(!AllocTraits::propagate_on_container_swap::value ||
IsNoThrowSwappable<allocator_type>())) {
template<class Mtx2_>
void swap(parallel_hash_set<N, RefSet, Mtx2_, Policy, Hash, Eq, Alloc>& that)
noexcept(IsNoThrowSwappable<EmbeddedSet>() &&
(!AllocTraits::propagate_on_container_swap::value ||
IsNoThrowSwappable<allocator_type>(typename AllocTraits::propagate_on_container_swap{})))
{
using std::swap;
using Lockable2 = phmap::LockableImpl<Mtx2_>;
for (size_t i=0; i<num_tables; ++i)
{
typename Lockable::UniqueLocks l(sets_[i], that.sets_[i]);
swap(sets_[i].set_, that.sets_[i].set_);
typename Lockable::UniqueLock l(sets_[i]);
typename Lockable2::UniqueLock l2(that.get_inner(i));
swap(sets_[i].set_, that.get_inner(i).set_);
}
}
......@@ -3445,7 +3559,7 @@ public:
size_t nn = n / num_tables;
for (auto& inner : sets_)
{
typename Lockable::UniqueLock m(inner);
UniqueLock m(inner);
inner.set_.rehash(nn);
}
}
......@@ -3453,7 +3567,7 @@ public:
void reserve(size_t n)
{
size_t target = GrowthToLowerboundCapacity(n);
size_t normalized = 16 * NormalizeCapacity(n / num_tables);
size_t normalized = num_tables * NormalizeCapacity(n / num_tables);
rehash(normalized > target ? normalized : target);
}
......@@ -3481,7 +3595,7 @@ public:
void prefetch_hash(size_t hashval) const {
const Inner& inner = sets_[subidx(hashval)];
const auto& set = inner.set_;
typename Lockable::SharedLock m(const_cast<Inner&>(inner));
SharedLock m(const_cast<Inner&>(inner));
set.prefetch_hash(hashval);
}
......@@ -3500,7 +3614,7 @@ public:
// --------------------------------------------------------------------
template <class K = key_type>
iterator find(const key_arg<K>& key, size_t hashval) {
typename Lockable::SharedLock m;
SharedLock m;
return find(key, hashval, m);
}
......@@ -3548,7 +3662,7 @@ public:
size_t sz = 0;
for (const auto& inner : sets_)
{
typename Lockable::SharedLock m(const_cast<Inner&>(inner));
SharedLock m(const_cast<Inner&>(inner));
sz += inner.set_.bucket_count();
}
return sz;
......@@ -3576,8 +3690,11 @@ public:
return !(a == b);
}
template<class Mtx2_>
friend void swap(parallel_hash_set& a,
parallel_hash_set& b) noexcept(noexcept(a.swap(b))) {
parallel_hash_set<N, RefSet, Mtx2_, Policy, Hash, Eq, Alloc>& b)
noexcept(noexcept(a.swap(b)))
{
a.swap(b);
}
......@@ -3641,7 +3758,7 @@ private:
void drop_deletes_without_resize() PHMAP_ATTRIBUTE_NOINLINE {
for (auto& inner : sets_)
{
typename Lockable::UniqueLock m(inner);
UniqueLock m(inner);
inner.set_.drop_deletes_without_resize();
}
}
......@@ -3650,26 +3767,28 @@ private:
size_t hashval = PolicyTraits::apply(HashElement{hash_ref()}, elem);
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
typename Lockable::SharedLock m(const_cast<Inner&>(inner));
SharedLock m(const_cast<Inner&>(inner));
return set.has_element(elem, hashval);
}
// TODO(alkis): Optimize this assuming *this and that don't overlap.
// --------------------------------------------------------------------
parallel_hash_set& move_assign(parallel_hash_set&& that, std::true_type) {
parallel_hash_set tmp(std::move(that));
template<class Mtx2_>
parallel_hash_set& move_assign(parallel_hash_set<N, RefSet, Mtx2_, Policy, Hash, Eq, Alloc>&& that, std::true_type) {
parallel_hash_set<N, RefSet, Mtx2_, Policy, Hash, Eq, Alloc> tmp(std::move(that));
swap(tmp);
return *this;
}
parallel_hash_set& move_assign(parallel_hash_set&& that, std::false_type) {
parallel_hash_set tmp(std::move(that), alloc_ref());
template<class Mtx2_>
parallel_hash_set& move_assign(parallel_hash_set<N, RefSet, Mtx2_, Policy, Hash, Eq, Alloc>&& that, std::false_type) {
parallel_hash_set<N, RefSet, Mtx2_, Policy, Hash, Eq, Alloc> tmp(std::move(that), alloc_ref());
swap(tmp);
return *this;
}
protected:
template <class K = key_type, class L = typename Lockable::SharedLock>
template <class K = key_type, class L = SharedLock>
pointer find_ptr(const key_arg<K>& key, size_t hashval, L& mutexlock)
{
Inner& inner = sets_[subidx(hashval)];
......@@ -3678,7 +3797,7 @@ protected:
return set.find_ptr(key, hashval);
}
template <class K = key_type, class L = typename Lockable::SharedLock>
template <class K = key_type, class L = SharedLock>
iterator find(const key_arg<K>& key, size_t hashval, L& mutexlock) {
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
......@@ -3688,17 +3807,26 @@ protected:
template <class K>
std::tuple<Inner*, size_t, bool>
find_or_prepare_insert_with_hash(size_t hashval, const K& key, typename Lockable::UniqueLock &mutexlock) {
find_or_prepare_insert_with_hash(size_t hashval, const K& key, ReadWriteLock &mutexlock) {
Inner& inner = sets_[subidx(hashval)];
auto& set = inner.set_;
mutexlock = std::move(typename Lockable::UniqueLock(inner));
auto p = set.find_or_prepare_insert(key, hashval); // std::pair<size_t, bool>
return std::make_tuple(&inner, p.first, p.second);
mutexlock = std::move(ReadWriteLock(inner));
size_t offset = set._find_key(key, hashval);
if (offset == (size_t)-1 && mutexlock.switch_to_unique()) {
// we did an unlock/lock, and another thread could have inserted the same key, so we need to
// do a find() again.
offset = set._find_key(key, hashval);
}
if (offset == (size_t)-1) {
offset = set.prepare_insert(hashval);
return std::make_tuple(&inner, offset, true);
}
return std::make_tuple(&inner, offset, false);
}
template <class K>
std::tuple<Inner*, size_t, bool>
find_or_prepare_insert(const K& key, typename Lockable::UniqueLock &mutexlock) {
find_or_prepare_insert(const K& key, ReadWriteLock &mutexlock) {
return find_or_prepare_insert_with_hash<K>(this->hash(key), key, mutexlock);
}
......@@ -3766,7 +3894,10 @@ class parallel_hash_map : public parallel_hash_set<N, RefSet, Mtx_, Policy, Hash
KeyArg<IsTransparent<Eq>::value && IsTransparent<Hash>::value>;
using Base = typename parallel_hash_map::parallel_hash_set;
using Lockable = phmap::LockableImpl<Mtx_>;
using Lockable = phmap::LockableImpl<Mtx_>;
using UniqueLock = typename Lockable::UniqueLock;
using SharedLock = typename Lockable::SharedLock;
using ReadWriteLock = typename Lockable::ReadWriteLock;
public:
using key_type = typename Policy::key_type;
......@@ -3916,20 +4047,41 @@ public:
// ---------------------------------------------------------------------------------------
template <class K = key_type, class F, class... Args>
bool try_emplace_l(K&& k, F&& f, Args&&... args) {
typename Lockable::UniqueLock m;
auto res = this->find_or_prepare_insert(k, m);
size_t hashval = this->hash(k);
ReadWriteLock m;
auto res = this->find_or_prepare_insert_with_hash(hashval, k, m);
typename Base::Inner *inner = std::get<0>(res);
if (std::get<2>(res))
if (std::get<2>(res)) {
inner->set_.emplace_at(std::get<1>(res), std::piecewise_construct,
std::forward_as_tuple(std::forward<K>(k)),
std::forward_as_tuple(std::forward<Args>(args)...));
else {
inner->set_.set_ctrl(std::get<1>(res), H2(hashval));
} else {
auto it = this->iterator_at(inner, inner->set_.iterator_at(std::get<1>(res)));
std::forward<F>(f)(const_cast<value_type &>(*it)); // in case of the set, non "key" part of value_type can be changed
// call lambda. in case of the set, non "key" part of value_type can be changed
std::forward<F>(f)(const_cast<value_type &>(*it));
}
return std::get<2>(res);
}
    // Like try_emplace, but returns {pointer, bool} instead of {iterator, bool}.
    // Useful for node-based containers, where the pointer is not invalidated by
    // concurrent inserts and rehashes.
template <class K = key_type, class... Args>
std::pair<typename parallel_hash_map::parallel_hash_set::pointer, bool> try_emplace_p(K&& k, Args&&... args) {
size_t hashval = this->hash(k);
ReadWriteLock m;
auto res = this->find_or_prepare_insert_with_hash(hashval, k, m);
typename Base::Inner *inner = std::get<0>(res);
if (std::get<2>(res)) {
inner->set_.emplace_at(std::get<1>(res), std::piecewise_construct,
std::forward_as_tuple(std::forward<K>(k)),
std::forward_as_tuple(std::forward<Args>(args)...));
inner->set_.set_ctrl(std::get<1>(res), H2(hashval));
}
auto it = this->iterator_at(inner, inner->set_.iterator_at(std::get<1>(res)));
return {&*it, std::get<2>(res)};
}
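Usage sketch (C++17 structured bindings; per the comment above, with node-based maps the returned pointer stays valid across submap rehashes):
phmap::parallel_node_hash_map_m<std::string, int> m;
auto [ptr, inserted] = m.try_emplace_p("answer", 42);
if (!inserted)
    ptr->second += 1;   // pointer to the node, stable across rehashes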
// ----------- end of phmap extensions --------------------------
template <class K = key_type, class P = Policy, K* = nullptr>
......@@ -3946,12 +4098,14 @@ private:
template <class K, class V>
std::pair<iterator, bool> insert_or_assign_impl(K&& k, V&& v) {
typename Lockable::UniqueLock m;
auto res = this->find_or_prepare_insert(k, m);
size_t hashval = this->hash(k);
ReadWriteLock m;
auto res = this->find_or_prepare_insert_with_hash(hashval, k, m);
typename Base::Inner *inner = std::get<0>(res);
if (std::get<2>(res))
if (std::get<2>(res)) {
inner->set_.emplace_at(std::get<1>(res), std::forward<K>(k), std::forward<V>(v));
else
inner->set_.set_ctrl(std::get<1>(res), H2(hashval));
} else
Policy::value(&*inner->set_.iterator_at(std::get<1>(res))) = std::forward<V>(v);
return {this->iterator_at(inner, inner->set_.iterator_at(std::get<1>(res))),
std::get<2>(res)};
......@@ -3959,26 +4113,21 @@ private:
template <class K = key_type, class... Args>
std::pair<iterator, bool> try_emplace_impl(K&& k, Args&&... args) {
typename Lockable::UniqueLock m;
auto res = this->find_or_prepare_insert(k, m);
typename Base::Inner *inner = std::get<0>(res);
if (std::get<2>(res))
inner->set_.emplace_at(std::get<1>(res), std::piecewise_construct,
std::forward_as_tuple(std::forward<K>(k)),
std::forward_as_tuple(std::forward<Args>(args)...));
return {this->iterator_at(inner, inner->set_.iterator_at(std::get<1>(res))),
std::get<2>(res)};
return try_emplace_impl_with_hash(this->hash(k), std::forward<K>(k),
std::forward<Args>(args)...);
}
template <class K = key_type, class... Args>
std::pair<iterator, bool> try_emplace_impl_with_hash(size_t hashval, K&& k, Args&&... args) {
typename Lockable::UniqueLock m;
ReadWriteLock m;
auto res = this->find_or_prepare_insert_with_hash(hashval, k, m);
typename Base::Inner *inner = std::get<0>(res);
if (std::get<2>(res))
if (std::get<2>(res)) {
inner->set_.emplace_at(std::get<1>(res), std::piecewise_construct,
std::forward_as_tuple(std::forward<K>(k)),
std::forward_as_tuple(std::forward<Args>(args)...));
inner->set_.set_ctrl(std::get<1>(res), H2(hashval));
}
return {this->iterator_at(inner, inner->set_.iterator_at(std::get<1>(res))),
std::get<2>(res)};
}
......@@ -4097,6 +4246,7 @@ struct FlatHashSetPolicy
using key_type = T;
using init_type = T;
using constant_iterators = std::true_type;
using is_flat = std::true_type;
template <class Allocator, class... Args>
static void construct(Allocator* alloc, slot_type* slot, Args&&... args) {
......@@ -4139,6 +4289,7 @@ struct FlatHashMapPolicy
using key_type = K;
using mapped_type = V;
using init_type = std::pair</*non const*/ key_type, mapped_type>;
using is_flat = std::true_type;
template <class Allocator, class... Args>
static void construct(Allocator* alloc, slot_type* slot, Args&&... args) {
......@@ -4221,6 +4372,7 @@ struct NodeHashSetPolicy
using key_type = T;
using init_type = T;
using constant_iterators = std::true_type;
using is_flat = std::false_type;
template <class Allocator, class... Args>
static T* new_element(Allocator* alloc, Args&&... args) {
......@@ -4266,6 +4418,7 @@ public:
using key_type = Key;
using mapped_type = Value;
using init_type = std::pair</*non const*/ key_type, mapped_type>;
using is_flat = std::false_type;
template <class Allocator, class... Args>
static value_type* new_element(Allocator* alloc, Args&&... args) {
......@@ -4311,28 +4464,26 @@ public:
#if PHMAP_HAVE_STD_STRING_VIEW
// Support for char16_t, wchar_t, etc.
template<class CharT>
struct StringHashT
{
using is_transparent = void;
size_t operator()(std::basic_string_view<CharT> v) const {
std::string_view bv{reinterpret_cast<const char*>(v.data()), v.size() * sizeof(CharT)};
return std::hash<std::string_view>()(bv);
}
};
// Supports heterogeneous lookup for basic_string<T>-like elements.
template<class CharT>
struct StringHashEqT
{
using Hash = StringHashT<CharT>;
struct Hash
{
using is_transparent = void;
size_t operator()(std::basic_string_view<CharT> v) const {
std::string_view bv{
reinterpret_cast<const char*>(v.data()), v.size() * sizeof(CharT)};
return std::hash<std::string_view>()(bv);
}
};
struct Eq {
using is_transparent = void;
bool operator()(std::basic_string_view<CharT> lhs, std::basic_string_view<CharT> rhs) const {
bool operator()(std::basic_string_view<CharT> lhs,
std::basic_string_view<CharT> rhs) const {
return lhs == rhs;
}
};
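// ---------------------------------------------------------------------------
// Illustrative sketch (not library code) of what the transparent CharT
// hash/eq above enables, assuming PHMAP_HAVE_STD_STRING_VIEW: probing a
// std::u16string-keyed map with a std::u16string_view, without
// materializing a temporary std::u16string.
// ---------------------------------------------------------------------------
#include <cassert>
#include <string>
#include <string_view>
#include <parallel_hashmap/phmap.h>
int main() {
phmap::flat_hash_map<std::u16string, int> m;
m[u"alpha"] = 1;
std::u16string_view key = u"alpha";        // non-owning view
auto it = m.find(key);                     // heterogeneous lookup
assert(it != m.end() && it->second == 1);
}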
......@@ -4369,7 +4520,9 @@ struct HashEq<T*>
using is_transparent = void;
template <class U>
size_t operator()(const U& ptr) const {
return phmap::Hash<const T*>{}(HashEq::ToPtr(ptr));
// We want phmap::Hash<T*> and not phmap::Hash<const T*>,
// so that a user-provided "struct std::hash<T*>" override is picked up.
return phmap::Hash<T*>{}((T*)(uintptr_t)HashEq::ToPtr(ptr));
}
};
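// ---------------------------------------------------------------------------
// Illustrative sketch (not library code): the transparent pointer HashEq
// above lets a set of raw pointers be probed with a const pointer or a
// smart pointer (ToPtr unwraps it), with no conversion at the call site.
// ---------------------------------------------------------------------------
#include <cassert>
#include <memory>
#include <parallel_hashmap/phmap.h>
int main() {
auto owned = std::make_unique<int>(42);
phmap::flat_hash_set<int*> s;
s.insert(owned.get());
const int* probe = owned.get();
assert(s.find(probe) != s.end());          // lookup via const T*
assert(s.find(owned) != s.end());          // lookup via unique_ptr<T>
}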
......@@ -4426,7 +4579,7 @@ struct HashtableDebugAccess<Set, typename std::enable_if<has_member_type_raw_has
auto seq = set.probe(hashval);
while (true) {
priv::Group g{set.ctrl_ + seq.offset()};
for (uint32_t i : g.Match(priv::H2(hashval))) {
for (uint32_t i : g.Match((h2_t)priv::H2(hashval))) {
if (Traits::apply(
typename Set::template EqualElement<typename Set::key_type>{
key, set.eq_ref()},
......@@ -4638,7 +4791,7 @@ public:
// hashing function and equality operator.
// * Contains a `capacity()` member function indicating the number of element
// slots (open, deleted, and empty) within the hash set.
// * Returns `void` from the `erase(iterator)` overload.
// * Returns `void` from the `_erase(iterator)` overload.
// -----------------------------------------------------------------------------
template <class T, class Hash, class Eq, class Alloc> // default values in phmap_fwd_decl.h
class node_hash_set
......@@ -4703,7 +4856,7 @@ public:
// hashing function and equality operator.
// * Contains a `capacity()` member function indicating the number of element
// slots (open, deleted, and empty) within the hash map.
// * Returns `void` from the `erase(iterator)` overload.
// * Returns `void` from the `_erase(iterator)` overload.
// -----------------------------------------------------------------------------
template <class Key, class Value, class Hash, class Eq, class Alloc> // default values in phmap_fwd_decl.h
class node_hash_map
......
......@@ -57,7 +57,6 @@ struct phmap_mix<4>
inline size_t operator()(size_t a) const
{
static constexpr uint64_t kmul = 0xcc9e2d51UL;
// static constexpr uint64_t kmul = 0x3B9ACB93UL; // [greg] my own random prime
uint64_t l = a * kmul;
return static_cast<size_t>(l ^ (l >> 32));
}
......@@ -71,7 +70,6 @@ struct phmap_mix<4>
inline size_t operator()(size_t a) const
{
static constexpr uint64_t k = 0xde5fb9d2630458e9ULL;
// static constexpr uint64_t k = 0x7C9D0BF0567102A5ULL; // [greg] my own random prime
uint64_t h;
uint64_t l = umul128(a, k, &h);
return static_cast<size_t>(h + l);
......@@ -164,16 +162,7 @@ struct Hash
return _hash<T>(val);
}
};
template <class T>
struct Hash<T *>
{
inline size_t operator()(const T *val) const noexcept
{
return static_cast<size_t>(reinterpret_cast<const uintptr_t>(val));
}
};
template<class ArgumentType, class ResultType>
struct phmap_unary_function
{
......@@ -286,6 +275,13 @@ struct Hash<double> : public phmap_unary_function<double, size_t>
#endif
#if defined(_MSC_VER)
# define PHMAP_HASH_ROTL32(x, r) _rotl(x,r)
#else
# define PHMAP_HASH_ROTL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
#endif
template <class H, int sz> struct Combiner
{
H operator()(H seed, size_t value);
......@@ -293,17 +289,49 @@ template <class H, int sz> struct Combiner
template <class H> struct Combiner<H, 4>
{
H operator()(H seed, size_t value)
H operator()(H h1, size_t k1)
{
return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2));
// Copyright 2005-2014 Daniel James.
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
k1 *= c1;
k1 = PHMAP_HASH_ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = PHMAP_HASH_ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
return h1;
}
};
template <class H> struct Combiner<H, 8>
{
H operator()(H seed, size_t value)
H operator()(H h, size_t k)
{
return seed ^ (value + size_t(0xc6a4a7935bd1e995) + (seed << 6) + (seed >> 2));
// Copyright 2005-2014 Daniel James.
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
const uint64_t m = (uint64_t(0xc6a4a793) << 32) + 0x5bd1e995;
const int r = 47;
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
// Completely arbitrary number, to prevent 0's
// from hashing to 0.
h += 0xe6546b64;
return h;
}
};
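// ---------------------------------------------------------------------------
// Standalone sketch of the 64-bit combine step above, pulled out of the
// Combiner template: a running seed absorbs one more hash per call, so the
// combination is order-dependent and avoids collapsing back to zero.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>
uint64_t combine64(uint64_t h, uint64_t k) {
const uint64_t m = (uint64_t(0xc6a4a793) << 32) + 0x5bd1e995;
const int r = 47;
k *= m; k ^= k >> r; k *= m;   // mix the incoming hash
h ^= k; h *= m;                // fold it into the seed
return h + 0xe6546b64;         // arbitrary constant: keeps 0 from hashing to 0
}
int main() {
uint64_t seed = 0;
seed = combine64(seed, 12345);     // e.g. hash of a first field
seed = combine64(seed, 67890);     // hash of a second field
std::printf("%016llx\n", (unsigned long long)seed);
}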
......@@ -347,21 +375,22 @@ struct Hash<std::pair<T1, T2>> {
template<class... T>
struct Hash<std::tuple<T...>> {
size_t operator()(std::tuple<T...> const& t) const noexcept {
return _hash_helper(t);
size_t seed = 0;
return _hash_helper(seed, t);
}
private:
template<size_t I = 0, class ...P>
typename std::enable_if<I == sizeof...(P), size_t>::type
_hash_helper(const std::tuple<P...> &) const noexcept { return 0; }
template<size_t I = 0, class TUP>
typename std::enable_if<I == std::tuple_size<TUP>::value, size_t>::type
_hash_helper(size_t seed, const TUP &) const noexcept { return seed; }
template<size_t I = 0, class ...P>
typename std::enable_if<I < sizeof...(P), size_t>::type
_hash_helper(const std::tuple<P...> &t) const noexcept {
template<size_t I = 0, class TUP>
typename std::enable_if<I < std::tuple_size<TUP>::value, size_t>::type
_hash_helper(size_t seed, const TUP &t) const noexcept {
const auto &el = std::get<I>(t);
using el_type = typename std::remove_cv<typename std::remove_reference<decltype(el)>::type>::type;
return Combiner<size_t, sizeof(size_t)>()(
phmap::Hash<el_type>()(el), _hash_helper<I + 1>(t));
seed = Combiner<size_t, sizeof(size_t)>()(seed, phmap::Hash<el_type>()(el));
return _hash_helper<I + 1>(seed, t);
}
};
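// ---------------------------------------------------------------------------
// Usage sketch (illustrative): with the seed-threading helper above,
// phmap::Hash hashes a whole std::tuple by folding each element's hash into
// the running seed via the size_t-wide Combiner.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <string>
#include <tuple>
#include <parallel_hashmap/phmap_utils.h>
int main() {
std::tuple<int, std::string> t{7, "seven"};
size_t h = phmap::Hash<std::tuple<int, std::string>>{}(t);
std::printf("%zu\n", h);
}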
......
......@@ -59,7 +59,6 @@ struct phmap_mix<4>
inline size_t operator()(size_t a) const
{
static constexpr uint64_t kmul = 0xcc9e2d51UL;
// static constexpr uint64_t kmul = 0x3B9ACB93UL; // [greg] my own random prime
uint64_t l = a * kmul;
return static_cast<size_t>(l ^ (l >> 32));
}
......@@ -73,7 +72,6 @@ struct phmap_mix<4>
inline size_t operator()(size_t a) const
{
static constexpr uint64_t k = 0xde5fb9d2630458e9ULL;
// static constexpr uint64_t k = 0x7C9D0BF0567102A5ULL; // [greg] my own random prime
uint64_t h;
uint64_t l = umul128(a, k, &h);
return static_cast<size_t>(h + l);
......@@ -166,16 +164,7 @@ struct Hash
return _hash<T>(val);
}
};
template <class T>
struct Hash<T *>
{
inline size_t operator()(const T *val) const noexcept
{
return static_cast<size_t>(reinterpret_cast<const uintptr_t>(val));
}
};
template<class ArgumentType, class ResultType>
struct phmap_unary_function
{
......@@ -288,6 +277,13 @@ struct Hash<double> : public phmap_unary_function<double, size_t>
#endif
#if defined(_MSC_VER)
# define PHMAP_HASH_ROTL32(x, r) _rotl(x,r)
#else
# define PHMAP_HASH_ROTL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
#endif
template <class H, int sz> struct Combiner
{
H operator()(H seed, size_t value);
......@@ -295,17 +291,49 @@ template <class H, int sz> struct Combiner
template <class H> struct Combiner<H, 4>
{
H operator()(H seed, size_t value)
H operator()(H h1, size_t k1)
{
return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2));
// Copyright 2005-2014 Daniel James.
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
k1 *= c1;
k1 = PHMAP_HASH_ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = PHMAP_HASH_ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
return h1;
}
};
template <class H> struct Combiner<H, 8>
{
H operator()(H seed, size_t value)
H operator()(H h, size_t k)
{
return seed ^ (value + size_t(0xc6a4a7935bd1e995) + (seed << 6) + (seed >> 2));
// Copyright 2005-2014 Daniel James.
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
const uint64_t m = (uint64_t(0xc6a4a793) << 32) + 0x5bd1e995;
const int r = 47;
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
// Completely arbitrary number, to prevent 0's
// from hashing to 0.
h += 0xe6546b64;
return h;
}
};
......@@ -349,21 +377,22 @@ struct Hash<std::pair<T1, T2>> {
template<class... T>
struct Hash<std::tuple<T...>> {
size_t operator()(std::tuple<T...> const& t) const noexcept {
return _hash_helper(t);
size_t seed = 0;
return _hash_helper(seed, t);
}
private:
template<size_t I = 0, class ...P>
typename std::enable_if<I == sizeof...(P), size_t>::type
_hash_helper(const std::tuple<P...> &) const noexcept { return 0; }
template<size_t I = 0, class TUP>
typename std::enable_if<I == std::tuple_size<TUP>::value, size_t>::type
_hash_helper(size_t seed, const TUP &) const noexcept { return seed; }
template<size_t I = 0, class ...P>
typename std::enable_if<I < sizeof...(P), size_t>::type
_hash_helper(const std::tuple<P...> &t) const noexcept {
template<size_t I = 0, class TUP>
typename std::enable_if<I < std::tuple_size<TUP>::value, size_t>::type
_hash_helper(size_t seed, const TUP &t) const noexcept {
const auto &el = std::get<I>(t);
using el_type = typename std::remove_cv<typename std::remove_reference<decltype(el)>::type>::type;
return Combiner<size_t, sizeof(size_t)>()(
phmap::Hash<el_type>()(el), _hash_helper<I + 1>(t));
seed = Combiner<size_t, sizeof(size_t)>()(seed, phmap::Hash<el_type>()(el));
return _hash_helper<I + 1>(seed, t);
}
};
......
# Python lldb formatters for parallel-hashmap
# tested with clang10 / lldb9 & 10
# to install it, type the following command or put it in $HOME/.lldbinit:
# command script import "PATH_TO_SCRIPT/lldb_phmap.py"
import lldb
import os
import sys
import re
_MAX_CHILDREN = 250
_MAX_CTRL_INDEX = 1_000
_MODULE_NAME = os.path.basename(__file__).split(".")[0]
def _get_function_name(instance=None):
"""Return the name of the calling function"""
class_name = f"{type(instance).__name__}." if instance else ""
return class_name + sys._getframe(1).f_code.co_name
class flat_map_slot_type:
CLASS_PATTERN = "^phmap::priv::raw_hash_set<phmap::priv::FlatHashMapPolicy.*>::slot_type$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = False
@staticmethod
def summary(valobj, _):
try:
valobj = valobj.GetChildMemberWithName('value')
first = valobj.GetChildMemberWithName('first').GetSummary()
if not first: first = "{...}"
second = valobj.GetChildMemberWithName('second').GetSummary()
if not second: second = "{...}"
return f"{{{first}, {second}}}"
except BaseException as ex:
print(f"{_get_function_name()} -> {ex}")
return ""
class node_map_slot_type:
CLASS_PATTERN = r"phmap::priv::raw_hash_set<phmap::priv::NodeHashMapPolicy.*>::slot_type$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = False
@staticmethod
def summary(valobj, _):
try:
valobj = valobj.Dereference()
first = valobj.GetChildMemberWithName('first').GetSummary()
if not first: first = "{...}"
second = valobj.GetChildMemberWithName('second').GetSummary()
if not second: second = "{...}"
return f"{{{first}, {second}}}"
except BaseException as ex:
print(f"{_get_function_name()} -> {ex}")
return "{?}"
class node_set_slot_type:
CLASS_PATTERN = r"phmap::priv::raw_hash_set<phmap::priv::NodeHashSetPolicy.*>::slot_type$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = False
@staticmethod
def summary(valobj, _):
try:
summary = valobj.Dereference().GetSummary()
if not summary: summary = "{...}"
return summary
except BaseException as ex:
print(f"{_get_function_name()} -> {ex}")
return "{?}"
class flat_hash_map_or_set:
CLASS_PATTERN = "^phmap::flat_hash_(map|set)<.*>$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = True
@staticmethod
def summary(valobj, _):
try:
valobj = valobj.GetNonSyntheticValue()
size = valobj.GetChildMemberWithName('size_').GetValueAsUnsigned()
capacity = valobj.GetChildMemberWithName('capacity_').GetValueAsUnsigned()
return f"size = {size} (capacity = {capacity})"
except BaseException as ex:
print(f"{_get_function_name()} -> {ex}")
return "{?}"
def __init__(self, valobj, _):
self.valobj = valobj
self.slots_ = self.slot_type = self.ctrl_ = None
self.size_ = self.capacity_ = self.slot_size = 0
def num_children(self):
return min(self.size_, _MAX_CHILDREN)
def has_children(self):
return True
def update(self):
try:
self.size_ = self.valobj.GetChildMemberWithName('size_').GetValueAsUnsigned()
self.capacity_ = self.valobj.GetChildMemberWithName('capacity_').GetValueAsUnsigned()
self.slots_ = self.valobj.GetChildMemberWithName("slots_")
self.slot_type = self.slots_.GetType().GetPointeeType()
self.slot_size = self.slot_type.GetByteSize()
self.ctrl_ = self.valobj.GetChildMemberWithName("ctrl_")
except BaseException as ex:
print(f"{_get_function_name(self)} -> {ex}")
def get_child_index(self, name):
try:
if name in ('size_', 'capacity_'):
return -1
return int(name.lstrip('[').rstrip(']'))
except:
return -1
def get_child_at_index(self, index):
try:
if index < 0:
return None
if index >= self.size_ or index >= _MAX_CHILDREN:
return None
real_idx = -1
for idx in range(min(self.capacity_ + 3, _MAX_CTRL_INDEX)):
ctrl = self.ctrl_.GetChildAtIndex(idx).GetValueAsSigned()
if ctrl >= -1:
real_idx += 1
if real_idx == index:
return self.slots_.CreateChildAtOffset(f'[{index}]', idx * self.slot_size, self.slot_type)
except BaseException as ex:
print(f"{_get_function_name(self)} -> {ex}")
return None
class parallel_flat_or_node_map_or_set:
CLASS_PATTERN = "^phmap::parallel_(flat|node)_hash_(map|set)<.*>$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = True
REGEX_EXTRACT_ARRAY_SIZE = re.compile(r"std::array\s*<.*,\s*(\d+)\s*>")
@staticmethod
def _get_size_and_capacity(valobj):
try:
valobj = valobj.GetNonSyntheticValue()
sets = valobj.GetChildMemberWithName('sets_')
# sets_ is a std::array<T, SIZE>.
# It's not possible to get the array size from the template parameters:
# "sets.GetType().GetTemplateArgumentType(1)" returns the "unsigned long" type but not its value,
# so we must extract it with a regex.
m = parallel_flat_or_node_map_or_set.REGEX_EXTRACT_ARRAY_SIZE.match(sets.GetType().GetName())
n_buckets = int(m.group(1))
# this is dependent on the implementation of the standard library
buckets = sets.GetChildMemberWithName('_M_elems')
size = capacity = 0
for idx in range(n_buckets):
bucket = buckets.GetChildAtIndex(idx).GetChildMemberWithName('set_')
size += bucket.GetChildMemberWithName('size_').GetValueAsUnsigned()
capacity += bucket.GetChildMemberWithName('capacity_').GetValueAsUnsigned()
return size, capacity, n_buckets
except:
return '?', '?', 0
@staticmethod
def summary(valobj, _):
size, capacity, _ = parallel_flat_or_node_map_or_set._get_size_and_capacity(valobj)
return f"size = {size} (capacity = {capacity})"
def __init__(self, valobj, _):
self.valobj = valobj
self.buckets = self.slot_type = None
self.size_ = self.capacity_ = self.n_buckets_ = self.slot_size = self.ctrl_size = 0
def num_children(self):
return min(self.size_, _MAX_CHILDREN)
def has_children(self):
return True
def update(self):
try:
self.size_, self.capacity_, self.n_buckets_ = self._get_size_and_capacity(self.valobj)
self.buckets = self.valobj.GetChildMemberWithName('sets_').GetChildMemberWithName('_M_elems')
bucket0 = self.buckets.GetChildAtIndex(0).GetChildMemberWithName('set_')
self.slot_type = bucket0.GetChildMemberWithName('slots_').GetType().GetPointeeType()
self.slot_size = self.slot_type.GetByteSize()
except BaseException as ex:
print(f"{_get_function_name(self)} -> {ex}")
def get_child_index(self, name):
try:
if name in ('sets_',):
return -1
return int(name.lstrip('[').rstrip(']'))
except:
return -1
def get_child_at_index(self, index):
try:
if index < 0:
return None
if index >= self.size_ or index >= _MAX_CHILDREN:
return None
real_idx = -1
total_idx = 0
for idx in range(self.n_buckets_):
bucket = self.buckets.GetChildAtIndex(idx).GetChildMemberWithName('set_')
size = bucket.GetChildMemberWithName("size_").GetValueAsUnsigned()
if size:
slots_ = bucket.GetChildMemberWithName("slots_")
ctrl_ = bucket.GetChildMemberWithName("ctrl_")
for jdx in range(size):
ctrl = ctrl_.GetChildAtIndex(jdx).GetValueAsSigned()
if ctrl >= -1:
real_idx += 1
if real_idx == index:
return slots_.CreateChildAtOffset(f'[{index}]', jdx * self.slot_size, self.slot_type)
total_idx += size
if total_idx > _MAX_CHILDREN:
return None
except BaseException as ex:
print(f"{_get_function_name(self)} -> {ex}")
return None
def __lldb_init_module(debugger, internal_dict):
for sp in (
flat_map_slot_type,
node_map_slot_type,
node_set_slot_type,
flat_hash_map_or_set,
parallel_flat_or_node_map_or_set,
):
if sp.HAS_SUMMARY:
debugger.HandleCommand(
f'type summary add --regex "{sp.CLASS_PATTERN}" --python-function {_MODULE_NAME}.{sp.__name__}.summary '
f'--category phmap --expand')
if sp.IS_SYNTHETIC_PROVIDER:
debugger.HandleCommand(
f'type synthetic add --regex "{sp.CLASS_PATTERN}" --python-class {_MODULE_NAME}.{sp.__name__} '
f'--category phmap')
debugger.HandleCommand('type category enable phmap')
# Python lldb formatters for parallel-hashmap
# tested with clang10 / lldb9 & 10
# to install it, type the following command or put it in $HOME/.lldbinit:
# command script import "PATH_TO_SCRIPT/lldb_phmap.py"
import lldb
import os
import sys
import re
_MAX_CHILDREN = 250
_MAX_CTRL_INDEX = 1_000
_MODULE_NAME = os.path.basename(__file__).split(".")[0]
def _get_function_name(instance=None):
"""Return the name of the calling function"""
class_name = f"{type(instance).__name__}." if instance else ""
return class_name + sys._getframe(1).f_code.co_name
class flat_map_slot_type:
CLASS_PATTERN = "^phmap::priv::raw_hash_set<phmap::priv::FlatHashMapPolicy.*>::slot_type$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = False
@staticmethod
def summary(valobj, _):
try:
valobj = valobj.GetChildMemberWithName('value')
first = valobj.GetChildMemberWithName('first').GetSummary()
if not first: first = "{...}"
second = valobj.GetChildMemberWithName('second').GetSummary()
if not second: second = "{...}"
return f"{{{first}, {second}}}"
except BaseException as ex:
print(f"{_get_function_name()} -> {ex}")
return ""
class node_map_slot_type:
CLASS_PATTERN = r"phmap::priv::raw_hash_set<phmap::priv::NodeHashMapPolicy.*>::slot_type$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = False
@staticmethod
def summary(valobj, _):
try:
valobj = valobj.Dereference()
first = valobj.GetChildMemberWithName('first').GetSummary()
if not first: first = "{...}"
second = valobj.GetChildMemberWithName('second').GetSummary()
if not second: second = "{...}"
return f"{{{first}, {second}}}"
except BaseException as ex:
print(f"{_get_function_name()} -> {ex}")
return "{?}"
class node_set_slot_type:
CLASS_PATTERN = r"phmap::priv::raw_hash_set<phmap::priv::NodeHashSetPolicy.*>::slot_type$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = False
@staticmethod
def summary(valobj, _):
try:
summary = valobj.Dereference().GetSummary()
if not summary: summary = "{...}"
return summary
except BaseException as ex:
print(f"{_get_function_name()} -> {ex}")
return "{?}"
class flat_hash_map_or_set:
CLASS_PATTERN = "^phmap::flat_hash_(map|set)<.*>$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = True
@staticmethod
def summary(valobj, _):
try:
valobj = valobj.GetNonSyntheticValue()
size = valobj.GetChildMemberWithName('size_').GetValueAsUnsigned()
capacity = valobj.GetChildMemberWithName('capacity_').GetValueAsUnsigned()
return f"size = {size} (capacity = {capacity})"
except BaseException as ex:
print(f"{_get_function_name()} -> {ex}")
return "{?}"
def __init__(self, valobj, _):
self.valobj = valobj
self.slots_ = self.slot_type = self.ctrl_ = None
self.size_ = self.capacity_ = self.slot_size = 0
def num_children(self):
return min(self.size_, _MAX_CHILDREN)
def has_children(self):
return True
def update(self):
try:
self.size_ = self.valobj.GetChildMemberWithName('size_').GetValueAsUnsigned()
self.capacity_ = self.valobj.GetChildMemberWithName('capacity_').GetValueAsUnsigned()
self.slots_ = self.valobj.GetChildMemberWithName("slots_")
self.slot_type = self.slots_.GetType().GetPointeeType()
self.slot_size = self.slot_type.GetByteSize()
self.ctrl_ = self.valobj.GetChildMemberWithName("ctrl_")
except BaseException as ex:
print(f"{_get_function_name(self)} -> {ex}")
def get_child_index(self, name):
try:
if name in ('size_', 'capacity_'):
return -1
return int(name.lstrip('[').rstrip(']'))
except:
return -1
def get_child_at_index(self, index):
try:
if index < 0:
return None
if index >= self.size_ or index >= _MAX_CHILDREN:
return None
real_idx = -1
for idx in range(min(self.capacity_ + 3, _MAX_CTRL_INDEX)):
ctrl = self.ctrl_.GetChildAtIndex(idx, True, True).GetValueAsSigned()
if ctrl >= -1:
real_idx += 1
if real_idx == index:
slot = self.slots_.CreateChildAtOffset('', idx * self.slot_size, self.slot_type)
if "MapPolicy" in slot.type.name:
val = slot.GetChildAtIndex(0, True, True)
else:
val = slot
return val.CreateChildAtOffset(f'[{index}]', 0, val.type)
except BaseException as ex:
print(f"{_get_function_name(self)} -> {ex}")
return None
class parallel_flat_or_node_map_or_set:
CLASS_PATTERN = "^phmap::parallel_(flat|node)_hash_(map|set)<.*>$"
HAS_SUMMARY = True
IS_SYNTHETIC_PROVIDER = True
REGEX_EXTRACT_ARRAY_SIZE = re.compile(r"std::array\s*<.*,\s*(\d+)\s*>")
@staticmethod
def _get_size_and_capacity(valobj):
try:
valobj = valobj.GetNonSyntheticValue()
sets = valobj.GetChildMemberWithName('sets_')
# sets_ is a std::array<T, SIZE>.
# It's not possible to get the array size from the template parameters:
# "sets.GetType().GetTemplateArgumentType(1)" returns the "unsigned long" type but not its value,
# so we must extract it with a regex.
m = parallel_flat_or_node_map_or_set.REGEX_EXTRACT_ARRAY_SIZE.match(sets.GetType().GetName())
n_buckets = int(m.group(1))
# this is dependent on the implementation of the standard library
buckets = sets.GetChildMemberWithName('_M_elems')
size = capacity = 0
for idx in range(n_buckets):
bucket = buckets.GetChildAtIndex(idx, True, True).GetChildMemberWithName('set_')
size += bucket.GetChildMemberWithName('size_').GetValueAsUnsigned()
capacity += bucket.GetChildMemberWithName('capacity_').GetValueAsUnsigned()
return size, capacity, n_buckets
except:
return '?', '?', 0
@staticmethod
def summary(valobj, _):
size, capacity, _ = parallel_flat_or_node_map_or_set._get_size_and_capacity(valobj)
return f"size = {size} (capacity = {capacity})"
def __init__(self, valobj, _):
self.valobj = valobj
self.buckets = self.slot_type = None
self.size_ = self.capacity_ = self.n_buckets_ = self.slot_size = self.ctrl_size = 0
def num_children(self):
return min(self.size_, _MAX_CHILDREN)
def has_children(self):
return True
def update(self):
try:
self.size_, self.capacity_, self.n_buckets_ = self._get_size_and_capacity(self.valobj)
self.buckets = self.valobj.GetChildMemberWithName('sets_').GetChildMemberWithName('_M_elems')
bucket0 = self.buckets.GetChildAtIndex(0).GetChildMemberWithName('set_')
self.slot_type = bucket0.GetChildMemberWithName('slots_').GetType().GetPointeeType()
self.slot_size = self.slot_type.GetByteSize()
except BaseException as ex:
print(f"{_get_function_name(self)} -> {ex}")
def get_child_index(self, name):
try:
if name in ('sets_',):
return -1
return int(name.lstrip('[').rstrip(']'))
except:
return -1
def get_child_at_index(self, index):
try:
if index < 0:
return None
if index >= self.size_ or index >= _MAX_CHILDREN:
return None
real_idx = -1
total_idx = 0
for idx in range(self.n_buckets_):
bucket = self.buckets.GetChildAtIndex(idx, True, True).GetChildMemberWithName('set_')
size = bucket.GetChildMemberWithName("size_").GetValueAsUnsigned()
if size:
slots_ = bucket.GetChildMemberWithName("slots_")
ctrl_ = bucket.GetChildMemberWithName("ctrl_")
for jdx in range(size):
ctrl = ctrl_.GetChildAtIndex(jdx, True, True).GetValueAsSigned()
if ctrl >= -1:
real_idx += 1
if real_idx == index:
slot = slots_.CreateChildAtOffset(f'[{index}]', jdx * self.slot_size, self.slot_type)
if "MapPolicy" in slot.type.name:
val = slot.GetChildAtIndex(0, True, True)
else:
val = slot
return val.CreateChildAtOffset(f'[{index}]', 0, val.type)
total_idx += size
if total_idx > _MAX_CHILDREN:
return None
except BaseException as ex:
print(f"{_get_function_name(self)} -> {ex}")
return None
def __lldb_init_module(debugger, internal_dict):
for sp in (
flat_map_slot_type,
node_map_slot_type,
node_set_slot_type,
flat_hash_map_or_set,
parallel_flat_or_node_map_or_set,
):
if sp.HAS_SUMMARY:
debugger.HandleCommand(
f'type summary add --regex "{sp.CLASS_PATTERN}" --python-function {_MODULE_NAME}.{sp.__name__}.summary '
f'--category phmap --expand')
if sp.IS_SYNTHETIC_PROVIDER:
debugger.HandleCommand(
f'type synthetic add --regex "{sp.CLASS_PATTERN}" --python-class {_MODULE_NAME}.{sp.__name__} '
f'--category phmap')
debugger.HandleCommand('type category enable phmap')
......@@ -48,7 +48,7 @@ namespace phmap {
size_t BaseCountedInstance::num_comparisons_ = 0;
} // namespace test_internal
} // namespace phmap\
} // namespace phmap
static const size_t test_values = 10000;
......@@ -418,14 +418,14 @@ namespace {
};
template <typename T, typename V>
void DoTest(const char *name, T *b, const std::vector<V> &values) {
void DoTest(const char *, T *b, const std::vector<V> &values) {
typename KeyOfValue<typename T::key_type, V>::type key_of_value;
T &mutable_b = *b;
const T &const_b = *b;
// Test insert.
for (int i = 0; i < values.size(); ++i) {
for (size_t i = 0; i < values.size(); ++i) {
mutable_b.insert(values[i]);
mutable_b.value_check(values[i]);
}
......@@ -436,14 +436,14 @@ namespace {
// Test copy constructor.
T b_copy(const_b);
EXPECT_EQ(b_copy.size(), const_b.size());
for (int i = 0; i < values.size(); ++i) {
for (size_t i = 0; i < values.size(); ++i) {
CheckPairEquals(*b_copy.find(key_of_value(values[i])), values[i]);
}
// Test range constructor.
T b_range(const_b.begin(), const_b.end());
EXPECT_EQ(b_range.size(), const_b.size());
for (int i = 0; i < values.size(); ++i) {
for (size_t i = 0; i < values.size(); ++i) {
CheckPairEquals(*b_range.find(key_of_value(values[i])), values[i]);
}
......@@ -455,7 +455,7 @@ namespace {
b_range.clear();
b_range.insert(b_copy.begin(), b_copy.end());
EXPECT_EQ(b_range.size(), b_copy.size());
for (int i = 0; i < values.size(); ++i) {
for (size_t i = 0; i < values.size(); ++i) {
CheckPairEquals(*b_range.find(key_of_value(values[i])), values[i]);
}
......@@ -473,7 +473,7 @@ namespace {
b_range.swap(b_copy);
EXPECT_EQ(b_copy.size(), 0);
EXPECT_EQ(b_range.size(), const_b.size());
for (int i = 0; i < values.size(); ++i) {
for (size_t i = 0; i < values.size(); ++i) {
CheckPairEquals(*b_range.find(key_of_value(values[i])), values[i]);
}
b_range.swap(b_copy);
......@@ -482,13 +482,13 @@ namespace {
swap(b_range, b_copy);
EXPECT_EQ(b_copy.size(), 0);
EXPECT_EQ(b_range.size(), const_b.size());
for (int i = 0; i < values.size(); ++i) {
for (size_t i = 0; i < values.size(); ++i) {
CheckPairEquals(*b_range.find(key_of_value(values[i])), values[i]);
}
swap(b_range, b_copy);
// Test erase via values.
for (int i = 0; i < values.size(); ++i) {
for (size_t i = 0; i < values.size(); ++i) {
mutable_b.erase(key_of_value(values[i]));
// Erasing a non-existent key should have no effect.
ASSERT_EQ(mutable_b.erase(key_of_value(values[i])), 0);
......@@ -499,7 +499,7 @@ namespace {
// Test erase via iterators.
mutable_b = b_copy;
for (int i = 0; i < values.size(); ++i) {
for (size_t i = 0; i < values.size(); ++i) {
mutable_b.erase(mutable_b.find(key_of_value(values[i])));
}
......@@ -507,7 +507,7 @@ namespace {
EXPECT_EQ(const_b.size(), 0);
// Test insert with hint.
for (int i = 0; i < values.size(); i++) {
for (size_t i = 0; i < values.size(); i++) {
mutable_b.insert(mutable_b.upper_bound(key_of_value(values[i])), values[i]);
}
......@@ -521,7 +521,7 @@ namespace {
// First half.
mutable_b = b_copy;
typename T::iterator mutable_iter_end = mutable_b.begin();
for (int i = 0; i < values.size() / 2; ++i) ++mutable_iter_end;
for (size_t i = 0; i < values.size() / 2; ++i) ++mutable_iter_end;
mutable_b.erase(mutable_b.begin(), mutable_iter_end);
EXPECT_EQ(mutable_b.size(), values.size() - values.size() / 2);
const_b.verify();
......@@ -529,7 +529,7 @@ namespace {
// Second half.
mutable_b = b_copy;
typename T::iterator mutable_iter_begin = mutable_b.begin();
for (int i = 0; i < values.size() / 2; ++i) ++mutable_iter_begin;
for (size_t i = 0; i < values.size() / 2; ++i) ++mutable_iter_begin;
mutable_b.erase(mutable_iter_begin, mutable_b.end());
EXPECT_EQ(mutable_b.size(), values.size() / 2);
const_b.verify();
......@@ -537,9 +537,9 @@ namespace {
// Second quarter.
mutable_b = b_copy;
mutable_iter_begin = mutable_b.begin();
for (int i = 0; i < values.size() / 4; ++i) ++mutable_iter_begin;
for (size_t i = 0; i < values.size() / 4; ++i) ++mutable_iter_begin;
mutable_iter_end = mutable_iter_begin;
for (int i = 0; i < values.size() / 4; ++i) ++mutable_iter_end;
for (size_t i = 0; i < values.size() / 4; ++i) ++mutable_iter_end;
mutable_b.erase(mutable_iter_begin, mutable_iter_end);
EXPECT_EQ(mutable_b.size(), values.size() - values.size() / 4);
const_b.verify();
......@@ -606,7 +606,7 @@ namespace {
using V = typename remove_pair_const<typename T::value_type>::type;
const std::vector<V> random_values = GenerateValuesWithSeed<V>(
test_values, 4 * test_values,
testing::GTEST_FLAG(random_seed));
GTEST_FLAG_GET(random_seed));
unique_checker<T, C> container;
......@@ -630,7 +630,7 @@ namespace {
using V = typename remove_pair_const<typename T::value_type>::type;
const std::vector<V> random_values = GenerateValuesWithSeed<V>(
test_values, 4 * test_values,
testing::GTEST_FLAG(random_seed));
GTEST_FLAG_GET(random_seed));
multi_checker<T, C> container;
......@@ -799,9 +799,6 @@ namespace {
template <typename K, int N = 256>
void SetTest() {
EXPECT_EQ(
sizeof(phmap::btree_set<K>),
2 * sizeof(void *) + sizeof(typename phmap::btree_set<K>::size_type));
using BtreeSet = phmap::btree_set<K>;
using CountingBtreeSet =
phmap::btree_set<K, std::less<K>, PropagatingCountingAlloc<K>>;
......@@ -811,9 +808,6 @@ namespace {
template <typename K, int N = 256>
void MapTest() {
EXPECT_EQ(
sizeof(phmap::btree_map<K, K>),
2 * sizeof(void *) + sizeof(typename phmap::btree_map<K, K>::size_type));
using BtreeMap = phmap::btree_map<K, K>;
using CountingBtreeMap =
phmap::btree_map<K, K, std::less<K>,
......@@ -834,9 +828,6 @@ namespace {
template <typename K, int N = 256>
void MultiSetTest() {
EXPECT_EQ(
sizeof(phmap::btree_multiset<K>),
2 * sizeof(void *) + sizeof(typename phmap::btree_multiset<K>::size_type));
using BtreeMSet = phmap::btree_multiset<K>;
using CountingBtreeMSet =
phmap::btree_multiset<K, std::less<K>, PropagatingCountingAlloc<K>>;
......@@ -846,9 +837,6 @@ namespace {
template <typename K, int N = 256>
void MultiMapTest() {
EXPECT_EQ(sizeof(phmap::btree_multimap<K, K>),
2 * sizeof(void *) +
sizeof(typename phmap::btree_multimap<K, K>::size_type));
using BtreeMMap = phmap::btree_multimap<K, K>;
using CountingBtreeMMap =
phmap::btree_multimap<K, K, std::less<K>,
......@@ -1862,7 +1850,7 @@ namespace {
while (s.size() < kSize) {
s.insert(MovableOnlyInstance(s.size()));
}
for (int i = 0; i < kSize; ++i) {
for (size_t i = 0; i < kSize; ++i) {
// Extract with key
auto nh = s.extract(MovableOnlyInstance(i));
EXPECT_EQ(s.size(), kSize - 1);
......@@ -1895,7 +1883,7 @@ namespace {
m.insert(
{CopyableMovableInstance(m.size()), MovableOnlyInstance(m.size())});
}
for (int i = 0; i < kSize; ++i) {
for (size_t i = 0; i < kSize; ++i) {
// Extract with key
auto nh = m.extract(CopyableMovableInstance(i));
EXPECT_EQ(m.size(), kSize - 1);
......
......@@ -391,7 +391,7 @@ namespace priv {
};
// Generate n values for our tests and benchmarks. Value range is [0, maxval].
inline std::vector<int> GenerateNumbersWithSeed(int n, int maxval, int seed) {
inline std::vector<int> GenerateNumbersWithSeed(size_t n, int maxval, int seed) {
// NOTE: Some tests rely on generated numbers not changing between test runs.
// We use std::minstd_rand0 because it is well-defined, but don't use
// std::uniform_int_distribution because platforms use different algorithms.
......@@ -400,7 +400,7 @@ namespace priv {
std::vector<int> values;
phmap::flat_hash_set<int> unique_values;
if (values.size() < n) {
for (size_t i = values.size(); i < (size_t)n; i++) {
for (size_t i = values.size(); i < n; i++) {
int value;
do {
value = static_cast<int>(rng()) % (maxval + 1);
......@@ -414,13 +414,13 @@ namespace priv {
// Generates n values in the range [0, maxval].
template <typename V>
std::vector<V> GenerateValuesWithSeed(int n, int maxval, int seed) {
std::vector<V> GenerateValuesWithSeed(size_t n, int maxval, int seed) {
const std::vector<int> nums = GenerateNumbersWithSeed(n, maxval, seed);
Generator<V> gen(maxval);
std::vector<V> vec;
vec.reserve(n);
for (int i = 0; i < n; i++) {
for (size_t i = 0; i < n; i++) {
vec.push_back(gen(nums[i]));
}
......
// Copyright 2018 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "parallel_hashmap/phmap.h"
#include <memory>
#include <string>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace phmap {
namespace priv {
namespace {
enum class CallType { kConstRef, kConstMove };
template <int>
struct Empty {
constexpr CallType value() const& { return CallType::kConstRef; }
constexpr CallType value() const&& { return CallType::kConstMove; }
};
template <typename T>
struct NotEmpty {
T value;
};
template <typename T, typename U>
struct TwoValues {
T value1;
U value2;
};
TEST(CompressedTupleTest, Sizeof) {
EXPECT_EQ(sizeof(int), sizeof(CompressedTuple<int>));
EXPECT_EQ(sizeof(int), sizeof(CompressedTuple<int, Empty<0>>));
EXPECT_EQ(sizeof(int), sizeof(CompressedTuple<int, Empty<0>, Empty<1>>));
EXPECT_EQ(sizeof(int),
sizeof(CompressedTuple<int, Empty<0>, Empty<1>, Empty<2>>));
EXPECT_EQ(sizeof(TwoValues<int, double>),
sizeof(CompressedTuple<int, NotEmpty<double>>));
EXPECT_EQ(sizeof(TwoValues<int, double>),
sizeof(CompressedTuple<int, Empty<0>, NotEmpty<double>>));
EXPECT_EQ(sizeof(TwoValues<int, double>),
sizeof(CompressedTuple<int, Empty<0>, NotEmpty<double>, Empty<1>>));
}
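// ---------------------------------------------------------------------------
// Minimal sketch (not phmap code) of the empty-base optimization the Sizeof
// expectations above rely on: an empty type stored as a base class adds no
// storage, while the same type stored as a member still occupies padded space.
// ---------------------------------------------------------------------------
#include <cstdio>
struct Stateless {};                           // e.g. an empty hasher
struct AsMember { int v; Stateless s; };       // member: typically sizeof == 8
struct AsBase : Stateless { int v; };          // base:   typically sizeof == 4
int main() {
std::printf("%zu %zu\n", sizeof(AsMember), sizeof(AsBase));
}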
TEST(CompressedTupleTest, Access) {
struct S {
std::string x;
};
CompressedTuple<int, Empty<0>, S> x(7, {}, S{"ABC"});
EXPECT_EQ(sizeof(x), sizeof(TwoValues<int, S>));
EXPECT_EQ(7, x.get<0>());
EXPECT_EQ("ABC", x.get<2>().x);
}
TEST(CompressedTupleTest, NonClasses) {
CompressedTuple<int, const char*> x(7, "ABC");
EXPECT_EQ(7, x.get<0>());
EXPECT_STREQ("ABC", x.get<1>());
}
TEST(CompressedTupleTest, MixClassAndNonClass) {
CompressedTuple<int, const char*, Empty<0>, NotEmpty<double>> x(7, "ABC", {},
{1.25});
struct Mock {
int v;
const char* p;
double d;
};
EXPECT_EQ(sizeof(x), sizeof(Mock));
EXPECT_EQ(7, x.get<0>());
EXPECT_STREQ("ABC", x.get<1>());
EXPECT_EQ(1.25, x.get<3>().value);
}
TEST(CompressedTupleTest, Nested) {
CompressedTuple<int, CompressedTuple<int>,
CompressedTuple<int, CompressedTuple<int>>>
x(1, CompressedTuple<int>(2),
CompressedTuple<int, CompressedTuple<int>>(3, CompressedTuple<int>(4)));
EXPECT_EQ(1, x.get<0>());
EXPECT_EQ(2, x.get<1>().get<0>());
EXPECT_EQ(3, x.get<2>().get<0>());
EXPECT_EQ(4, x.get<2>().get<1>().get<0>());
CompressedTuple<Empty<0>, Empty<0>,
CompressedTuple<Empty<0>, CompressedTuple<Empty<0>>>>
y;
std::set<Empty<0>*> empties{&y.get<0>(), &y.get<1>(), &y.get<2>().get<0>(),
&y.get<2>().get<1>().get<0>()};
#ifdef _MSC_VER
// MSVC has a bug where many instances of the same base class are laid out
// at the same address when using __declspec(empty_bases).
// This will be fixed in a future version of MSVC.
int expected = 1;
#else
int expected = 4;
#endif
EXPECT_EQ(expected, sizeof(y));
EXPECT_EQ(expected, empties.size());
EXPECT_EQ(sizeof(y), sizeof(Empty<0>) * empties.size());
EXPECT_EQ(4 * sizeof(char),
sizeof(CompressedTuple<CompressedTuple<char, char>,
CompressedTuple<char, char>>));
EXPECT_TRUE(
(std::is_empty<CompressedTuple<CompressedTuple<Empty<0>>,
CompressedTuple<Empty<1>>>>::value));
}
TEST(CompressedTupleTest, Reference) {
int i = 7;
std::string s = "Very long std::string that goes in the heap";
CompressedTuple<int, int&, std::string, std::string&> x(i, i, s, s);
// Sanity check. We should not have moved from `s`
EXPECT_EQ(s, "Very long std::string that goes in the heap");
EXPECT_EQ(x.get<0>(), x.get<1>());
EXPECT_NE(&x.get<0>(), &x.get<1>());
EXPECT_EQ(&x.get<1>(), &i);
EXPECT_EQ(x.get<2>(), x.get<3>());
EXPECT_NE(&x.get<2>(), &x.get<3>());
EXPECT_EQ(&x.get<3>(), &s);
}
TEST(CompressedTupleTest, NoElements) {
CompressedTuple<> x;
static_cast<void>(x); // Silence -Wunused-variable.
EXPECT_TRUE(std::is_empty<CompressedTuple<>>::value);
}
TEST(CompressedTupleTest, MoveOnlyElements) {
CompressedTuple<std::unique_ptr<std::string>> str_tup(
phmap::make_unique<std::string>("str"));
CompressedTuple<CompressedTuple<std::unique_ptr<std::string>>,
std::unique_ptr<int>>
x(std::move(str_tup), phmap::make_unique<int>(5));
EXPECT_EQ(*x.get<0>().get<0>(), "str");
EXPECT_EQ(*x.get<1>(), 5);
std::unique_ptr<std::string> x0 = std::move(x.get<0>()).get<0>();
std::unique_ptr<int> x1 = std::move(x).get<1>();
EXPECT_EQ(*x0, "str");
EXPECT_EQ(*x1, 5);
}
TEST(CompressedTupleTest, Constexpr) {
constexpr CompressedTuple<int, double, CompressedTuple<int>, Empty<0>> x(
7, 1.25, CompressedTuple<int>(5), {});
constexpr int x0 = x.get<0>();
constexpr double x1 = x.get<1>();
constexpr int x2 = x.get<2>().get<0>();
constexpr CallType x3 = x.get<3>().value();
EXPECT_EQ(x0, 7);
EXPECT_EQ(x1, 1.25);
EXPECT_EQ(x2, 5);
EXPECT_EQ(x3, CallType::kConstRef);
#if defined(__clang__)
// An apparent bug in earlier versions of gcc claims these are ambiguous.
constexpr int x2m = std::move(x.get<2>()).get<0>();
constexpr CallType x3m = std::move(x).get<3>().value();
EXPECT_EQ(x2m, 5);
EXPECT_EQ(x3m, CallType::kConstMove);
#endif
}
#if defined(__clang__) || defined(__GNUC__)
TEST(CompressedTupleTest, EmptyFinalClass) {
struct S final {
int f() const { return 5; }
};
CompressedTuple<S> x;
EXPECT_EQ(x.get<0>().f(), 5);
}
#endif
} // namespace
} // namespace priv
} // namespace phmap
......@@ -22,6 +22,10 @@
#define THIS_EXTRA_TPL_PARAMS
#endif
#ifndef THIS_EXTRA_TPL_PARAMS_NULLMUTEX
#define THIS_EXTRA_TPL_PARAMS_NULLMUTEX
#endif
#include "parallel_hashmap/phmap.h"
#if defined(PHMAP_HAVE_STD_ANY)
......@@ -62,6 +66,12 @@ template <class K, class V, class H = phmap::priv::hash_default_hash<K>,
class Alloc = phmap::priv::Allocator<
phmap::priv::Pair<const K, V>>>
using ThisMap = THIS_HASH_MAP<K, V, H, Eq, Alloc THIS_EXTRA_TPL_PARAMS>;
template <class K, class V, class H = phmap::priv::hash_default_hash<K>,
class Eq = phmap::priv::hash_default_eq<K>,
class Alloc = phmap::priv::Allocator<
phmap::priv::Pair<const K, V>>>
using ThisMap_NullMutex = THIS_HASH_MAP<K, V, H, Eq, Alloc THIS_EXTRA_TPL_PARAMS_NULLMUTEX>;
static_assert(!std::is_standard_layout<NonStandardLayout>(), "");
......
......@@ -9,4 +9,6 @@
#define THIS_EXTRA_TPL_PARAMS , 4, boost::upgrade_mutex
#endif
#define THIS_EXTRA_TPL_PARAMS_NULLMUTEX , 4, phmap::NullMutex
#include "parallel_hash_map_test.cc"
......@@ -9,6 +9,24 @@ namespace phmap {
namespace priv {
namespace {
TEST(THIS_TEST_NAME, Swap) {
using Map = ThisMap<int, int>;
using MapB = ThisMap_NullMutex<int, int>;
Map t;
EXPECT_TRUE(t.find(0) == t.end());
auto res = t.emplace(0, 1);
EXPECT_TRUE(res.second);
EXPECT_EQ(1, t.size());
MapB u;
t.swap(u);
EXPECT_EQ(0, t.size());
EXPECT_EQ(1, u.size());
EXPECT_TRUE(t.find(0) == t.end());
EXPECT_TRUE(u[0] == 1);
}
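// ---------------------------------------------------------------------------
// Illustrative sketch: spelled-out template arguments for a parallel map,
// matching the ThisMap/ThisMap_NullMutex aliases above. The last two
// parameters select the submap count (2^N) and the per-submap mutex;
// phmap::NullMutex disables locking, std::mutex makes the map thread-safe.
// ---------------------------------------------------------------------------
#include <mutex>
#include <string>
#include <parallel_hashmap/phmap.h>
using ThreadSafeMap = phmap::parallel_flat_hash_map<
int, std::string,
phmap::priv::hash_default_hash<int>,
phmap::priv::hash_default_eq<int>,
phmap::priv::Allocator<phmap::priv::Pair<const int, std::string>>,
4,             // 2^4 = 16 submaps
std::mutex>;   // lock guarding each submap
int main() {
ThreadSafeMap m;
m.try_emplace(1, "one");
}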
TEST(THIS_TEST_NAME, IfContains) {
// ----------------
// test if_contains
......@@ -116,6 +134,17 @@ TEST(THIS_TEST_NAME, ForEach) {
EXPECT_EQ(pair.first + 7, pair.second);
});
EXPECT_EQ(counter, 3);
counter = 0;
for (size_t i=0; i<m.subcnt(); ++i) {
m.with_submap(i, [&](const Map::EmbeddedSet& set) {
for (auto& p : set) {
++counter;
EXPECT_EQ(p.first + 7, p.second);
}
});
}
EXPECT_EQ(counter, 3);
}
TEST(THIS_TEST_NAME, EmplaceSingle) {
......
......@@ -67,7 +67,7 @@ class CheckedAlloc {
std::integral_constant<bool, (Spec & kPropagateOnSwap) != 0>;
CheckedAlloc select_on_container_copy_construction() const {
if (Spec & kPropagateOnCopy) return *this;
PHMAP_IF_CONSTEXPR (Spec & kPropagateOnCopy) return *this;
return {};
}
......@@ -129,6 +129,7 @@ struct Policy {
using slot_type = Tracked<int32_t>;
using init_type = Tracked<int32_t>;
using key_type = int32_t;
using is_flat = std::false_type;
template <class allocator_type, class... Args>
static void construct(allocator_type* alloc, slot_type* slot,
......
......@@ -138,27 +138,27 @@ TEST(BitMask, WithShift) {
EXPECT_EQ(0x0000000080800000, mask);
BitMask<uint64_t, 8, 3> b(mask);
EXPECT_EQ(*b, 2);
EXPECT_EQ(*b, 2u);
}
TEST(BitMask, LeadingTrailing) {
EXPECT_EQ((BitMask<uint32_t, 16>(0x00001a40).LeadingZeros()), 3);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00001a40).TrailingZeros()), 6);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00001a40).LeadingZeros()), 3u);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00001a40).TrailingZeros()), 6u);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00000001).LeadingZeros()), 15);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00000001).TrailingZeros()), 0);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00000001).LeadingZeros()), 15u);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00000001).TrailingZeros()), 0u);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00008000).LeadingZeros()), 0);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00008000).TrailingZeros()), 15);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00008000).LeadingZeros()), 0u);
EXPECT_EQ((BitMask<uint32_t, 16>(0x00008000).TrailingZeros()), 15u);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x0000008080808000).LeadingZeros()), 3);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x0000008080808000).TrailingZeros()), 1);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x0000008080808000).LeadingZeros()), 3u);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x0000008080808000).TrailingZeros()), 1u);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x0000000000000080).LeadingZeros()), 7);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x0000000000000080).TrailingZeros()), 0);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x0000000000000080).LeadingZeros()), 7u);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x0000000000000080).TrailingZeros()), 0u);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x8000000000000000).LeadingZeros()), 0);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x8000000000000000).TrailingZeros()), 7);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x8000000000000000).LeadingZeros()), 0u);
EXPECT_EQ((BitMask<uint64_t, 8, 3>(0x8000000000000000).TrailingZeros()), 7u);
}
TEST(Group, EmptyGroup) {
......@@ -259,6 +259,7 @@ struct IntPolicy {
using slot_type = int64_t;
using key_type = int64_t;
using init_type = int64_t;
using is_flat = std::false_type;
static void construct(void*, int64_t* slot, int64_t v) { *slot = v; }
static void destroy(void*, int64_t*) {}
......@@ -301,6 +302,7 @@ class StringPolicy {
using key_type = std::string;
using init_type = std::pair<std::string, std::string>;
using is_flat = std::false_type;
template <class allocator_type, class... Args>
static void construct(allocator_type* alloc, slot_type* slot, Args... args) {
......@@ -389,40 +391,6 @@ struct BadTable : raw_hash_set<IntPolicy, BadFastHash, std::equal_to<int>,
using Base::Base;
};
#if PHMAP_HAVE_STD_STRING_VIEW
TEST(Table, EmptyFunctorOptimization) {
static_assert(std::is_empty<std::equal_to<std::string_view>>::value, "");
static_assert(std::is_empty<std::allocator<int>>::value, "");
struct MockTable {
void* ctrl;
void* slots;
size_t size;
size_t capacity;
size_t growth_left;
void* infoz;
};
struct StatelessHash {
size_t operator()(std::string_view) const { return 0; }
};
struct StatefulHash : StatelessHash {
size_t dummy;
};
EXPECT_EQ(
sizeof(MockTable),
sizeof(
raw_hash_set<StringPolicy, StatelessHash,
std::equal_to<std::string_view>, std::allocator<int>>));
EXPECT_EQ(
sizeof(MockTable) + sizeof(StatefulHash),
sizeof(
raw_hash_set<StringPolicy, StatefulHash,
std::equal_to<std::string_view>, std::allocator<int>>));
}
#endif
TEST(Table, Empty) {
IntTable t;
EXPECT_EQ(0, t.size());
......@@ -632,6 +600,7 @@ struct DecomposePolicy {
using slot_type = DecomposeType;
using key_type = DecomposeType;
using init_type = DecomposeType;
using is_flat = std::false_type;
template <typename T>
static void construct(void*, DecomposeType* slot, T&& v) {
......
Metadata-Version: 2.1
Name: torch-sparse
Version: 0.6.13
Version: 0.6.18
Summary: PyTorch Extension Library of Optimized Autograd Sparse Matrix Operations
Home-page: https://github.com/rusty1s/pytorch_sparse
Download-URL: https://github.com/rusty1s/pytorch_sparse/archive/0.6.13.tar.gz
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: UNKNOWN
Download-URL: https://github.com/rusty1s/pytorch_sparse/archive/0.6.18.tar.gz
Description: [pypi-image]: https://badge.fury.io/py/torch-sparse.svg
[pypi-url]: https://pypi.python.org/pypi/torch-sparse
[testing-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_sparse/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_sparse?branch=master
# PyTorch Sparse
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This package consists of a small extension library of optimized sparse matrix operations with autograd support.
This package currently consists of the following methods:
* **[Coalesce](#coalesce)**
* **[Transpose](#transpose)**
* **[Sparse Dense Matrix Multiplication](#sparse-dense-matrix-multiplication)**
* **[Sparse Sparse Matrix Multiplication](#sparse-sparse-matrix-multiplication)**
All included operations work on varying data types and are implemented both for CPU and GPU.
To avoid the hassle of creating [`torch.sparse_coo_tensor`](https://pytorch.org/docs/stable/torch.html?highlight=sparse_coo_tensor#torch.sparse_coo_tensor), this package defines operations on sparse tensors by simply passing `index` and `value` tensors as arguments ([with the same shapes as defined in PyTorch](https://pytorch.org/docs/stable/sparse.html)).
Note that only `value` comes with autograd support, as `index` is discrete and therefore not differentiable.
## Installation
### Anaconda
**Update:** You can now install `pytorch-sparse` via [Anaconda](https://anaconda.org/pyg/pytorch-sparse) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-sparse -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 2.2
To install the binaries for PyTorch 2.2.0, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.2.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu118`, or `cu121` depending on your PyTorch installation.
| | `cpu` | `cu118` | `cu121` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
#### PyTorch 2.1
To install the binaries for PyTorch 2.1.0, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu118`, or `cu121` depending on your PyTorch installation.
| | `cpu` | `cu118` | `cu121` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1, PyTorch 1.9.0, PyTorch 1.10.0/1.10.1/1.10.2, PyTorch 1.11.0, PyTorch 1.12.0/1.12.1, PyTorch 1.13.0/1.13.1, and PyTorch 2.0.0 (following the same procedure).
For older versions, you need to explicitly specify the latest supported version number or install via `pip install --no-index` in order to prevent a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
Ensure that at least PyTorch 1.7.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.7.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
If you want to additionally build `torch-sparse` with METIS support, *e.g.* for partitioning, please download and install the [METIS library](https://web.archive.org/web/20211119110155/http://glaros.dtc.umn.edu/gkhome/metis/metis/download) by following the instructions in the `Install.txt` file.
Note that METIS needs to be installed with 64 bit `IDXTYPEWIDTH` by changing `include/metis.h`.
Afterwards, set the environment variable `WITH_METIS=1`.
Then run:
```
pip install torch-scatter torch-sparse
```
When running in a docker container without an NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Functions
### Coalesce
```
torch_sparse.coalesce(index, value, m, n, op="add") -> (torch.LongTensor, torch.Tensor)
```
Row-wise sorts `index` and removes duplicate entries.
Duplicate entries are removed by scattering them together.
For scattering, any operation of [`torch_scatter`](https://github.com/rusty1s/pytorch_scatter) can be used.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **op** *(string, optional)* - The scatter operation to use. (default: `"add"`)
#### Returns
* **index** *(LongTensor)* - The coalesced index tensor of sparse matrix.
* **value** *(Tensor)* - The coalesced value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import coalesce
index = torch.tensor([[1, 0, 1, 0, 2, 1],
[0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = coalesce(index, value, m=3, n=2)
```
```
print(index)
tensor([[0, 1, 1, 2],
[1, 0, 1, 0]])
print(value)
tensor([[6.0, 8.0],
[7.0, 9.0],
[3.0, 4.0],
[5.0, 6.0]])
```
### Transpose
```
torch_sparse.transpose(index, value, m, n) -> (torch.LongTensor, torch.Tensor)
```
Transposes dimensions 0 and 1 of a sparse matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **coalesced** *(bool, optional)* - If set to `False`, will not coalesce the output. (default: `True`)
#### Returns
* **index** *(LongTensor)* - The transposed index tensor of sparse matrix.
* **value** *(Tensor)* - The transposed value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import transpose
index = torch.tensor([[1, 0, 1, 0, 2, 1],
[0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = transpose(index, value, 3, 2)
```
```
print(index)
tensor([[0, 0, 1, 1],
[1, 2, 0, 1]])
print(value)
tensor([[7.0, 9.0],
[5.0, 6.0],
[6.0, 8.0],
[3.0, 4.0]])
```
### Sparse Dense Matrix Multiplication
```
torch_sparse.spmm(index, value, m, n, matrix) -> torch.Tensor
```
Matrix product of a sparse matrix with a dense matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **matrix** *(Tensor)* - The dense matrix.
#### Returns
* **out** *(Tensor)* - The dense output matrix.
#### Example
```python
import torch
from torch_sparse import spmm
index = torch.tensor([[0, 0, 1, 2, 2],
[0, 2, 1, 0, 1]])
value = torch.Tensor([1, 2, 4, 1, 3])
matrix = torch.Tensor([[1, 4], [2, 5], [3, 6]])
out = spmm(index, value, 3, 3, matrix)
```
```
print(out)
tensor([[7.0, 16.0],
[8.0, 20.0],
[7.0, 19.0]])
```
### Sparse Sparse Matrix Multiplication
```
torch_sparse.spspmm(indexA, valueA, indexB, valueB, m, k, n) -> (torch.LongTensor, torch.Tensor)
```
Matrix product of two sparse tensors.
Both input sparse matrices need to be **coalesced** (use the `coalesced` argument to force this).
#### Parameters
* **indexA** *(LongTensor)* - The index tensor of first sparse matrix.
* **valueA** *(Tensor)* - The value tensor of first sparse matrix.
* **indexB** *(LongTensor)* - The index tensor of second sparse matrix.
* **valueB** *(Tensor)* - The value tensor of second sparse matrix.
* **m** *(int)* - The first dimension of first sparse matrix.
* **k** *(int)* - The second dimension of first sparse matrix and first dimension of second sparse matrix.
* **n** *(int)* - The second dimension of second sparse matrix.
* **coalesced** *(bool, optional)* - If set to `True`, will coalesce both input sparse matrices. (default: `False`)
#### Returns
* **index** *(LongTensor)* - The output index tensor of sparse matrix.
* **value** *(Tensor)* - The output value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import spspmm
indexA = torch.tensor([[0, 0, 1, 2, 2], [1, 2, 0, 0, 1]])
valueA = torch.Tensor([1, 2, 3, 4, 5])
indexB = torch.tensor([[0, 2], [1, 0]])
valueB = torch.Tensor([2, 4])
indexC, valueC = spspmm(indexA, valueA, indexB, valueB, 3, 3, 2)
```
```
print(indexC)
tensor([[0, 1, 2],
[0, 1, 1]])
print(valueC)
tensor([8.0, 6.0, 8.0])
```
## Running tests
```
pytest
```
## C++ API
`torch-sparse` also offers a C++ API that contains C++ equivalents of the Python models.
For this, we need to add `TorchLib` to the `-DCMAKE_PREFIX_PATH` (*e.g.*, it may exist in `{CONDA}/lib/python{X.X}/site-packages/torch` if installed via `conda`):
```
mkdir build
cd build
# Add -DWITH_CUDA=on for CUDA support
cmake -DCMAKE_PREFIX_PATH="..." ..
make
make install
```
Keywords: pytorch,sparse,sparse-matrices,autograd
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3 :: Only
Requires-Python: >=3.7
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Provides-Extra: test
License-File: LICENSE
[pypi-image]: https://badge.fury.io/py/torch-sparse.svg
[pypi-url]: https://pypi.python.org/pypi/torch-sparse
[testing-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_sparse/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_sparse?branch=master
# PyTorch Sparse
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This package consists of a small extension library of optimized sparse matrix operations with autograd support.
This package currently consists of the following methods:
* **[Coalesce](#coalesce)**
* **[Transpose](#transpose)**
* **[Sparse Dense Matrix Multiplication](#sparse-dense-matrix-multiplication)**
* **[Sparse Sparse Matrix Multiplication](#sparse-sparse-matrix-multiplication)**
All included operations work on varying data types and are implemented both for CPU and GPU.
To avoid the hassle of creating [`torch.sparse_coo_tensor`](https://pytorch.org/docs/stable/torch.html?highlight=sparse_coo_tensor#torch.sparse_coo_tensor), this package defines operations on sparse tensors by simply passing `index` and `value` tensors as arguments ([with the same shapes as defined in PyTorch](https://pytorch.org/docs/stable/sparse.html)).
Note that only `value` comes with autograd support, as `index` is discrete and therefore not differentiable.
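Since this split is the central design decision of the package, here is a minimal sketch of how it plays out in practice (assuming the package is installed as described below): gradients flow into `value` through `spmm`, while `index` stays a plain `LongTensor`:
```python
import torch
from torch_sparse import spmm

index = torch.tensor([[0, 0, 1],
                      [0, 1, 1]])                           # discrete, not differentiable
value = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)   # differentiable
matrix = torch.ones(2, 2)

out = spmm(index, value, 2, 2, matrix)
out.sum().backward()
print(value.grad)  # expected: tensor([2., 2., 2.]) -- one row sum of `matrix` per nonzero
```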
## Installation
### Anaconda
**Update:** You can now install `pytorch-sparse` via [Anaconda](https://anaconda.org/pyg/pytorch-sparse) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-sparse -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 1.11
To install the binaries for PyTorch 1.11.0, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.11.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu113`, or `cu115` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu113` | `cu115` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | | ✅ | ✅ |
| **macOS** | ✅ | | | |
#### PyTorch 1.10
To install the binaries for PyTorch 1.10.0, PyTorch 1.10.1 and PyTorch 1.10.2, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu111`, or `cu113` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu111` | `cu113` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1 and PyTorch 1.9.0 (following the same procedure).
For older versions, you might need to explicitly specify the latest supported version number in order to prevent a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
Ensure that at least PyTorch 1.7.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.7.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
If you want to additionally build `torch-sparse` with METIS support, *e.g.* for partitioning, please download and install the [METIS library](http://glaros.dtc.umn.edu/gkhome/metis/metis/download) by following the instructions in the `Install.txt` file.
Note that METIS needs to be installed with a 64-bit `IDXTYPEWIDTH` by changing `include/metis.h`.
Afterwards, set the environment variable `WITH_METIS=1`.
Then run:
```
pip install torch-scatter torch-sparse
```
When running in a docker container without NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Functions
### Coalesce
```
torch_sparse.coalesce(index, value, m, n, op="add") -> (torch.LongTensor, torch.Tensor)
```
Row-wise sorts `index` and removes duplicate entries.
Duplicate entries are removed by scattering them together.
For scattering, any operation of [`torch_scatter`](https://github.com/rusty1s/pytorch_scatter) can be used.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **op** *(string, optional)* - The scatter operation to use. (default: `"add"`)
#### Returns
* **index** *(LongTensor)* - The coalesced index tensor of sparse matrix.
* **value** *(Tensor)* - The coalesced value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import coalesce
index = torch.tensor([[1, 0, 1, 0, 2, 1],
[0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = coalesce(index, value, m=3, n=2)
```
```
print(index)
tensor([[0, 1, 1, 2],
[1, 0, 1, 0]])
print(value)
tensor([[6.0, 8.0],
[7.0, 9.0],
[3.0, 4.0],
[5.0, 6.0]])
```
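Because any `torch_scatter` reduction can be selected via `op`, here is a small sketch (reusing the tensors from the example above, and assuming the installed version supports the `"max"` reduction) that keeps the element-wise maximum of duplicate entries instead of their sum:
```python
import torch
from torch_sparse import coalesce

index = torch.tensor([[1, 0, 1, 0, 2, 1],
                      [0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])

# Duplicates at (0, 1) and (1, 0) are reduced with max instead of add.
index, value = coalesce(index, value, m=3, n=2, op="max")
print(value)
# expected: tensor([[4., 5.],
#                   [6., 7.],
#                   [3., 4.],
#                   [5., 6.]])
```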
### Transpose
```
torch_sparse.transpose(index, value, m, n) -> (torch.LongTensor, torch.Tensor)
```
Transposes dimensions 0 and 1 of a sparse matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **coalesced** *(bool, optional)* - If set to `False`, will not coalesce the output. (default: `True`)
#### Returns
* **index** *(LongTensor)* - The transposed index tensor of sparse matrix.
* **value** *(Tensor)* - The transposed value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import transpose
index = torch.tensor([[1, 0, 1, 0, 2, 1],
[0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = transpose(index, value, 3, 2)
```
```
print(index)
tensor([[0, 0, 1, 1],
[1, 2, 0, 1]])
print(value)
tensor([[7.0, 9.0],
[5.0, 6.0],
[6.0, 8.0],
[3.0, 4.0]])
```
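As a sanity check, transposing twice recovers the coalesced form of the original matrix; a minimal sketch reusing the tensors from the example above:
```python
import torch
from torch_sparse import transpose

index = torch.tensor([[1, 0, 1, 0, 2, 1],
                      [0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])

# Transpose the 3 x 2 matrix, then transpose the 2 x 3 result back.
index_t, value_t = transpose(index, value, 3, 2)
index_rt, value_rt = transpose(index_t, value_t, 2, 3)
print(index_rt)  # expected to match the coalesced index of the original input
print(value_rt)
```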
### Sparse Dense Matrix Multiplication
```
torch_sparse.spmm(index, value, m, n, matrix) -> torch.Tensor
```
Matrix product of a sparse matrix with a dense matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **matrix** *(Tensor)* - The dense matrix.
#### Returns
* **out** *(Tensor)* - The dense output matrix.
#### Example
```python
import torch
from torch_sparse import spmm
index = torch.tensor([[0, 0, 1, 2, 2],
[0, 2, 1, 0, 1]])
value = torch.Tensor([1, 2, 4, 1, 3])
matrix = torch.Tensor([[1, 4], [2, 5], [3, 6]])
out = spmm(index, value, 3, 3, matrix)
```
```
print(out)
tensor([[7.0, 16.0],
[8.0, 20.0],
[7.0, 19.0]])
```
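To double-check the result against a dense computation, here is a short sketch that densifies the same sparse matrix with `torch.sparse_coo_tensor` and compares against `torch.mm`:
```python
import torch
from torch_sparse import spmm

index = torch.tensor([[0, 0, 1, 2, 2],
                      [0, 2, 1, 0, 1]])
value = torch.Tensor([1, 2, 4, 1, 3])
matrix = torch.Tensor([[1, 4], [2, 5], [3, 6]])

out = spmm(index, value, 3, 3, matrix)
# Densify the sparse operand and verify both paths agree.
dense = torch.sparse_coo_tensor(index, value, (3, 3)).to_dense()
assert torch.allclose(out, torch.mm(dense, matrix))
```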
### Sparse Sparse Matrix Multiplication
```
torch_sparse.spspmm(indexA, valueA, indexB, valueB, m, k, n) -> (torch.LongTensor, torch.Tensor)
```
Matrix product of two sparse tensors.
Both input sparse matrices need to be **coalesced** (set the optional `coalesced` argument to `True` to force coalescing).
#### Parameters
* **indexA** *(LongTensor)* - The index tensor of first sparse matrix.
* **valueA** *(Tensor)* - The value tensor of first sparse matrix.
* **indexB** *(LongTensor)* - The index tensor of second sparse matrix.
* **valueB** *(Tensor)* - The value tensor of second sparse matrix.
* **m** *(int)* - The first dimension of first sparse matrix.
* **k** *(int)* - The second dimension of first sparse matrix and first dimension of second sparse matrix.
* **n** *(int)* - The second dimension of second sparse matrix.
* **coalesced** *(bool, optional)* - If set to `True`, will coalesce both input sparse matrices. (default: `False`)
#### Returns
* **index** *(LongTensor)* - The output index tensor of sparse matrix.
* **value** *(Tensor)* - The output value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import spspmm
indexA = torch.tensor([[0, 0, 1, 2, 2], [1, 2, 0, 0, 1]])
valueA = torch.Tensor([1, 2, 3, 4, 5])
indexB = torch.tensor([[0, 2], [1, 0]])
valueB = torch.Tensor([2, 4])
indexC, valueC = spspmm(indexA, valueA, indexB, valueB, 3, 3, 2)
```
```
print(indexC)
tensor([[0, 1, 2],
[0, 1, 1]])
print(valueC)
tensor([8.0, 6.0, 8.0])
```
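If the inputs may contain duplicate or unsorted entries, the optional `coalesced` argument documented above can be set to `True`; a sketch with an intentionally duplicated entry in the first matrix:
```python
import torch
from torch_sparse import spspmm

# (0, 1) appears twice in indexA; coalesced=True merges the duplicates first.
indexA = torch.tensor([[0, 0, 0, 1], [1, 1, 2, 0]])
valueA = torch.Tensor([0.5, 0.5, 2, 3])
indexB = torch.tensor([[0, 2], [1, 0]])
valueB = torch.Tensor([2, 4])

indexC, valueC = spspmm(indexA, valueA, indexB, valueB, 3, 3, 2,
                        coalesced=True)
print(indexC)  # expected: tensor([[0, 1], [0, 1]])
print(valueC)  # expected: tensor([8., 6.])
```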
## C++ API
`torch-sparse` also offers a C++ API that contains C++ equivalents of the Python models.
```
mkdir build
cd build
# Add -DWITH_CUDA=on for CUDA support if needed
cmake ..
make
make install
```
## Running tests
```
pytest
```
......@@ -3,94 +3,138 @@ MANIFEST.in
README.md
setup.cfg
setup.py
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/convert.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/diag.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/ego_sample.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/hgt_sample.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/metis.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/neighbor_sample.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/relabel.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/rw.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/saint.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/sample.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/spmm.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/spspmm.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/version.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/convert_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/diag_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/ego_sample_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/hgt_sample_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/metis_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/neighbor_sample_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/relabel_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/rw_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/saint_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/sample_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/spmm_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/cpu/spspmm_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/hip/convert_hip_hip.hip
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/hip/diag_hip_hip.hip
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/hip/rw_hip_hip.hip
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/hip/spmm_hip_hip.hip
/work/home/quyuanhao123/software/test_ocp/torch_sparse-0.6.13/csrc/hip/spspmm_hip.hip
csrc/convert.cpp
csrc/convert_hip.cpp
csrc/diag.cpp
csrc/diag_hip.cpp
csrc/ego_sample.cpp
csrc/extensions.h
csrc/hgt_sample.cpp
csrc/macros.h
csrc/metis.cpp
csrc/neighbor_sample.cpp
csrc/relabel.cpp
csrc/rw.cpp
csrc/rw_hip.cpp
csrc/saint.cpp
csrc/sample.cpp
csrc/sparse.h
csrc/spmm.cpp
csrc/spspmm.cpp
csrc/spmm_hip.cpp
csrc/version.cpp
csrc/version_hip.cpp
csrc/cpu/convert_cpu.cpp
csrc/cpu/convert_cpu.h
csrc/cpu/convert_cpu_hip.cpp
csrc/cpu/diag_cpu.cpp
csrc/cpu/diag_cpu.h
csrc/cpu/diag_cpu_hip.cpp
csrc/cpu/ego_sample_cpu.cpp
csrc/cpu/ego_sample_cpu.h
csrc/cpu/ego_sample_cpu_hip.cpp
csrc/cpu/hgt_sample_cpu.cpp
csrc/cpu/hgt_sample_cpu.h
csrc/cpu/hgt_sample_cpu_hip.cpp
csrc/cpu/metis_cpu.cpp
csrc/cpu/metis_cpu.h
csrc/cpu/metis_cpu_hip.cpp
csrc/cpu/neighbor_sample_cpu.cpp
csrc/cpu/neighbor_sample_cpu.h
csrc/cpu/neighbor_sample_cpu_hip.cpp
csrc/cpu/reducer.h
csrc/cpu/relabel_cpu.cpp
csrc/cpu/relabel_cpu.h
csrc/cpu/relabel_cpu_hip.cpp
csrc/cpu/rw_cpu.cpp
csrc/cpu/rw_cpu.h
csrc/cpu/rw_cpu_hip.cpp
csrc/cpu/saint_cpu.cpp
csrc/cpu/saint_cpu.h
csrc/cpu/saint_cpu_hip.cpp
csrc/cpu/sample_cpu.cpp
csrc/cpu/sample_cpu.h
csrc/cpu/sample_cpu_hip.cpp
csrc/cpu/spmm_cpu.cpp
csrc/cpu/spmm_cpu.h
csrc/cpu/spspmm_cpu.cpp
csrc/cpu/spspmm_cpu.h
csrc/cpu/spmm_cpu_hip.cpp
csrc/cpu/utils.h
csrc/hip/atomics.cuh
csrc/hip/convert_hip.h
csrc/hip/convert_hip.hip
csrc/hip/convert_hip_hip.hip
csrc/hip/diag_hip.h
csrc/hip/diag_hip.hip
csrc/hip/diag_hip_hip.hip
csrc/cpu/utils_hip.h
csrc/cuda/atomics.cuh
csrc/cuda/convert_cuda.cu
csrc/cuda/convert_cuda.h
csrc/cuda/diag_cuda.cu
csrc/cuda/diag_cuda.h
csrc/cuda/reducer.cuh
csrc/cuda/rw_cuda.cu
csrc/cuda/rw_cuda.h
csrc/cuda/spmm_cuda.cu
csrc/cuda/spmm_cuda.h
csrc/cuda/utils.cuh
csrc/hip/convert_cuda.h
csrc/hip/convert_cuda.hip
csrc/hip/diag_cuda.h
csrc/hip/diag_cuda.hip
csrc/hip/reducer.cuh
csrc/hip/rw_hip.h
csrc/hip/rw_hip.hip
csrc/hip/rw_hip_hip.hip
csrc/hip/spmm_hip.h
csrc/hip/spmm_hip.hip
csrc/hip/spmm_hip_hip.hip
csrc/hip/spspmm_hip.h
csrc/hip/spspmm_hip.hip
csrc/hip/rw_cuda.h
csrc/hip/rw_cuda.hip
csrc/hip/spmm_cuda.h
csrc/hip/spmm_cuda.hip
csrc/hip/utils.cuh
third_party/parallel-hashmap/.gitattributes
third_party/parallel-hashmap/.gitignore
third_party/parallel-hashmap/CMakeLists.txt
third_party/parallel-hashmap/LICENSE
third_party/parallel-hashmap/README.md
third_party/parallel-hashmap/index.html
third_party/parallel-hashmap/phmap.natvis
third_party/parallel-hashmap/phmap_gdb.py
third_party/parallel-hashmap/phmap_lldb.py
third_party/parallel-hashmap/.git/HEAD
third_party/parallel-hashmap/.git/config
third_party/parallel-hashmap/.git/description
third_party/parallel-hashmap/.git/index
third_party/parallel-hashmap/.git/packed-refs
third_party/parallel-hashmap/.git/hooks/applypatch-msg.sample
third_party/parallel-hashmap/.git/hooks/commit-msg.sample
third_party/parallel-hashmap/.git/hooks/post-update.sample
third_party/parallel-hashmap/.git/hooks/pre-applypatch.sample
third_party/parallel-hashmap/.git/hooks/pre-commit.sample
third_party/parallel-hashmap/.git/hooks/pre-push.sample
third_party/parallel-hashmap/.git/hooks/pre-rebase.sample
third_party/parallel-hashmap/.git/hooks/prepare-commit-msg.sample
third_party/parallel-hashmap/.git/hooks/update.sample
third_party/parallel-hashmap/.git/info/exclude
third_party/parallel-hashmap/.git/logs/HEAD
third_party/parallel-hashmap/.git/logs/refs/heads/master
third_party/parallel-hashmap/.git/logs/refs/remotes/origin/HEAD
third_party/parallel-hashmap/.git/objects/pack/pack-3c85fde45df5c893a21cee3d06d971471906d621.idx
third_party/parallel-hashmap/.git/objects/pack/pack-3c85fde45df5c893a21cee3d06d971471906d621.pack
third_party/parallel-hashmap/.git/refs/heads/master
third_party/parallel-hashmap/.git/refs/remotes/origin/HEAD
third_party/parallel-hashmap/.github/FUNDING.yml
third_party/parallel-hashmap/.github/workflows/linux.yml
third_party/parallel-hashmap/.github/workflows/macos.yml
third_party/parallel-hashmap/.github/workflows/windows.yml
third_party/parallel-hashmap/cmake/CMakeLists.txt.in
third_party/parallel-hashmap/cmake/DetectVersion.cmake
third_party/parallel-hashmap/cmake/DownloadGTest.cmake
third_party/parallel-hashmap/cmake/helpers.cmake
third_party/parallel-hashmap/cmake/phmap.cmake
third_party/parallel-hashmap/doc/new_release.md
third_party/parallel-hashmap/parallel_hashmap/btree.h
third_party/parallel-hashmap/parallel_hashmap/meminfo.h
third_party/parallel-hashmap/parallel_hashmap/phmap.h
third_party/parallel-hashmap/parallel_hashmap/phmap_base.h
third_party/parallel-hashmap/parallel_hashmap/phmap_base_hip.h
third_party/parallel-hashmap/parallel_hashmap/phmap_bits.h
third_party/parallel-hashmap/parallel_hashmap/phmap_bits_hip.h
third_party/parallel-hashmap/parallel_hashmap/phmap_config.h
third_party/parallel-hashmap/parallel_hashmap/phmap_config_hip.h
third_party/parallel-hashmap/parallel_hashmap/phmap_dump.h
third_party/parallel-hashmap/parallel_hashmap/phmap_fwd_decl.h
third_party/parallel-hashmap/parallel_hashmap/phmap_hip.h
third_party/parallel-hashmap/parallel_hashmap/phmap_utils.h
third_party/parallel-hashmap/parallel_hashmap/phmap_utils_hip.h
torch_sparse/__init__.py
torch_sparse/add.py
torch_sparse/bandwidth.py
......@@ -105,7 +149,6 @@ torch_sparse/matmul.py
torch_sparse/metis.py
torch_sparse/mul.py
torch_sparse/narrow.py
torch_sparse/padding.py
torch_sparse/permute.py
torch_sparse/reduce.py
torch_sparse/rw.py
......@@ -117,7 +160,9 @@ torch_sparse/spmm.py
torch_sparse/spspmm.py
torch_sparse/storage.py
torch_sparse/tensor.py
torch_sparse/testing.py
torch_sparse/transpose.py
torch_sparse/typing.py
torch_sparse/utils.py
torch_sparse.egg-info/PKG-INFO
torch_sparse.egg-info/SOURCES.txt
......
......@@ -3,12 +3,11 @@ import os.path as osp
import torch
__version__ = '0.6.15'
__version__ = '0.6.16'
for library in [
'_version', '_convert', '_diag', '_spmm', '_spspmm', '_metis', '_rw',
'_saint', '_sample', '_ego_sample', '_hgt_sample', '_neighbor_sample',
'_relabel'
'_version', '_convert', '_diag', '_spmm', '_metis', '_rw', '_saint',
'_sample', '_ego_sample', '_hgt_sample', '_neighbor_sample', '_relabel'
]:
cuda_spec = importlib.machinery.PathFinder().find_spec(
f'{library}_cuda', [osp.dirname(__file__)])
......@@ -55,7 +54,6 @@ from .rw import random_walk # noqa
from .metis import partition # noqa
from .bandwidth import reverse_cuthill_mckee # noqa
from .saint import saint_subgraph # noqa
from .padding import padded_index, padded_index_select # noqa
from .sample import sample, sample_adj # noqa
from .convert import to_torch_sparse, from_torch_sparse # noqa
......@@ -101,8 +99,6 @@ __all__ = [
'partition',
'reverse_cuthill_mckee',
'saint_subgraph',
'padded_index',
'padded_index_select',
'to_torch_sparse',
'from_torch_sparse',
'to_scipy',
......
from typing import Optional, List, Tuple
from typing import Optional, List, Tuple # noqa
import torch
from torch_sparse.storage import SparseStorage
......