id_hash_map.h 5.22 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/**
 *  Copyright (c) 2023 by Contributors
 * @file array/cpu/id_hash_map.h
 * @brief Class about id hash map
 */

#ifndef DGL_ARRAY_CPU_ID_HASH_MAP_H_
#define DGL_ARRAY_CPU_ID_HASH_MAP_H_

#include <dgl/aten/types.h>

#include <functional>
#include <memory>
#include <vector>

namespace dgl {
namespace aten {

/**
 * @brief A CPU targeted hashmap for mapping duplicate and non-consecutive ids
 * in the provided array to unique and consecutive ones. It utilizes
 * multi-threading to accelerate the insert and search speed. Currently it is
 * only designed to be used in `ToBlockCpu` for optimizing, so it only support
 * key insertions once with Init function, and it does not support key deletion.
 *
 * The hash map should be prepared in two phases before using. With the first
 * being creating the hashmap, and then init it with an id array.
 *
 * For example, for an array A with following entries:
 * [98, 98, 100, 99, 97, 99, 101, 100, 102]
 * Create the hashmap H with:
 * `H = CpuIdHashMap()` (1)
 * And Init it with:
 * `U = H.Init(A)` (2)  (U is an id array used to store the unqiue
 * ids in A).
 * Then U should be (U is not exclusive as the element order is not
 * guaranteed to be steady):
 * [98, 100, 99, 97, 101, 102]
 * And the hashmap should generate following mappings:
 *  * [
 *   {key: 98, value: 0},
 *   {key: 100, value: 1},
 *   {key: 99, value: 2},
 *   {key: 97, value: 3},
 *   {key: 101, value: 4},
 *   {key: 102, value: 5}
 * ]
 * Search the hashmap with array I=[98, 99, 102]:
 * R = H.Map(I) (3)
 * R should be:
 * [0, 2, 5]
 **/
template <typename IdType>
class IdHashMap {
 public:
  /**
   * @brief An entry in the hashtable.
   */
  struct Mapping {
    /**
     * @brief The ID of the item inserted.
     */
    IdType key;
    /**
     * @brief The value of the item inserted.
     */
    IdType value;
  };

  /**
   * @brief Cross platform CAS operation.
   * It is an atomic operation that compares the contents of a memory
   * location with a given value and, only if they are the same, modifies
   * the contents of that memory location to a new given value.
   *
   * @param ptr The pointer to the object to test and modify .
   * @param old_val The value expected to be found in `ptr`.
   * @param new_val The value to store in `ptr` if it is as expected.
   *
   * @return Old value pointed by the `ptr`.
   */
  static IdType CompareAndSwap(IdType* ptr, IdType old_val, IdType new_val);

  IdHashMap();

  IdHashMap(const IdHashMap& other) = delete;
  IdHashMap& operator=(const IdHashMap& other) = delete;

  /**
   * @brief Init the hashmap with an array of ids.
   * Firstly allocating the memeory and init the entire space with empty key.
   * And then insert the items in `ids` concurrently to generate the
   * mappings, in passing returning the unique ids in `ids`.
   *
   * @param ids The array of ids to be inserted as keys.
   *
   * @return Unique ids for the input `ids`.
   */
  IdArray Init(const IdArray& ids);

  /**
   * @brief Find the mappings of given keys.
   *
   * @param ids The keys to map for.
   *
   * @return Mapping results corresponding to `ids`.
   */
  IdArray MapIds(const IdArray& ids) const;

 private:
  /**
   * @brief Get the next position and delta for probing.
   *
   * @param[in,out] pos Calculate the next position with quadric probing.
   * @param[in,out] delta Calculate the next delta by adding 1.
   */
  void Next(IdType* pos, IdType* delta) const;

  /**
   * @brief Find the mapping of a given key.
   *
   * @param id The key to map for.
   *
   * @return Mapping result for the `id`.
   */
  IdType MapId(const IdType id) const;

  /**
   * @brief Insert an id into the hash map.
   *
   * @param id The id to be inserted.
   * @param valid The item at index will be set to indicate
   * whether the `id` at `index` is inserted or not.
   * @param index The index of the `id`.
   *
   */
  void Insert(IdType id, std::vector<int16_t>* valid, size_t index);

  /**
   * @brief Set the value for the key in the hash map.
   *
   * @param key The key to set for.
   * @param value The value to be set for the `key`.
   *
   * @warning Key must exist.
   */
  void Set(IdType key, IdType value);

  /**
   * @brief Attempt to insert the key into the hash map at the given position.
   * 1. If the key at `pos` is empty -> Set the key, return true and set
   * `valid[index]` to true.
   * 2. If the key at `pos` is equal to `key` -> Return true.
   * 3. If the key at `pos` is non-empty and not equal to `key` -> Return false.
   * @param pos The position in the hash map to be inserted at.
   * @param key The key to be inserted.
   * @param valid The item at index will be set to indicate
   * whether the `key` at `index` is inserted or not.
   * @param index The index of the `key`.
   *
   * @return Whether the key exists in the map now.
   */
  bool AttemptInsertAt(
      int64_t pos, IdType key, std::vector<int16_t>* valid, size_t index);

 private:
  /**
   * @brief Hash maps which is used to store all elements.
   */
  std::unique_ptr<Mapping[], std::function<void(Mapping*)>> hash_map_;

  /**
   * @brief Mask which is assisted to get the position in the table
   * for a key by performing `&` operation with it.
   */
  IdType mask_;
};

}  // namespace aten
}  // namespace dgl

#endif  // DGL_ARRAY_CPU_ID_HASH_MAP_H_