uvm_table.hpp 5.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once
#include <nv_util.h>

#include <thread>
#include <unordered_map>
#include <vector>

namespace gpu_cache {

template <typename key_type, typename index_type>
class HashBlock {
 public:
  key_type* keys;
  size_t num_sets;
  size_t capacity;

  HashBlock(size_t expected_capacity, int set_size, int batch_size);
  ~HashBlock();
  void add(const key_type* new_keys, const size_t num_keys, key_type* missing_keys,
           int* num_missing_keys, cudaStream_t stream);
  void query(const key_type* query_keys, const size_t num_keys, index_type* output_indices,
             key_type* missing_keys, int* missing_positions, int* num_missing_keys,
             cudaStream_t stream);
  void query(const key_type* query_keys, int* num_keys, index_type* output_indices,
             cudaStream_t stream);
  void clear(cudaStream_t stream);

 private:
  int max_set_size_;
  int batch_size_;
  int* set_sizes_;
};

template <typename vec_type>
class H2HCopy {
 public:
  H2HCopy(int num_threads) : num_threads_(num_threads), working_(num_threads) {
    for (int i = 0; i < num_threads_; i++) {
      threads_.emplace_back(
          [&](int idx) {
            while (!terminate_) {
              if (working_[idx].load(std::memory_order_relaxed)) {
                working_[idx].store(false, std::memory_order_relaxed);
                if (num_keys_ == 0) continue;
                size_t num_keys_this_thread = (num_keys_ - 1) / num_threads_ + 1;
                size_t begin = idx * num_keys_this_thread;
                if (idx == num_threads_ - 1) {
                  num_keys_this_thread = num_keys_ - num_keys_this_thread * idx;
                }
                size_t end = begin + num_keys_this_thread;

                for (size_t i = begin; i < end; i++) {
                  size_t idx_vec = get_index_(i);
                  if (idx_vec == std::numeric_limits<size_t>::max()) {
                    continue;
                  }
                  memcpy(dst_data_ptr_ + i * vec_size_, src_data_ptr_ + idx_vec * vec_size_,
                         sizeof(vec_type) * vec_size_);
                }
                num_finished_workers_++;
              }
            }
            std::this_thread::sleep_for(std::chrono::microseconds(1));
          },
          i);
    }
  };

  void copy(vec_type* dst_data_ptr, vec_type* src_data_ptr, size_t num_keys, int vec_size,
            std::function<size_t(size_t)> get_index_func) {
    std::lock_guard<std::mutex> guard(submit_mutex_);
    dst_data_ptr_ = dst_data_ptr;
    src_data_ptr_ = src_data_ptr;
    get_index_ = get_index_func;
    num_keys_ = num_keys;
    vec_size_ = vec_size;
    num_finished_workers_.store(0, std::memory_order_acquire);

    for (auto& working : working_) {
      working.store(true, std::memory_order_relaxed);
    }

    while (num_finished_workers_ != num_threads_) {
      continue;
    }
  }

  ~H2HCopy() {
    terminate_ = true;
    for (auto& t : threads_) {
      t.join();
    }
  }

 private:
  vec_type* src_data_ptr_;
  vec_type* dst_data_ptr_;

  std::function<size_t(size_t)> get_index_;

  size_t num_keys_;
  int vec_size_;

  std::mutex submit_mutex_;
  const int num_threads_;
  std::vector<std::thread> threads_;
  std::vector<std::atomic<bool>> working_;
  volatile bool terminate_{false};
  std::atomic<int> num_finished_workers_{0};
};

template <typename key_type, typename index_type, typename vec_type = float>
class UvmTable {
 public:
  UvmTable(const size_t device_table_capacity, const size_t host_table_capacity,
           const int max_batch_size, const int vec_size,
           const vec_type default_value = (vec_type)0);
  ~UvmTable();
  void query(const key_type* d_keys, const int len, vec_type* d_vectors, cudaStream_t stream = 0);
  void add(const key_type* h_keys, const vec_type* h_vectors, const size_t len);
  void clear(cudaStream_t stream = 0);

 private:
  static constexpr int num_buffers_ = 2;
  key_type* d_keys_buffer_;
  vec_type* d_vectors_buffer_;
  vec_type* d_vectors_;

  index_type* d_output_indices_;
  index_type* d_output_host_indices_;
  index_type* h_output_host_indices_;

  key_type* d_missing_keys_;
  int* d_missing_positions_;
  int* d_missing_count_;

  std::vector<vec_type> h_vectors_;
  key_type* h_missing_keys_;

  cudaStream_t query_stream_;
  cudaEvent_t query_event_;

  vec_type* h_cpy_buffers_[num_buffers_];
  vec_type* d_cpy_buffers_[num_buffers_];
  cudaStream_t cpy_streams_[num_buffers_];
  cudaEvent_t cpy_events_[num_buffers_];

  std::unordered_map<key_type, index_type> h_final_missing_items_;

  int max_batch_size_;
  int vec_size_;
  size_t num_set_;
  size_t num_host_set_;
  size_t table_capacity_;
  std::vector<vec_type> default_vector_;

  HashBlock<key_type, index_type> device_table_;
  HashBlock<key_type, index_type> host_table_;
};
}  // namespace gpu_cache