Commit 1d28bf8b authored by sangwzh

update third_party/HugeCTR/gpu_cache code to HIP

parent f119ea7c
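The changes below are almost entirely mechanical: hipify renames CUDA runtime types and API calls to their HIP counterparts and rewrites triple-chevron kernel launches as hipLaunchKernelGGL. A minimal, self-contained sketch of that mapping (illustrative only, not code from this commit):

// Illustrative CUDA -> HIP correspondence; builds with hipcc.
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void scale(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1024;
  float* d_data = nullptr;
  hipError_t err = hipMalloc((void**)&d_data, n * sizeof(float));   // was cudaMalloc
  if (err != hipSuccess) { std::printf("%s\n", hipGetErrorString(err)); return 1; }
  hipStream_t stream;                                               // was cudaStream_t
  hipStreamCreate(&stream);                                         // was cudaStreamCreate
  // was: scale<<<(n + 255) / 256, 256, 0, stream>>>(d_data, 2.f, n);
  hipLaunchKernelGGL(scale, dim3((n + 255) / 256), dim3(256), 0, stream, d_data, 2.f, n);
  hipStreamSynchronize(stream);                                     // was cudaStreamSynchronize
  hipStreamDestroy(stream);
  hipFree(d_data);                                                  // was cudaFree
  return 0;
}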
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
 * Copyright (c) 2021, NVIDIA CORPORATION.
 *
@@ -31,22 +33,22 @@ class gpu_cache_api {
  // Query API, i.e. A single read from the cache
  virtual void Query(const key_type* d_keys, const size_t len, float* d_values,
                     uint64_t* d_missing_index, key_type* d_missing_keys, size_t* d_missing_len,
-                    cudaStream_t stream,
+                    hipStream_t stream,
                     const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;
  // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
  virtual void Replace(const key_type* d_keys, const size_t len, const float* d_values,
-                      cudaStream_t stream,
+                      hipStream_t stream,
                       const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;
  // Update API, i.e. update the embeddings which exist in the cache
  virtual void Update(const key_type* d_keys, const size_t len, const float* d_values,
-                     cudaStream_t stream,
+                     hipStream_t stream,
                      const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;
  // Dump API, i.e. dump some slabsets' keys from the cache
  virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index,
-                   const size_t end_set_index, cudaStream_t stream) = 0;
+                   const size_t end_set_index, hipStream_t stream) = 0;
};
} // namespace gpu_cache
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
@@ -61,20 +63,20 @@ class gpu_cache : public gpu_cache_api<key_type> {
  // Query API, i.e. A single read from the cache
  void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index,
-            key_type* d_missing_keys, size_t* d_missing_len, cudaStream_t stream,
+            key_type* d_missing_keys, size_t* d_missing_len, hipStream_t stream,
             const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;
  // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
-  void Replace(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream,
+  void Replace(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream,
               const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;
  // Update API, i.e. update the embeddings which exist in the cache
-  void Update(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream,
+  void Update(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream,
              const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;
  // Dump API, i.e. dump some slabsets' keys from the cache
  void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index,
-           const size_t end_set_index, cudaStream_t stream) override;
+           const size_t end_set_index, hipStream_t stream) override;
 public:
  using slabset = slab_set<set_associativity, key_type, warp_size>;
...
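A hypothetical call site for the interface above (the buffer names and the including header are placeholders, not code from this repository):

// Sketch: querying an embedding cache through gpu_cache_api on a HIP stream.
#include <nv_gpu_cache.hpp>  // assumed header declaring gpu_cache_api
#include <cstdint>

using KeyType = long long;

void lookup_batch(gpu_cache::gpu_cache_api<KeyType>& cache, const KeyType* d_keys, size_t len,
                  float* d_values, uint64_t* d_missing_index, KeyType* d_missing_keys,
                  size_t* d_missing_len, hipStream_t stream) {
  // Query fills d_values for cache hits and reports misses via the d_missing_* buffers.
  cache.Query(d_keys, len, d_values, d_missing_index, d_missing_keys, d_missing_len, stream);
  // After fetching the missing embeddings elsewhere, the caller would typically call
  // cache.Replace(...) on the same stream to refresh the cache.
}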
// !!! This is a file automatically generated by hipify!!!
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
@@ -15,7 +16,7 @@
 */
#pragma once
-#include <cuda_runtime_api.h>
+#include <hip/hip_runtime_api.h>
#include <stdexcept>
#include <string>
@@ -30,17 +31,17 @@ class CudaException : public std::runtime_error {
  CudaException(const std::string& what) : runtime_error(what) {}
};
-inline void cuda_check_(cudaError_t val, const char* file, int line) {
-  if (val != cudaSuccess) {
+inline void cuda_check_(hipError_t val, const char* file, int line) {
+  if (val != hipSuccess) {
    throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " +
-                        std::to_string(val) + ": " + cudaGetErrorString(val));
+                        std::to_string(val) + ": " + hipGetErrorString(val));
  }
}
class CudaDeviceRestorer {
 public:
-  CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); }
-  ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); }
+  CudaDeviceRestorer() { CUDA_CHECK(hipGetDevice(&dev_)); }
+  ~CudaDeviceRestorer() { CUDA_CHECK(hipSetDevice(dev_)); }
  void check_device(int device) const {
    if (device != dev_) {
      throw std::runtime_error(
@@ -54,14 +55,14 @@ class CudaDeviceRestorer {
};
inline int get_dev(const void* ptr) {
-  cudaPointerAttributes attr;
-  CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
+  hipPointerAttribute_t attr;
+  CUDA_CHECK(hipPointerGetAttributes(&attr, ptr));
  int dev = -1;
-#if CUDART_VERSION >= 10000
-  if (attr.type == cudaMemoryTypeDevice)
+#if DTKRT_VERSION >= 10000
+  if (attr.type == hipMemoryTypeDevice)
#else
-  if (attr.memoryType == cudaMemoryTypeDevice)
+  if (attr.memoryType == hipMemoryTypeDevice)
#endif
  {
    dev = attr.device;
@@ -72,7 +73,7 @@ inline int get_dev(const void* ptr) {
inline void switch_to_dev(const void* ptr) {
  int dev = get_dev(ptr);
  if (dev >= 0) {
-    CUDA_CHECK(cudaSetDevice(dev));
+    CUDA_CHECK(hipSetDevice(dev));
  }
}
...
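The helper keeps its CUDA-era names (CudaException, cuda_check_, CUDA_CHECK) but now routes hipError_t through them. A minimal usage sketch, assuming the CUDA_CHECK macro wraps nv::cuda_check_ with __FILE__/__LINE__ as in the original HugeCTR header:

// Sketch: error-checked device selection and allocation via the hipified nv_util.h.
#include <nv_util.h>
#include <hip/hip_runtime_api.h>

void allocate_on(int device, void** ptr, size_t bytes) {
  nv::CudaDeviceRestorer restorer;    // remembers the current device, restores it on scope exit
  CUDA_CHECK(hipSetDevice(device));   // a failing HIP call throws CudaException with file:line
  CUDA_CHECK(hipMalloc(ptr, bytes));
}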
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
@@ -50,17 +52,17 @@ class StaticHashTable {
    return keys_bytes + indices_bytes + values_bytes;
  }
-  void clear(cudaStream_t stream = 0);
+  void clear(hipStream_t stream = 0);
  // Note:
  // 1. Please make sure the key to be inserted is not duplicated.
  // 2. Please make sure the key to be inserted does not exist in the table.
  // 3. Please make sure (size() + num_keys) <= capacity().
  void insert(const key_type *keys, const value_type *values, size_type num_keys,
-              cudaStream_t stream = 0);
+              hipStream_t stream = 0);
  void lookup(const key_type *keys, value_type *values, int num_keys, value_type default_value = 0,
-              cudaStream_t stream = 0);
+              hipStream_t stream = 0);
 private:
  key_type *table_keys_;
...
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
@@ -36,12 +38,12 @@ class static_table {
  ~static_table(){};
  // Query API, i.e. A single read from the cache
-  void Query(const key_type* d_keys, const size_t len, float* d_values, cudaStream_t stream);
+  void Query(const key_type* d_keys, const size_t len, float* d_values, hipStream_t stream);
  // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
-  void Init(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream);
-  void Clear(cudaStream_t stream);
+  void Init(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream);
+  void Clear(hipStream_t stream);
 private:
  StaticHashTable<key_type, float> static_hash_table_;
...
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once
#include <nv_util.h>

#include <thread>
#include <unordered_map>
#include <vector>

namespace gpu_cache {

template <typename key_type, typename index_type>
class HashBlock {
 public:
  key_type* keys;
  size_t num_sets;
  size_t capacity;

  HashBlock(size_t expected_capacity, int set_size, int batch_size);
  ~HashBlock();
  void add(const key_type* new_keys, const size_t num_keys, key_type* missing_keys,
-           int* num_missing_keys, cudaStream_t stream);
+           int* num_missing_keys, hipStream_t stream);
  void query(const key_type* query_keys, const size_t num_keys, index_type* output_indices,
             key_type* missing_keys, int* missing_positions, int* num_missing_keys,
-             cudaStream_t stream);
+             hipStream_t stream);
  void query(const key_type* query_keys, int* num_keys, index_type* output_indices,
-             cudaStream_t stream);
-  void clear(cudaStream_t stream);
+             hipStream_t stream);
+  void clear(hipStream_t stream);

 private:
  int max_set_size_;
  int batch_size_;
  int* set_sizes_;
};

template <typename vec_type>
class H2HCopy {
 public:
  H2HCopy(int num_threads) : num_threads_(num_threads), working_(num_threads) {
    for (int i = 0; i < num_threads_; i++) {
      threads_.emplace_back(
          [&](int idx) {
            while (!terminate_) {
              if (working_[idx].load(std::memory_order_relaxed)) {
                working_[idx].store(false, std::memory_order_relaxed);
                if (num_keys_ == 0) continue;
                size_t num_keys_this_thread = (num_keys_ - 1) / num_threads_ + 1;
                size_t begin = idx * num_keys_this_thread;
                if (idx == num_threads_ - 1) {
                  num_keys_this_thread = num_keys_ - num_keys_this_thread * idx;
                }
                size_t end = begin + num_keys_this_thread;

                for (size_t i = begin; i < end; i++) {
                  size_t idx_vec = get_index_(i);
                  if (idx_vec == std::numeric_limits<size_t>::max()) {
                    continue;
                  }
                  memcpy(dst_data_ptr_ + i * vec_size_, src_data_ptr_ + idx_vec * vec_size_,
                         sizeof(vec_type) * vec_size_);
                }
                num_finished_workers_++;
              }
            }
            std::this_thread::sleep_for(std::chrono::microseconds(1));
          },
          i);
    }
  };

  void copy(vec_type* dst_data_ptr, vec_type* src_data_ptr, size_t num_keys, int vec_size,
            std::function<size_t(size_t)> get_index_func) {
    std::lock_guard<std::mutex> guard(submit_mutex_);
    dst_data_ptr_ = dst_data_ptr;
    src_data_ptr_ = src_data_ptr;
    get_index_ = get_index_func;
    num_keys_ = num_keys;
    vec_size_ = vec_size;
    num_finished_workers_.store(0, std::memory_order_acquire);

    for (auto& working : working_) {
      working.store(true, std::memory_order_relaxed);
    }

    while (num_finished_workers_ != num_threads_) {
      continue;
    }
  }

  ~H2HCopy() {
    terminate_ = true;
    for (auto& t : threads_) {
      t.join();
    }
  }

 private:
  vec_type* src_data_ptr_;
  vec_type* dst_data_ptr_;

  std::function<size_t(size_t)> get_index_;
  size_t num_keys_;
  int vec_size_;

  std::mutex submit_mutex_;
  const int num_threads_;
  std::vector<std::thread> threads_;
  std::vector<std::atomic<bool>> working_;
  volatile bool terminate_{false};
  std::atomic<int> num_finished_workers_{0};
};

template <typename key_type, typename index_type, typename vec_type = float>
class UvmTable {
 public:
  UvmTable(const size_t device_table_capacity, const size_t host_table_capacity,
           const int max_batch_size, const int vec_size,
           const vec_type default_value = (vec_type)0);
  ~UvmTable();
-  void query(const key_type* d_keys, const int len, vec_type* d_vectors, cudaStream_t stream = 0);
+  void query(const key_type* d_keys, const int len, vec_type* d_vectors, hipStream_t stream = 0);
  void add(const key_type* h_keys, const vec_type* h_vectors, const size_t len);
-  void clear(cudaStream_t stream = 0);
+  void clear(hipStream_t stream = 0);

 private:
  static constexpr int num_buffers_ = 2;
  key_type* d_keys_buffer_;
  vec_type* d_vectors_buffer_;
  vec_type* d_vectors_;

  index_type* d_output_indices_;
  index_type* d_output_host_indices_;
  index_type* h_output_host_indices_;

  key_type* d_missing_keys_;
  int* d_missing_positions_;
  int* d_missing_count_;

  std::vector<vec_type> h_vectors_;
  key_type* h_missing_keys_;

-  cudaStream_t query_stream_;
-  cudaEvent_t query_event_;
+  hipStream_t query_stream_;
+  hipEvent_t query_event_;

  vec_type* h_cpy_buffers_[num_buffers_];
  vec_type* d_cpy_buffers_[num_buffers_];
-  cudaStream_t cpy_streams_[num_buffers_];
-  cudaEvent_t cpy_events_[num_buffers_];
+  hipStream_t cpy_streams_[num_buffers_];
+  hipEvent_t cpy_events_[num_buffers_];

  std::unordered_map<key_type, index_type> h_final_missing_items_;

  int max_batch_size_;
  int vec_size_;
  size_t num_set_;
  size_t num_host_set_;
  size_t table_capacity_;
  std::vector<vec_type> default_vector_;

  HashBlock<key_type, index_type> device_table_;
  HashBlock<key_type, index_type> host_table_;
};
} // namespace gpu_cache
\ No newline at end of file
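H2HCopy above fans a host-to-host gather out over a small thread pool; copy() blocks until every worker has finished its slice. An illustrative driver (names invented for the example):

// Sketch: gathering rows of a host-resident embedding table with H2HCopy.
#include <uvm_table.hpp>
#include <vector>

void gather_rows(const std::vector<size_t>& row_ids, float* src_table, float* dst, int vec_size) {
  gpu_cache::H2HCopy<float> copier(/*num_threads=*/4);
  copier.copy(dst, src_table, row_ids.size(), vec_size, [&](size_t i) -> size_t {
    // Returning std::numeric_limits<size_t>::max() would make the worker skip slot i.
    return row_ids[i];
  });
}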
@@ -15,15 +15,14 @@
cmake_minimum_required(VERSION 3.8)
file(GLOB gpu_cache_src
-  nv_gpu_cache.cu
-  static_table.cu
-  static_hash_table.cu
-  uvm_table.cu
+  nv_gpu_cache.hip
+  static_table.hip
+  static_hash_table.hip
+  uvm_table.hip
)
add_library(gpu_cache SHARED ${gpu_cache_src})
target_compile_features(gpu_cache PUBLIC cxx_std_11)
-set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
+set_target_properties(gpu_cache PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON)
+# set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
@@ -14,7 +16,7 @@
 * limitations under the License.
 */
-#include <cooperative_groups.h>
+#include <hip/hip_cooperative_groups.h>
#include <nv_gpu_cache.hpp>
@@ -29,9 +31,11 @@ __forceinline__ __device__ long long atomicAdd(long long* address, long long val
  return (long long)atomicAdd((unsigned long long*)address, (unsigned long long)val);
}
+#ifndef __HIPCC__
__forceinline__ __device__ unsigned long atomicAdd(unsigned long* address, unsigned long val) {
  return (unsigned long)atomicAdd((unsigned long long*)address, (unsigned long long)val);
}
+#endif
namespace gpu_cache {
@@ -1253,27 +1257,27 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
  }
  // Get the current CUDA dev
-  CUDA_CHECK(cudaGetDevice(&dev_));
+  CUDA_CHECK(hipGetDevice(&dev_));
  // Calculate # of slot
  num_slot_ = capacity_in_set_ * set_associativity * warp_size;
  // Allocate GPU memory for cache
-  CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
-  CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(atomic_ref_counter_type)));
+  CUDA_CHECK(hipMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&global_counter_, sizeof(atomic_ref_counter_type)));
  // Allocate GPU memory for set mutex
-  CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(mutex) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&set_mutex_, sizeof(mutex) * capacity_in_set_));
  // Initialize the cache, set all entry to unused <K,V>
-  init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL(( init_cache), dim3(((num_slot_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
      keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_);
  // Wait for initialization to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipStreamSynchronize(0));
+  CUDA_CHECK(hipGetLastError());
}
#else
template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
@@ -1301,27 +1305,27 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
  }
  // Get the current CUDA dev
-  CUDA_CHECK(cudaGetDevice(&dev_));
+  CUDA_CHECK(hipGetDevice(&dev_));
  // Calculate # of slot
  num_slot_ = capacity_in_set_ * set_associativity * warp_size;
  // Allocate GPU memory for cache
-  CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
-  CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(ref_counter_type)));
+  CUDA_CHECK(hipMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&global_counter_, sizeof(ref_counter_type)));
  // Allocate GPU memory for set mutex
-  CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(int) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&set_mutex_, sizeof(int) * capacity_in_set_));
  // Initialize the cache, set all entry to unused <K,V>
-  init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL(( init_cache), dim3(((num_slot_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
      keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_);
  // Wait for initialization to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipStreamSynchronize(0));
+  CUDA_CHECK(hipGetLastError());
}
#endif
@@ -1337,18 +1341,18 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
  dev_restorer.check_device(dev_);
  // Destruct CUDA std object
-  destruct_kernel<<<((capacity_in_set_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL(( destruct_kernel), dim3(((capacity_in_set_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
      global_counter_, set_mutex_, capacity_in_set_);
  // Wait for destruction to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
+  CUDA_CHECK(hipStreamSynchronize(0));
  // Free GPU memory for cache
-  CUDA_CHECK(cudaFree(keys_));
-  CUDA_CHECK(cudaFree(vals_));
-  CUDA_CHECK(cudaFree(slot_counter_));
-  CUDA_CHECK(cudaFree(global_counter_));
+  CUDA_CHECK(hipFree(keys_));
+  CUDA_CHECK(hipFree(vals_));
+  CUDA_CHECK(hipFree(slot_counter_));
+  CUDA_CHECK(hipFree(global_counter_));
  // Free GPU memory for set mutex
-  CUDA_CHECK(cudaFree(set_mutex_));
+  CUDA_CHECK(hipFree(set_mutex_));
}
#else
template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
@@ -1362,12 +1366,12 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
  dev_restorer.check_device(dev_);
  // Free GPU memory for cache
-  CUDA_CHECK(cudaFree(keys_));
-  CUDA_CHECK(cudaFree(vals_));
-  CUDA_CHECK(cudaFree(slot_counter_));
-  CUDA_CHECK(cudaFree(global_counter_));
+  CUDA_CHECK(hipFree(keys_));
+  CUDA_CHECK(hipFree(vals_));
+  CUDA_CHECK(hipFree(slot_counter_));
+  CUDA_CHECK(hipFree(global_counter_));
  // Free GPU memory for set mutex
-  CUDA_CHECK(cudaFree(set_mutex_));
+  CUDA_CHECK(hipFree(set_mutex_));
}
#endif
@@ -1377,7 +1381,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
               slab_hasher>::Query(const key_type* d_keys, const size_t len, float* d_values,
                                   uint64_t* d_missing_index, key_type* d_missing_keys,
-                                  size_t* d_missing_len, cudaStream_t stream,
+                                  size_t* d_missing_len, hipStream_t stream,
                                   const size_t task_per_warp_tile) {
  // Device Restorer
  nv::CudaDeviceRestorer dev_restorer;
@@ -1387,27 +1391,27 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
  // Check if it is a valid query
  if (len == 0) {
    // Set the d_missing_len to 0 before return
-    CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
+    CUDA_CHECK(hipMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
    return;
  }
  // Update the global counter as user perform a new(most recent) read operation to the cache
  // Resolve distance overflow issue as well.
-  update_kernel_overflow_ignore<atomic_ref_counter_type>
-      <<<1, 1, 0, stream>>>(global_counter_, d_missing_len);
+  hipLaunchKernelGGL(( update_kernel_overflow_ignore<atomic_ref_counter_type>)
+      , dim3(1), dim3(1), 0, stream, global_counter_, d_missing_len);
  // Read from the cache
  // Touch and refresh the hitting slot
  const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
  const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  get_kernel<key_type, ref_counter_type, atomic_ref_counter_type, slabset, set_hasher, slab_hasher,
-             mutex, empty_key, set_associativity, warp_size><<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL(( get_kernel<key_type, ref_counter_type, atomic_ref_counter_type, slabset, set_hasher, slab_hasher,
+             mutex, empty_key, set_associativity, warp_size>), dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
      d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len,
      global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_,
      task_per_warp_tile);
  // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
}
#else
template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
@@ -1415,7 +1419,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
               slab_hasher>::Query(const key_type* d_keys, const size_t len, float* d_values,
                                   uint64_t* d_missing_index, key_type* d_missing_keys,
-                                  size_t* d_missing_len, cudaStream_t stream,
+                                  size_t* d_missing_len, hipStream_t stream,
                                   const size_t task_per_warp_tile) {
  // Device Restorer
  nv::CudaDeviceRestorer dev_restorer;
@@ -1425,27 +1429,27 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
  // Check if it is a valid query
  if (len == 0) {
    // Set the d_missing_len to 0 before return
-    CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
+    CUDA_CHECK(hipMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
    return;
  }
  // Update the global counter as user perform a new(most recent) read operation to the cache
  // Resolve distance overflow issue as well.
-  update_kernel_overflow_ignore<ref_counter_type>
-      <<<1, 1, 0, stream>>>(global_counter_, d_missing_len);
+  hipLaunchKernelGGL(( update_kernel_overflow_ignore<ref_counter_type>)
+      , dim3(1), dim3(1), 0, stream, global_counter_, d_missing_len);
  // Read from the cache
  // Touch and refresh the hitting slot
  const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
  const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  get_kernel<key_type, ref_counter_type, slabset, set_hasher, slab_hasher, empty_key,
-             set_associativity, warp_size><<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL(( get_kernel<key_type, ref_counter_type, slabset, set_hasher, slab_hasher, empty_key,
             set_associativity, warp_size>), dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
      d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len,
      global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_,
      task_per_warp_tile);
  // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
}
#endif
@@ -1454,7 +1458,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
          int warp_size, typename set_hasher, typename slab_hasher>
void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
               slab_hasher>::Replace(const key_type* d_keys, const size_t len,
-                                    const float* d_values, cudaStream_t stream,
+                                    const float* d_values, hipStream_t stream,
                                     const size_t task_per_warp_tile) {
  // Check if it is a valid replacement
  if (len == 0) {
@@ -1470,21 +1474,21 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
  // Then replace the <k,v> pairs into the cache
  const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
  const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  insert_replace_kernel<key_type, slabset, ref_counter_type, mutex, atomic_ref_counter_type,
-                        set_hasher, slab_hasher, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_values, embedding_vec_size_, len, keys_,
+  hipLaunchKernelGGL(( insert_replace_kernel<key_type, slabset, ref_counter_type, mutex, atomic_ref_counter_type,
                        set_hasher, slab_hasher, empty_key, set_associativity, warp_size>)
+      , dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_values, embedding_vec_size_, len, keys_,
                                              vals_, slot_counter_, set_mutex_, global_counter_,
                                              capacity_in_set_, task_per_warp_tile);
  // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
}
#else
template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
          int warp_size, typename set_hasher, typename slab_hasher>
void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
               slab_hasher>::Replace(const key_type* d_keys, const size_t len,
-                                    const float* d_values, cudaStream_t stream,
+                                    const float* d_values, hipStream_t stream,
                                     const size_t task_per_warp_tile) {
  // Check if it is a valid replacement
  if (len == 0) {
@@ -1500,13 +1504,13 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
  // Then replace the <k,v> pairs into the cache
  const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
  const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  insert_replace_kernel<key_type, slabset, ref_counter_type, set_hasher, slab_hasher, empty_key,
-                        set_associativity, warp_size><<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL(( insert_replace_kernel<key_type, slabset, ref_counter_type, set_hasher, slab_hasher, empty_key,
                        set_associativity, warp_size>), dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
      d_keys, d_values, embedding_vec_size_, len, keys_, vals_, slot_counter_, set_mutex_,
      global_counter_, capacity_in_set_, task_per_warp_tile);
  // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
}
#endif
@@ -1515,7 +1519,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
          int warp_size, typename set_hasher, typename slab_hasher>
void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
               slab_hasher>::Update(const key_type* d_keys, const size_t len, const float* d_values,
-                                   cudaStream_t stream, const size_t task_per_warp_tile) {
+                                   hipStream_t stream, const size_t task_per_warp_tile) {
  // Check if it is a valid update request
  if (len == 0) {
    return;
@@ -1529,20 +1533,20 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
  // Update the value of input keys that are existed in the cache
  const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
  const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  update_kernel<key_type, slabset, set_hasher, slab_hasher, mutex, empty_key, set_associativity,
-                warp_size><<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL(( update_kernel<key_type, slabset, set_hasher, slab_hasher, mutex, empty_key, set_associativity,
                warp_size>), dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
      d_keys, len, d_values, embedding_vec_size_, capacity_in_set_, keys_, vals_, set_mutex_,
      task_per_warp_tile);
  // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
}
#else
template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
          int warp_size, typename set_hasher, typename slab_hasher>
void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
               slab_hasher>::Update(const key_type* d_keys, const size_t len, const float* d_values,
-                                   cudaStream_t stream, const size_t task_per_warp_tile) {
+                                   hipStream_t stream, const size_t task_per_warp_tile) {
  // Check if it is a valid update request
  if (len == 0) {
    return;
@@ -1556,13 +1560,13 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
  // Update the value of input keys that are existed in the cache
  const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
  const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  update_kernel<key_type, slabset, set_hasher, slab_hasher, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, len, d_values, embedding_vec_size_,
+  hipLaunchKernelGGL(( update_kernel<key_type, slabset, set_hasher, slab_hasher, empty_key, set_associativity, warp_size>)
+      , dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, len, d_values, embedding_vec_size_,
                                              capacity_in_set_, keys_, vals_, set_mutex_,
                                              task_per_warp_tile);
  // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
}
#endif
@@ -1572,7 +1576,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
               slab_hasher>::Dump(key_type* d_keys, size_t* d_dump_counter,
                                  const size_t start_set_index, const size_t end_set_index,
-                                 cudaStream_t stream) {
+                                 hipStream_t stream) {
  // Check if it is a valid dump request
  if (start_set_index >= capacity_in_set_) {
    printf("Error: Invalid value for start_set_index. Nothing dumped.\n");
@@ -1589,17 +1593,17 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
  dev_restorer.check_device(dev_);
  // Set the global counter to 0 first
-  CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
+  CUDA_CHECK(hipMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
  // Dump keys from the cache
  const size_t grid_size =
      (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1;
-  dump_kernel<key_type, slabset, mutex, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_dump_counter, keys_, set_mutex_,
+  hipLaunchKernelGGL(( dump_kernel<key_type, slabset, mutex, empty_key, set_associativity, warp_size>)
+      , dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_dump_counter, keys_, set_mutex_,
                                              start_set_index, end_set_index);
  // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
}
#else
template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
@@ -1607,7 +1611,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
               slab_hasher>::Dump(key_type* d_keys, size_t* d_dump_counter,
                                  const size_t start_set_index, const size_t end_set_index,
-                                 cudaStream_t stream) {
+                                 hipStream_t stream) {
  // Check if it is a valid dump request
  if (start_set_index >= capacity_in_set_) {
    printf("Error: Invalid value for start_set_index. Nothing dumped.\n");
@@ -1624,17 +1628,17 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
  dev_restorer.check_device(dev_);
  // Set the global counter to 0 first
-  CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
+  CUDA_CHECK(hipMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
  // Dump keys from the cache
  const size_t grid_size =
      (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1;
-  dump_kernel<key_type, slabset, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_dump_counter, keys_, set_mutex_,
+  hipLaunchKernelGGL(( dump_kernel<key_type, slabset, empty_key, set_associativity, warp_size>)
+      , dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_dump_counter, keys_, set_mutex_,
                                              start_set_index, end_set_index);
  // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
}
#endif
...
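Most of the edits in this file rewrite triple-chevron launches into hipLaunchKernelGGL; the two spellings are equivalent under hipcc. A standalone illustration (not code from this file):

// Sketch: the same launch written with chevrons and with the macro form hipify emits.
#include <hip/hip_runtime.h>

__global__ void init_to(int* data, int value, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

void launch_both_ways(int* d_data, size_t n, hipStream_t stream) {
  const int block = 256;
  const int grid = (n - 1) / block + 1;  // same ceil-division grid sizing used above
  init_to<<<grid, block, 0, stream>>>(d_data, 0, n);  // chevron form (hipcc accepts it)
  // Macro form: kernel, grid, block, shared-memory bytes, stream, then the kernel arguments.
  hipLaunchKernelGGL(init_to, dim3(grid), dim3(block), 0, stream, d_data, 0, n);
}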
// !!! This is a file automatically generated by hipify!!!
/* /*
* Copyright (c) 2023, NVIDIA CORPORATION. * Copyright (c) 2023, NVIDIA CORPORATION.
* *
...@@ -14,8 +15,8 @@ ...@@ -14,8 +15,8 @@
* limitations under the License. * limitations under the License.
*/ */
#include <cooperative_groups.h> #include <hip/hip_cooperative_groups.h>
#include <cuda.h> #include <hip/hip_runtime.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
...@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c ...@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c
// otherwise return invalid_slot. // otherwise return invalid_slot.
const size_type num_groups = capacity / group_size; const size_type num_groups = capacity / group_size;
#if (CUDA_VERSION < 11060) #if (DTK_VERSION < 11060)
unsigned long long num_threads_per_group = cg.size(); unsigned long long num_threads_per_group = cg.size();
#else #else
unsigned long long num_threads_per_group = cg.num_threads(); unsigned long long num_threads_per_group = cg.num_threads();
...@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c ...@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c
const size_type num_groups = capacity / group_size; const size_type num_groups = capacity / group_size;
#if (CUDA_VERSION < 11060) #if (DTK_VERSION < 11060)
unsigned long long num_threads_per_group = cg.size(); unsigned long long num_threads_per_group = cg.size();
#else #else
unsigned long long num_threads_per_group = cg.num_threads(); unsigned long long num_threads_per_group = cg.num_threads();
...@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash ...@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash
size_t align_m = 16; size_t align_m = 16;
size_t num_keys = key_capacity_ + 1; size_t num_keys = key_capacity_ + 1;
size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m; size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m;
CUDA_CHECK(cudaMalloc(&table_keys_, sizeof(key_type) * num_keys)); CUDA_CHECK(hipMalloc(&table_keys_, sizeof(key_type) * num_keys));
CUDA_CHECK(cudaMalloc(&table_indices_, sizeof(size_type) * num_keys)); CUDA_CHECK(hipMalloc(&table_indices_, sizeof(size_type) * num_keys));
CUDA_CHECK(cudaMalloc(&table_values_, sizeof(value_type) * num_values)); CUDA_CHECK(hipMalloc(&table_values_, sizeof(value_type) * num_values));
// Initialize table_keys_ // Initialize table_keys_
CUDA_CHECK(cudaMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_)); CUDA_CHECK(hipMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
CUDA_CHECK(cudaMemset(table_keys_ + key_capacity_, 0, sizeof(key_type))); CUDA_CHECK(hipMemset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
} }
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size, template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher> typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert( void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert(
const key_type *keys, const value_type *values, size_type num_keys, cudaStream_t stream) { const key_type *keys, const value_type *values, size_type num_keys, hipStream_t stream) {
if (num_keys == 0) { if (num_keys == 0) {
return; return;
} }
...@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser ...@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
// Insert keys // Insert keys
constexpr int block = 256; constexpr int block = 256;
int grid = (num_keys - 1) / block + 1; int grid = (num_keys - 1) / block + 1;
InsertKeyKernel<tile_size, group_size> hipLaunchKernelGGL(( InsertKeyKernel<tile_size, group_size>)
<<<grid, block, 0, stream>>>(table_keys_, table_indices_, key_capacity_, keys, num_keys, , dim3(grid), dim3(block), 0, stream, table_keys_, table_indices_, key_capacity_, keys, num_keys,
size_, hash_, empty_key, invalid_slot); size_, hash_, empty_key, invalid_slot);
// Copy values // Copy values
CUDA_CHECK(cudaMemcpyAsync(table_values_ + size_ * value_dim_, values, CUDA_CHECK(hipMemcpyAsync(table_values_ + size_ * value_dim_, values,
sizeof(value_type) * num_keys * value_dim_, cudaMemcpyDeviceToDevice, sizeof(value_type) * num_keys * value_dim_, hipMemcpyDeviceToDevice,
stream)); stream));
size_ += num_keys; size_ += num_keys;
} }
...@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser ...@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size, template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher> typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear( void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear(
cudaStream_t stream) { hipStream_t stream) {
CUDA_CHECK(cudaMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream)); CUDA_CHECK(hipMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
CUDA_CHECK(cudaMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream)); CUDA_CHECK(hipMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
size_ = 0; size_ = 0;
} }
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size, template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher> typename hasher>
StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() { StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() {
CUDA_CHECK(cudaFree(table_keys_)); CUDA_CHECK(hipFree(table_keys_));
CUDA_CHECK(cudaFree(table_indices_)); CUDA_CHECK(hipFree(table_indices_));
CUDA_CHECK(cudaFree(table_values_)); CUDA_CHECK(hipFree(table_values_));
} }
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size, template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher> typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup( void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup(
const key_type *keys, value_type *values, int num_keys, value_type default_value, const key_type *keys, value_type *values, int num_keys, value_type default_value,
cudaStream_t stream) { hipStream_t stream) {
if (num_keys == 0) { if (num_keys == 0) {
return; return;
} }
...@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku ...@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku
constexpr int block = 256; constexpr int block = 256;
const int grid = (num_keys - 1) / block + 1; const int grid = (num_keys - 1) / block + 1;
// Lookup keys // Lookup keys
LookupKernel<tile_size, group_size><<<grid, block, 0, stream>>>( hipLaunchKernelGGL(( LookupKernel<tile_size, group_size>), dim3(grid), dim3(block), 0, stream,
table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values, table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values,
hash_, empty_key, default_value, invalid_slot); hash_, empty_key, default_value, invalid_slot);
} }
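The launch rewrite visible above follows one fixed pattern: hipify turns CUDA's triple-chevron launch into a hipLaunchKernelGGL call with explicit dim3 arguments, shared-memory size and stream, followed by the kernel arguments. A minimal, self-contained sketch of that mapping (the kernel and all names here are illustrative, not part of this patch):

#include <hip/hip_runtime.h>

__global__ void scale_kernel(float* data, int n, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void scale(float* d_data, int n, float factor, hipStream_t stream) {
  const int block = 256;
  const int grid = (n - 1) / block + 1;
  // hipify-generated spelling: kernel, dim3 grid/block, shared-mem bytes, stream, then arguments.
  hipLaunchKernelGGL((scale_kernel), dim3(grid), dim3(block), 0, stream, d_data, n, factor);
  // Under hipcc the CUDA-style form is also accepted:
  // scale_kernel<<<grid, block, 0, stream>>>(d_data, n, factor);
}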
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/* /*
* Copyright (c) 2023, NVIDIA CORPORATION. * Copyright (c) 2023, NVIDIA CORPORATION.
* *
...@@ -14,7 +16,7 @@ ...@@ -14,7 +16,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include <cooperative_groups.h> #include <hip/hip_cooperative_groups.h>
#include <nv_util.h> #include <nv_util.h>
#include <iostream> #include <iostream>
...@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed ...@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed
template <typename key_type> template <typename key_type>
void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values, void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values,
cudaStream_t stream) { hipStream_t stream) {
static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream); static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream);
} }
template <typename key_type> template <typename key_type>
void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values, void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values,
cudaStream_t stream) { hipStream_t stream) {
static_hash_table_.insert(d_keys, d_values, len, stream); static_hash_table_.insert(d_keys, d_values, len, stream);
} }
template <typename key_type> template <typename key_type>
void static_table<key_type>::Clear(cudaStream_t stream) { void static_table<key_type>::Clear(hipStream_t stream) {
static_hash_table_.clear(stream); static_hash_table_.clear(stream);
} }
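A possible host-side use of the static_table wrapper above, shown as a hedged sketch: the helper function, its template parameters and the stream handling are illustrative, while the Init/Query/Clear signatures come from the declarations in this diff (Clear drops the entries but keeps the allocated capacity, as the clear() implementation earlier shows).

#include <hip/hip_runtime.h>
#include <cstddef>

// CacheTable is expected to expose Init/Query/Clear as declared above.
template <typename CacheTable, typename key_type>
void warm_and_query(CacheTable& table, const key_type* d_keys, float* d_values, std::size_t len) {
  hipStream_t stream;
  (void)hipStreamCreate(&stream);
  table.Init(d_keys, len, d_values, stream);   // fill the static table once
  table.Query(d_keys, len, d_values, stream);  // later reads are served from the table
  (void)hipStreamSynchronize(stream);
  table.Clear(stream);                         // remove all entries, capacity is retained
  (void)hipStreamDestroy(stream);
}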
......
/* // !!! This is a file automatically generated by hipify!!!
* Copyright (c) 2023, NVIDIA CORPORATION. /*
* * Copyright (c) 2023, NVIDIA CORPORATION.
* Licensed under the Apache License, Version 2.0 (the "License"); *
* you may not use this file except in compliance with the License. * Licensed under the Apache License, Version 2.0 (the "License");
* You may obtain a copy of the License at * you may not use this file except in compliance with the License.
* * You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0 *
* * http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an "AS IS" BASIS, * Unless required by applicable law or agreed to in writing, software
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* limitations under the License. * See the License for the specific language governing permissions and
*/ * limitations under the License.
*/
#include <cooperative_groups.h>
#include <cuda_runtime_api.h> #include <hip/hip_cooperative_groups.h>
#include <immintrin.h> #include <hip/hip_runtime_api.h>
#include <immintrin.h>
#include <atomic>
#include <iostream> #include <atomic>
#include <limits> #include <iostream>
#include <mutex> #include <limits>
#include <uvm_table.hpp> #include <mutex>
#include <uvm_table.hpp>
namespace cg = cooperative_groups;
namespace cg = cooperative_groups;
namespace {
namespace {
constexpr int set_size = 4;
constexpr int block_size = 256; constexpr int set_size = 4;
constexpr int block_size = 256;
template <typename key_type>
__host__ __device__ key_type hash(key_type key) { template <typename key_type>
return key; __host__ __device__ key_type hash(key_type key) {
} return key;
}
template <typename key_type>
__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys, template <typename key_type>
const int num_sets, int* set_sizes, const int max_set_size, __global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys,
key_type* missing_keys, int* num_missing_keys) { const int num_sets, int* set_sizes, const int max_set_size,
__shared__ key_type s_missing_keys[block_size]; key_type* missing_keys, int* num_missing_keys) {
__shared__ int s_missing_count; __shared__ key_type s_missing_keys[block_size];
__shared__ size_t s_missing_idx; __shared__ int s_missing_count;
__shared__ size_t s_missing_idx;
auto grid = cg::this_grid();
auto block = cg::this_thread_block(); auto grid = cg::this_grid();
auto block = cg::this_thread_block();
if (block.thread_rank() == 0) {
s_missing_count = 0; if (block.thread_rank() == 0) {
} s_missing_count = 0;
block.sync(); }
block.sync();
size_t idx = grid.thread_rank();
if (idx < num_keys) { size_t idx = grid.thread_rank();
auto key = new_keys[idx]; if (idx < num_keys) {
size_t idx_set = hash(key) % num_sets; auto key = new_keys[idx];
int prev_set_size = atomicAdd(&set_sizes[idx_set], 1); size_t idx_set = hash(key) % num_sets;
if (prev_set_size < max_set_size) { int prev_set_size = atomicAdd(&set_sizes[idx_set], 1);
keys[idx_set * max_set_size + prev_set_size] = key; if (prev_set_size < max_set_size) {
} else { keys[idx_set * max_set_size + prev_set_size] = key;
int count = atomicAdd(&s_missing_count, 1); } else {
s_missing_keys[count] = key; int count = atomicAdd(&s_missing_count, 1);
} s_missing_keys[count] = key;
} }
}
block.sync();
if (block.thread_rank() == 0) { block.sync();
s_missing_idx = atomicAdd(num_missing_keys, s_missing_count); if (block.thread_rank() == 0) {
} s_missing_idx = atomicAdd(num_missing_keys, s_missing_count);
block.sync(); }
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) { block.sync();
missing_keys[s_missing_idx + i] = s_missing_keys[i]; for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
} missing_keys[s_missing_idx + i] = s_missing_keys[i];
} }
}
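hash_add_kernel above places each key into a set of max_set_size slots chosen by hash(key) % num_sets and reports overflowing keys back to the caller, which spills them to the next level (here, the host table). A scalar host-side reference of that placement policy, provided only as a sketch:

#include <cstddef>
#include <vector>

template <typename key_type>
void hash_add_reference(const std::vector<key_type>& new_keys, std::vector<key_type>& keys,
                        std::vector<int>& set_sizes, std::size_t num_sets, int max_set_size,
                        std::vector<key_type>& missing_keys) {
  for (const key_type& key : new_keys) {
    const std::size_t idx_set = static_cast<std::size_t>(key) % num_sets;  // hash() is the identity here
    int& fill = set_sizes[idx_set];
    if (fill < max_set_size) {
      keys[idx_set * max_set_size + fill] = key;  // next free way of the target set
      ++fill;
    } else {
      missing_keys.push_back(key);                // set is full: spill to the next level
    }
  }
}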
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr, template <typename key_type, typename index_type>
const key_type* keys, const size_t num_sets, __global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr,
const int max_set_size, index_type* output_indices) { const key_type* keys, const size_t num_sets,
constexpr int tile_size = set_size; const int max_set_size, index_type* output_indices) {
auto grid = cg::this_grid(); constexpr int tile_size = set_size;
auto block = cg::this_thread_block(); auto grid = cg::this_grid();
auto tile = cg::tiled_partition<tile_size>(block); auto block = cg::this_thread_block();
int num_keys = *num_keys_ptr; auto tile = cg::tiled_partition<tile_size>(block);
if (num_keys == 0) return; int num_keys = *num_keys_ptr;
if (num_keys == 0) return;
#if (CUDA_VERSION < 11060)
size_t num_threads_per_grid = grid.size(); #if (DTK_VERSION < 11060)
#else size_t num_threads_per_grid = grid.size();
size_t num_threads_per_grid = grid.num_threads(); #else
#endif size_t num_threads_per_grid = grid.num_threads();
#endif
size_t step = (num_keys - 1) / num_threads_per_grid + 1;
for (size_t i = 0; i < step; i++) { size_t step = (num_keys - 1) / num_threads_per_grid + 1;
size_t idx = i * num_threads_per_grid + grid.thread_rank(); for (size_t i = 0; i < step; i++) {
key_type query_key = std::numeric_limits<key_type>::max(); size_t idx = i * num_threads_per_grid + grid.thread_rank();
if (idx < num_keys) { key_type query_key = std::numeric_limits<key_type>::max();
query_key = query_keys[idx]; if (idx < num_keys) {
} query_key = query_keys[idx];
auto idx_set = hash(query_key) % num_sets; }
for (int j = 0; j < tile_size; j++) { auto idx_set = hash(query_key) % num_sets;
auto current_idx_set = tile.shfl(idx_set, j); for (int j = 0; j < tile_size; j++) {
auto current_query_key = tile.shfl(query_key, j); auto current_idx_set = tile.shfl(idx_set, j);
if (current_query_key == std::numeric_limits<key_type>::max()) { auto current_query_key = tile.shfl(query_key, j);
continue; if (current_query_key == std::numeric_limits<key_type>::max()) {
} continue;
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()]; }
int existed = tile.ballot(current_query_key == candidate_key); auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
auto current_idx = tile.shfl(idx, 0) + j; int existed = tile.ballot(current_query_key == candidate_key);
if (existed) { auto current_idx = tile.shfl(idx, 0) + j;
int src_lane = __ffs(existed) - 1; if (existed) {
size_t found_idx = current_idx_set * set_size + src_lane; int src_lane = __ffs(existed) - 1;
output_indices[current_idx] = num_sets * src_lane + current_idx_set; size_t found_idx = current_idx_set * set_size + src_lane;
} else { output_indices[current_idx] = num_sets * src_lane + current_idx_set;
output_indices[current_idx] = std::numeric_limits<index_type>::max(); } else {
} output_indices[current_idx] = std::numeric_limits<index_type>::max();
} }
} }
} }
}
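On a hit, the kernel above encodes the matching slot as index = num_sets * src_lane + current_idx_set, so the slots of one way are contiguous across all sets in the value array. A tiny decode helper, included only to make that layout explicit (not part of the patch):

#include <cstddef>
#include <utility>

// Given an index produced by hash_query_kernel, recover (set, way).
inline std::pair<std::size_t, std::size_t> decode_slot(std::size_t index, std::size_t num_sets) {
  const std::size_t set = index % num_sets;
  const std::size_t way = index / num_sets;
  return {set, way};
}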
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys, template <typename key_type, typename index_type>
const key_type* keys, const size_t num_sets, __global__ void hash_query_kernel(const key_type* query_keys, const int num_keys,
const int max_set_size, index_type* output_indices, const key_type* keys, const size_t num_sets,
key_type* missing_keys, int* missing_positions, const int max_set_size, index_type* output_indices,
int* missing_count) { key_type* missing_keys, int* missing_positions,
__shared__ key_type s_missing_keys[block_size]; int* missing_count) {
__shared__ key_type s_missing_positions[block_size]; __shared__ key_type s_missing_keys[block_size];
__shared__ int s_missing_count; __shared__ key_type s_missing_positions[block_size];
__shared__ int s_missing_idx; __shared__ int s_missing_count;
__shared__ int s_missing_idx;
constexpr int tile_size = set_size;
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block(); auto grid = cg::this_grid();
auto tile = cg::tiled_partition<tile_size>(block); auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
if (block.thread_rank() == 0) {
s_missing_count = 0; if (block.thread_rank() == 0) {
} s_missing_count = 0;
block.sync(); }
block.sync();
size_t idx = grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max(); size_t idx = grid.thread_rank();
if (idx < num_keys) { key_type query_key = std::numeric_limits<key_type>::max();
query_key = query_keys[idx]; if (idx < num_keys) {
} query_key = query_keys[idx];
auto idx_set = hash(query_key) % num_sets; }
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j); for (int j = 0; j < tile_size; j++) {
auto current_query_key = tile.shfl(query_key, j); auto current_idx_set = tile.shfl(idx_set, j);
if (current_query_key == std::numeric_limits<key_type>::max()) { auto current_query_key = tile.shfl(query_key, j);
continue; if (current_query_key == std::numeric_limits<key_type>::max()) {
} continue;
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()]; }
int existed = tile.ballot(current_query_key == candidate_key); auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
if (existed) { int existed = tile.ballot(current_query_key == candidate_key);
int src_lane = __ffs(existed) - 1; if (existed) {
size_t found_idx = current_idx_set * set_size + src_lane; int src_lane = __ffs(existed) - 1;
output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set; size_t found_idx = current_idx_set * set_size + src_lane;
} else { output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set;
auto current_idx = tile.shfl(idx, 0) + j; } else {
output_indices[current_idx] = std::numeric_limits<index_type>::max(); auto current_idx = tile.shfl(idx, 0) + j;
if (tile.thread_rank() == 0) { output_indices[current_idx] = std::numeric_limits<index_type>::max();
int s_count = atomicAdd(&s_missing_count, 1); if (tile.thread_rank() == 0) {
s_missing_keys[s_count] = current_query_key; int s_count = atomicAdd(&s_missing_count, 1);
s_missing_positions[s_count] = current_idx; s_missing_keys[s_count] = current_query_key;
} s_missing_positions[s_count] = current_idx;
} }
} }
}
if (missing_keys == nullptr) {
if (grid.thread_rank() == 0 && missing_count) { if (missing_keys == nullptr) {
*missing_count = 0; if (grid.thread_rank() == 0 && missing_count) {
} *missing_count = 0;
return; }
} return;
block.sync(); }
if (block.thread_rank() == 0) { block.sync();
s_missing_idx = atomicAdd(missing_count, s_missing_count); if (block.thread_rank() == 0) {
} s_missing_idx = atomicAdd(missing_count, s_missing_count);
block.sync(); }
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) { block.sync();
missing_keys[s_missing_idx + i] = s_missing_keys[i]; for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_positions[s_missing_idx + i] = s_missing_positions[i]; missing_keys[s_missing_idx + i] = s_missing_keys[i];
} missing_positions[s_missing_idx + i] = s_missing_positions[i];
} }
}
template <int warp_size>
__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx, template <int warp_size>
const size_t emb_vec_size_in_float, __forceinline__ __device__ void warp_tile_copy(const size_t lane_idx,
volatile float* d_dst, const float* d_src) { const size_t emb_vec_size_in_float,
// 16 bytes align volatile float* d_dst, const float* d_src) {
if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) { // 16 bytes align
#pragma unroll if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) {
for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) { #pragma unroll
d_dst[i] = d_src[i]; for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) {
} d_dst[i] = d_src[i];
} else { }
#pragma unroll } else {
for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) { #pragma unroll
*(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4)); for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) {
} *(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4));
} }
} }
}
template <typename index_type, typename vec_type>
__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys, template <typename index_type, typename vec_type>
const vec_type* vectors, const int vec_size, __global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys,
vec_type* output_vectors) { const vec_type* vectors, const int vec_size,
constexpr int warp_size = 32; vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block(); auto grid = cg::this_grid();
auto tile = cg::tiled_partition<warp_size>(block); auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
auto num_threads_per_grid = grid.size(); #if (DTK_VERSION < 11060)
#else auto num_threads_per_grid = grid.size();
auto num_threads_per_grid = grid.num_threads(); #else
#endif auto num_threads_per_grid = grid.num_threads();
#endif
for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
int key_num = step * num_threads_per_grid + grid.thread_rank(); for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
index_type idx = std::numeric_limits<index_type>::max(); int key_num = step * num_threads_per_grid + grid.thread_rank();
if (key_num < num_keys) { index_type idx = std::numeric_limits<index_type>::max();
idx = query_indices[key_num]; if (key_num < num_keys) {
} idx = query_indices[key_num];
#pragma unroll 4 }
for (size_t j = 0; j < warp_size; j++) { #pragma unroll 4
index_type current_idx = tile.shfl(idx, j); for (size_t j = 0; j < warp_size; j++) {
index_type idx_write = tile.shfl(key_num, 0) + j; index_type current_idx = tile.shfl(idx, j);
if (current_idx == std::numeric_limits<index_type>::max()) continue; index_type idx_write = tile.shfl(key_num, 0) + j;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size, output_vectors + idx_write * vec_size, if (current_idx == std::numeric_limits<index_type>::max()) continue;
vectors + current_idx * vec_size); warp_tile_copy<warp_size>(tile.thread_rank(), vec_size, output_vectors + idx_write * vec_size,
} vectors + current_idx * vec_size);
} }
} }
}
template <typename index_type, typename vec_type>
__global__ void distribute_vectors_kernel(const index_type* postions, const size_t num_keys, template <typename index_type, typename vec_type>
const vec_type* vectors, const int vec_size, __global__ void distribute_vectors_kernel(const index_type* postions, const size_t num_keys,
vec_type* output_vectors) { const vec_type* vectors, const int vec_size,
constexpr int warp_size = 32; vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block(); auto grid = cg::this_grid();
auto tile = cg::tiled_partition<warp_size>(block); auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
auto num_threads_per_grid = grid.size(); #if (DTK_VERSION < 11060)
#else auto num_threads_per_grid = grid.size();
auto num_threads_per_grid = grid.num_threads(); #else
#endif auto num_threads_per_grid = grid.num_threads();
#endif
for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
size_t key_num = step * num_threads_per_grid + grid.thread_rank(); for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
index_type idx = std::numeric_limits<index_type>::max(); size_t key_num = step * num_threads_per_grid + grid.thread_rank();
if (key_num < num_keys) { index_type idx = std::numeric_limits<index_type>::max();
idx = postions[key_num]; if (key_num < num_keys) {
} idx = postions[key_num];
#pragma unroll 4 }
for (size_t j = 0; j < warp_size; j++) { #pragma unroll 4
size_t idx_write = tile.shfl(idx, j); for (size_t j = 0; j < warp_size; j++) {
size_t idx_read = tile.shfl(key_num, 0) + j; size_t idx_write = tile.shfl(idx, j);
if (idx_write == std::numeric_limits<index_type>::max()) continue; size_t idx_read = tile.shfl(key_num, 0) + j;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size, if (idx_write == std::numeric_limits<index_type>::max()) continue;
output_vectors + (size_t)idx_write * vec_size, warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
vectors + (size_t)idx_read * vec_size); output_vectors + (size_t)idx_write * vec_size,
} vectors + (size_t)idx_read * vec_size);
} }
} }
}
} // namespace
} // namespace
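A porting caveat suggested by the kernels above: they hard-code 32-lane tiles, while many HIP targets execute 64-lane wavefronts. Cooperative-groups tiles of 32 are generally still valid there, but any code that equates the tile width with the hardware warp size should confirm the device value at startup; a minimal sketch:

#include <hip/hip_runtime.h>
#include <cstdio>

void print_warp_size() {
  hipDeviceProp_t prop;
  if (hipGetDeviceProperties(&prop, 0) == hipSuccess) {
    std::printf("device 0 warp/wavefront size: %d\n", prop.warpSize);
  }
}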
namespace gpu_cache {
template <typename key_type, typename index_type, typename vec_type> namespace gpu_cache {
UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity, template <typename key_type, typename index_type, typename vec_type>
const size_t host_table_capacity, UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity,
const int max_batch_size, const int vec_size, const size_t host_table_capacity,
const vec_type default_value) const int max_batch_size, const int vec_size,
: max_batch_size_(std::max(100000, max_batch_size)), const vec_type default_value)
vec_size_(vec_size), : max_batch_size_(std::max(100000, max_batch_size)),
num_set_((device_table_capacity - 1) / set_size + 1), vec_size_(vec_size),
num_host_set_((host_table_capacity - 1) / set_size + 1), num_set_((device_table_capacity - 1) / set_size + 1),
table_capacity_(num_set_ * set_size), num_host_set_((host_table_capacity - 1) / set_size + 1),
default_vector_(vec_size, default_value), table_capacity_(num_set_ * set_size),
device_table_(device_table_capacity, set_size, max_batch_size_), default_vector_(vec_size, default_value),
host_table_(host_table_capacity * 1.3, set_size, max_batch_size_) { device_table_(device_table_capacity, set_size, max_batch_size_),
CUDA_CHECK(cudaMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_)); host_table_(host_table_capacity * 1.3, set_size, max_batch_size_) {
CUDA_CHECK(cudaMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_)); CUDA_CHECK(hipMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_)); CUDA_CHECK(hipMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_));
CUDA_CHECK(hipMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_));
CUDA_CHECK(cudaMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_)); CUDA_CHECK(hipMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMallocHost(&h_output_host_indices_, sizeof(index_type) * max_batch_size_)); CUDA_CHECK(hipMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_)); CUDA_CHECK(hipHostMalloc(&h_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_)); CUDA_CHECK(hipMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_count_, sizeof(int))); CUDA_CHECK(hipMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(int))); CUDA_CHECK(hipMalloc(&d_missing_count_, sizeof(int)));
CUDA_CHECK(cudaStreamCreate(&query_stream_)); CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(int)));
for (int i = 0; i < num_buffers_; i++) { CUDA_CHECK(hipStreamCreate(&query_stream_));
int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_); for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK( int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_);
cudaMallocHost(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size)); CUDA_CHECK(
CUDA_CHECK(cudaMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size)); hipHostMalloc(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(cudaStreamCreate(&cpy_streams_[i])); CUDA_CHECK(hipMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(cudaEventCreate(&cpy_events_[i])); CUDA_CHECK(hipStreamCreate(&cpy_streams_[i]));
} CUDA_CHECK(hipEventCreate(&cpy_events_[i]));
CUDA_CHECK(cudaMallocHost(&h_missing_keys_, sizeof(key_type) * max_batch_size_)); }
CUDA_CHECK(cudaEventCreate(&query_event_)); CUDA_CHECK(hipHostMalloc(&h_missing_keys_, sizeof(key_type) * max_batch_size_));
h_vectors_.resize(host_table_.capacity * vec_size_); CUDA_CHECK(hipEventCreate(&query_event_));
} h_vectors_.resize(host_table_.capacity * vec_size_);
}
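The constructor above pairs device buffers with pinned host buffers so the later hipMemcpyAsync calls can run asynchronously; hipify maps cudaMallocHost and cudaFreeHost to hipHostMalloc and hipHostFree. A minimal sketch of that allocation pattern (buffer size, names and stream are illustrative):

#include <hip/hip_runtime.h>
#include <cstddef>

void pinned_upload(float* d_buf, std::size_t n, hipStream_t stream) {
  float* h_buf = nullptr;
  (void)hipHostMalloc(&h_buf, n * sizeof(float));       // page-locked host allocation
  // ... fill h_buf on the host ...
  (void)hipMemcpyAsync(d_buf, h_buf, n * sizeof(float),
                       hipMemcpyHostToDevice, stream);  // truly async H2D copies need pinned memory
  (void)hipStreamSynchronize(stream);                   // h_buf must outlive the copy
  (void)hipHostFree(h_buf);
}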
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::add(const key_type* h_keys, template <typename key_type, typename index_type, typename vec_type>
const vec_type* h_vectors, void UvmTable<key_type, index_type, vec_type>::add(const key_type* h_keys,
const size_t num_keys) { const vec_type* h_vectors,
std::vector<key_type> h_missing_keys; const size_t num_keys) {
size_t num_batches = (num_keys - 1) / max_batch_size_ + 1; std::vector<key_type> h_missing_keys;
for (size_t i = 0; i < num_batches; i++) { size_t num_batches = (num_keys - 1) / max_batch_size_ + 1;
size_t this_batch_size = for (size_t i = 0; i < num_batches; i++) {
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_; size_t this_batch_size =
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_, i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice)); CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_))); sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
device_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0); CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
CUDA_CHECK(cudaDeviceSynchronize()); device_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
int num_missing_keys; CUDA_CHECK(hipDeviceSynchronize());
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys), int num_missing_keys;
cudaMemcpyDeviceToHost)); CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
size_t prev_size = h_missing_keys.size(); hipMemcpyDeviceToHost));
h_missing_keys.resize(prev_size + num_missing_keys); size_t prev_size = h_missing_keys.size();
CUDA_CHECK(cudaMemcpy(h_missing_keys.data() + prev_size, d_missing_keys_, h_missing_keys.resize(prev_size + num_missing_keys);
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost)); CUDA_CHECK(hipMemcpy(h_missing_keys.data() + prev_size, d_missing_keys_,
} sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
}
std::vector<key_type> h_final_missing_keys;
num_batches = h_missing_keys.size() ? (h_missing_keys.size() - 1) / max_batch_size_ + 1 : 0; std::vector<key_type> h_final_missing_keys;
for (size_t i = 0; i < num_batches; i++) { num_batches = h_missing_keys.size() ? (h_missing_keys.size() - 1) / max_batch_size_ + 1 : 0;
size_t this_batch_size = for (size_t i = 0; i < num_batches; i++) {
i != num_batches - 1 ? max_batch_size_ : h_missing_keys.size() - i * max_batch_size_; size_t this_batch_size =
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_missing_keys.data() + i * max_batch_size_, i != num_batches - 1 ? max_batch_size_ : h_missing_keys.size() - i * max_batch_size_;
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice)); CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_missing_keys.data() + i * max_batch_size_,
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_))); sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
host_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0); CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
CUDA_CHECK(cudaDeviceSynchronize()); host_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
int num_missing_keys; CUDA_CHECK(hipDeviceSynchronize());
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys), int num_missing_keys;
cudaMemcpyDeviceToHost)); CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
size_t prev_size = h_final_missing_keys.size(); hipMemcpyDeviceToHost));
h_final_missing_keys.resize(prev_size + num_missing_keys); size_t prev_size = h_final_missing_keys.size();
CUDA_CHECK(cudaMemcpy(h_final_missing_keys.data() + prev_size, d_missing_keys_, h_final_missing_keys.resize(prev_size + num_missing_keys);
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost)); CUDA_CHECK(hipMemcpy(h_final_missing_keys.data() + prev_size, d_missing_keys_,
} sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
}
std::vector<key_type> h_keys_buffer(max_batch_size_);
std::vector<index_type> h_indices_buffer(max_batch_size_); std::vector<key_type> h_keys_buffer(max_batch_size_);
std::vector<int> h_positions_buffer(max_batch_size_); std::vector<index_type> h_indices_buffer(max_batch_size_);
std::vector<int> h_positions_buffer(max_batch_size_);
num_batches = (num_keys - 1) / max_batch_size_ + 1;
num_batches = (num_keys - 1) / max_batch_size_ + 1;
size_t num_hit_keys = 0;
for (size_t i = 0; i < num_batches; i++) { size_t num_hit_keys = 0;
size_t this_batch_size = for (size_t i = 0; i < num_batches; i++) {
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_; size_t this_batch_size =
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_, i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice)); CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_))); sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
device_table_.query(d_keys_buffer_, this_batch_size, d_output_indices_, d_missing_keys_, CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
d_missing_positions_, d_missing_count_, 0); device_table_.query(d_keys_buffer_, this_batch_size, d_output_indices_, d_missing_keys_,
CUDA_CHECK(cudaStreamSynchronize(0)); d_missing_positions_, d_missing_count_, 0);
CUDA_CHECK(hipStreamSynchronize(0));
CUDA_CHECK(cudaMemcpy(d_vectors_buffer_, h_vectors + i * max_batch_size_ * vec_size_,
sizeof(*d_vectors_) * this_batch_size * vec_size_, CUDA_CHECK(hipMemcpy(d_vectors_buffer_, h_vectors + i * max_batch_size_ * vec_size_,
cudaMemcpyHostToDevice)); sizeof(*d_vectors_) * this_batch_size * vec_size_,
CUDA_CHECK(cudaStreamSynchronize(0)); hipMemcpyHostToDevice));
if (num_hit_keys < device_table_.capacity) { CUDA_CHECK(hipStreamSynchronize(0));
distribute_vectors_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, 0>>>( if (num_hit_keys < device_table_.capacity) {
d_output_indices_, this_batch_size, d_vectors_buffer_, vec_size_, d_vectors_); hipLaunchKernelGGL(( distribute_vectors_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, 0,
CUDA_CHECK(cudaStreamSynchronize(0)); d_output_indices_, this_batch_size, d_vectors_buffer_, vec_size_, d_vectors_);
} CUDA_CHECK(hipStreamSynchronize(0));
}
int num_missing_keys;
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys), int num_missing_keys;
cudaMemcpyDeviceToHost)); CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
num_hit_keys += this_batch_size - num_missing_keys; hipMemcpyDeviceToHost));
host_table_.query(d_missing_keys_, num_missing_keys, d_output_indices_, nullptr, nullptr, num_hit_keys += this_batch_size - num_missing_keys;
nullptr, 0); host_table_.query(d_missing_keys_, num_missing_keys, d_output_indices_, nullptr, nullptr,
nullptr, 0);
CUDA_CHECK(cudaMemcpy(h_keys_buffer.data(), d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost)) CUDA_CHECK(hipMemcpy(h_keys_buffer.data(), d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost))
CUDA_CHECK(cudaMemcpy(h_indices_buffer.data(), d_output_indices_,
sizeof(*d_output_indices_) * num_missing_keys, cudaMemcpyDeviceToHost)) CUDA_CHECK(hipMemcpy(h_indices_buffer.data(), d_output_indices_,
sizeof(*d_output_indices_) * num_missing_keys, hipMemcpyDeviceToHost))
CUDA_CHECK(cudaMemcpy(h_positions_buffer.data(), d_missing_positions_,
sizeof(*d_missing_positions_) * num_missing_keys, cudaMemcpyDeviceToHost)) CUDA_CHECK(hipMemcpy(h_positions_buffer.data(), d_missing_positions_,
sizeof(*d_missing_positions_) * num_missing_keys, hipMemcpyDeviceToHost))
for (int j = 0; j < num_missing_keys; j++) {
if (h_indices_buffer[j] != std::numeric_limits<index_type>::max()) { for (int j = 0; j < num_missing_keys; j++) {
memcpy(h_vectors_.data() + h_indices_buffer[j] * vec_size_, if (h_indices_buffer[j] != std::numeric_limits<index_type>::max()) {
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_, memcpy(h_vectors_.data() + h_indices_buffer[j] * vec_size_,
sizeof(*h_vectors) * vec_size_); h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
} else { sizeof(*h_vectors) * vec_size_);
size_t prev_idx = h_vectors_.size() / vec_size_; } else {
h_final_missing_items_.emplace(h_keys_buffer[j], prev_idx); size_t prev_idx = h_vectors_.size() / vec_size_;
h_vectors_.resize(h_vectors_.size() + vec_size_); h_final_missing_items_.emplace(h_keys_buffer[j], prev_idx);
memcpy(h_vectors_.data() + prev_idx * vec_size_, h_vectors_.resize(h_vectors_.size() + vec_size_);
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_, memcpy(h_vectors_.data() + prev_idx * vec_size_,
sizeof(*h_vectors) * vec_size_); h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
} sizeof(*h_vectors) * vec_size_);
} }
} }
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_))); }
} CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::query(const key_type* d_keys, const int num_keys, template <typename key_type, typename index_type, typename vec_type>
vec_type* d_vectors, cudaStream_t stream) { void UvmTable<key_type, index_type, vec_type>::query(const key_type* d_keys, const int num_keys,
if (!num_keys) return; vec_type* d_vectors, hipStream_t stream) {
CUDA_CHECK(cudaEventRecord(query_event_, stream)); if (!num_keys) return;
CUDA_CHECK(cudaStreamWaitEvent(query_stream_, query_event_)); CUDA_CHECK(hipEventRecord(query_event_, stream));
CUDA_CHECK(hipStreamWaitEvent(query_stream_, query_event_));
static_assert(num_buffers_ >= 2);
device_table_.query(d_keys, num_keys, d_output_indices_, d_missing_keys_, d_missing_positions_, static_assert(num_buffers_ >= 2);
d_missing_count_, query_stream_); device_table_.query(d_keys, num_keys, d_output_indices_, d_missing_keys_, d_missing_positions_,
d_missing_count_, query_stream_);
CUDA_CHECK(cudaEventRecord(query_event_, query_stream_));
CUDA_CHECK(cudaStreamWaitEvent(cpy_streams_[0], query_event_)); CUDA_CHECK(hipEventRecord(query_event_, query_stream_));
CUDA_CHECK(hipStreamWaitEvent(cpy_streams_[0], query_event_));
int num_missing_keys;
CUDA_CHECK(cudaMemcpyAsync(&num_missing_keys, d_missing_count_, sizeof(*d_missing_count_), int num_missing_keys;
cudaMemcpyDeviceToHost, cpy_streams_[0])); CUDA_CHECK(hipMemcpyAsync(&num_missing_keys, d_missing_count_, sizeof(*d_missing_count_),
hipMemcpyDeviceToHost, cpy_streams_[0]));
host_table_.query(d_missing_keys_, d_missing_count_, d_output_host_indices_, query_stream_);
CUDA_CHECK(cudaStreamSynchronize(cpy_streams_[0])); host_table_.query(d_missing_keys_, d_missing_count_, d_output_host_indices_, query_stream_);
CUDA_CHECK(hipStreamSynchronize(cpy_streams_[0]));
CUDA_CHECK(cudaMemsetAsync(d_missing_count_, 0, sizeof(*d_missing_count_), query_stream_));
CUDA_CHECK(hipMemsetAsync(d_missing_count_, 0, sizeof(*d_missing_count_), query_stream_));
CUDA_CHECK(cudaMemcpyAsync(h_output_host_indices_, d_output_host_indices_,
sizeof(index_type) * num_missing_keys, cudaMemcpyDeviceToHost, CUDA_CHECK(hipMemcpyAsync(h_output_host_indices_, d_output_host_indices_,
query_stream_)); sizeof(index_type) * num_missing_keys, hipMemcpyDeviceToHost,
query_stream_));
CUDA_CHECK(cudaMemcpyAsync(h_missing_keys_, d_missing_keys_, sizeof(key_type) * num_missing_keys,
cudaMemcpyDeviceToHost, cpy_streams_[0])); CUDA_CHECK(hipMemcpyAsync(h_missing_keys_, d_missing_keys_, sizeof(key_type) * num_missing_keys,
hipMemcpyDeviceToHost, cpy_streams_[0]));
read_vectors_kernel<<<(num_keys - 1) / block_size + 1, block_size, 0, cpy_streams_[1]>>>(
d_output_indices_, num_keys, d_vectors_, vec_size_, d_vectors); hipLaunchKernelGGL(( read_vectors_kernel), dim3((num_keys - 1) / block_size + 1), dim3(block_size), 0, cpy_streams_[1],
d_output_indices_, num_keys, d_vectors_, vec_size_, d_vectors);
CUDA_CHECK(cudaStreamSynchronize(query_stream_));
CUDA_CHECK(cudaStreamSynchronize(cpy_streams_[0])); CUDA_CHECK(hipStreamSynchronize(query_stream_));
CUDA_CHECK(hipStreamSynchronize(cpy_streams_[0]));
int num_keys_per_buffer = ceil(1.0 * num_missing_keys / num_buffers_);
int num_keys_per_buffer = ceil(1.0 * num_missing_keys / num_buffers_);
for (int buffer_num = 0; buffer_num < num_buffers_; buffer_num++) {
int num_keys_this_buffer = buffer_num != num_buffers_ - 1 for (int buffer_num = 0; buffer_num < num_buffers_; buffer_num++) {
? num_keys_per_buffer int num_keys_this_buffer = buffer_num != num_buffers_ - 1
: num_missing_keys - num_keys_per_buffer * buffer_num; ? num_keys_per_buffer
if (!num_keys_this_buffer) break; : num_missing_keys - num_keys_per_buffer * buffer_num;
#pragma omp parallel for num_threads(8) if (!num_keys_this_buffer) break;
for (size_t i = 0; i < static_cast<size_t>(num_keys_this_buffer); i++) { #pragma omp parallel for num_threads(8)
size_t idx_key = buffer_num * num_keys_per_buffer + i; for (size_t i = 0; i < static_cast<size_t>(num_keys_this_buffer); i++) {
index_type index = h_output_host_indices_[idx_key]; size_t idx_key = buffer_num * num_keys_per_buffer + i;
if (index == std::numeric_limits<index_type>::max()) { index_type index = h_output_host_indices_[idx_key];
key_type key = h_missing_keys_[idx_key]; if (index == std::numeric_limits<index_type>::max()) {
auto iterator = h_final_missing_items_.find(key); key_type key = h_missing_keys_[idx_key];
if (iterator != h_final_missing_items_.end()) { auto iterator = h_final_missing_items_.find(key);
index = iterator->second; if (iterator != h_final_missing_items_.end()) {
} index = iterator->second;
} }
if (index != std::numeric_limits<index_type>::max()) { }
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, h_vectors_.data() + index * vec_size_, if (index != std::numeric_limits<index_type>::max()) {
sizeof(vec_type) * vec_size_); memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, h_vectors_.data() + index * vec_size_,
} else { sizeof(vec_type) * vec_size_);
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, default_vector_.data(), } else {
sizeof(vec_type) * vec_size_); memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, default_vector_.data(),
} sizeof(vec_type) * vec_size_);
} }
CUDA_CHECK(cudaMemcpyAsync(d_cpy_buffers_[buffer_num], h_cpy_buffers_[buffer_num], }
sizeof(vec_type) * num_keys_this_buffer * vec_size_, CUDA_CHECK(hipMemcpyAsync(d_cpy_buffers_[buffer_num], h_cpy_buffers_[buffer_num],
cudaMemcpyHostToDevice, cpy_streams_[buffer_num])); sizeof(vec_type) * num_keys_this_buffer * vec_size_,
hipMemcpyHostToDevice, cpy_streams_[buffer_num]));
distribute_vectors_kernel<<<(num_keys_this_buffer - 1) / block_size + 1, block_size, 0,
cpy_streams_[buffer_num]>>>( hipLaunchKernelGGL(( distribute_vectors_kernel), dim3((num_keys_this_buffer - 1) / block_size + 1), dim3(block_size), 0,
d_missing_positions_ + buffer_num * num_keys_per_buffer, num_keys_this_buffer, cpy_streams_[buffer_num],
d_cpy_buffers_[buffer_num], vec_size_, d_vectors); d_missing_positions_ + buffer_num * num_keys_per_buffer, num_keys_this_buffer,
} d_cpy_buffers_[buffer_num], vec_size_, d_vectors);
}
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(cudaEventRecord(cpy_events_[i], cpy_streams_[i])); for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(cudaStreamWaitEvent(stream, cpy_events_[i])); CUDA_CHECK(hipEventRecord(cpy_events_[i], cpy_streams_[i]));
} CUDA_CHECK(hipStreamWaitEvent(stream, cpy_events_[i]));
} }
}
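The query path above spreads work across query_stream_ and the cpy_streams_ and orders it with events rather than device-wide synchronization. The bare synchronization pattern, reduced to a sketch with illustrative names:

#include <hip/hip_runtime.h>

void chain_streams(hipStream_t producer, hipStream_t consumer) {
  hipEvent_t done;
  (void)hipEventCreate(&done);
  // ... enqueue producer work here ...
  (void)hipEventRecord(done, producer);         // mark the completion point on the producer stream
  (void)hipStreamWaitEvent(consumer, done, 0);  // consumer work enqueued after this call waits for it
  // ... enqueue dependent consumer work here ...
  (void)hipStreamSynchronize(consumer);
  (void)hipEventDestroy(done);
}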
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::clear(cudaStream_t stream) { template <typename key_type, typename index_type, typename vec_type>
device_table_.clear(stream); void UvmTable<key_type, index_type, vec_type>::clear(hipStream_t stream) {
host_table_.clear(stream); device_table_.clear(stream);
} host_table_.clear(stream);
}
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::~UvmTable() { template <typename key_type, typename index_type, typename vec_type>
CUDA_CHECK(cudaFree(d_keys_buffer_)); UvmTable<key_type, index_type, vec_type>::~UvmTable() {
CUDA_CHECK(cudaFree(d_vectors_buffer_)); CUDA_CHECK(hipFree(d_keys_buffer_));
CUDA_CHECK(cudaFree(d_vectors_)); CUDA_CHECK(hipFree(d_vectors_buffer_));
CUDA_CHECK(hipFree(d_vectors_));
CUDA_CHECK(cudaFree(d_output_indices_));
CUDA_CHECK(cudaFree(d_output_host_indices_)); CUDA_CHECK(hipFree(d_output_indices_));
CUDA_CHECK(cudaFreeHost(h_output_host_indices_)); CUDA_CHECK(hipFree(d_output_host_indices_));
CUDA_CHECK(hipHostFree(h_output_host_indices_));
CUDA_CHECK(cudaFree(d_missing_keys_));
CUDA_CHECK(cudaFree(d_missing_positions_)); CUDA_CHECK(hipFree(d_missing_keys_));
CUDA_CHECK(cudaFree(d_missing_count_)); CUDA_CHECK(hipFree(d_missing_positions_));
CUDA_CHECK(cudaFreeHost(h_missing_keys_)); CUDA_CHECK(hipFree(d_missing_count_));
CUDA_CHECK(hipHostFree(h_missing_keys_));
CUDA_CHECK(cudaStreamDestroy(query_stream_));
CUDA_CHECK(cudaEventDestroy(query_event_)); CUDA_CHECK(hipStreamDestroy(query_stream_));
CUDA_CHECK(hipEventDestroy(query_event_));
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(cudaFreeHost(h_cpy_buffers_[i])); for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(cudaFree(d_cpy_buffers_[i])); CUDA_CHECK(hipHostFree(h_cpy_buffers_[i]));
CUDA_CHECK(cudaStreamDestroy(cpy_streams_[i])); CUDA_CHECK(hipFree(d_cpy_buffers_[i]));
CUDA_CHECK(cudaEventDestroy(cpy_events_[i])); CUDA_CHECK(hipStreamDestroy(cpy_streams_[i]));
} CUDA_CHECK(hipEventDestroy(cpy_events_[i]));
} }
}
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::HashBlock(size_t expected_capacity, int set_size, int batch_size) template <typename key_type, typename index_type>
: max_set_size_(set_size), batch_size_(batch_size) { HashBlock<key_type, index_type>::HashBlock(size_t expected_capacity, int set_size, int batch_size)
if (expected_capacity) { : max_set_size_(set_size), batch_size_(batch_size) {
num_sets = (expected_capacity - 1) / set_size + 1; if (expected_capacity) {
} else { num_sets = (expected_capacity - 1) / set_size + 1;
num_sets = 10000; } else {
} num_sets = 10000;
capacity = num_sets * set_size; }
CUDA_CHECK(cudaMalloc(&keys, sizeof(*keys) * capacity)); capacity = num_sets * set_size;
CUDA_CHECK(cudaMalloc(&set_sizes_, sizeof(*set_sizes_) * num_sets)); CUDA_CHECK(hipMalloc(&keys, sizeof(*keys) * capacity));
CUDA_CHECK(cudaMemset(set_sizes_, 0, sizeof(*set_sizes_) * num_sets)); CUDA_CHECK(hipMalloc(&set_sizes_, sizeof(*set_sizes_) * num_sets));
} CUDA_CHECK(hipMemset(set_sizes_, 0, sizeof(*set_sizes_) * num_sets));
}
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::~HashBlock() { template <typename key_type, typename index_type>
CUDA_CHECK(cudaFree(keys)); HashBlock<key_type, index_type>::~HashBlock() {
CUDA_CHECK(cudaFree(set_sizes_)); CUDA_CHECK(hipFree(keys));
} CUDA_CHECK(hipFree(set_sizes_));
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, const size_t num_keys, template <typename key_type, typename index_type>
index_type* output_indices, key_type* missing_keys, void HashBlock<key_type, index_type>::query(const key_type* query_keys, const size_t num_keys,
int* missing_positions, int* num_missing_keys, index_type* output_indices, key_type* missing_keys,
cudaStream_t stream) { int* missing_positions, int* num_missing_keys,
if (num_keys == 0) { hipStream_t stream) {
return; if (num_keys == 0) {
} return;
size_t num_batches = (num_keys - 1) / batch_size_ + 1; }
for (size_t i = 0; i < num_batches; i++) { size_t num_batches = (num_keys - 1) / batch_size_ + 1;
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_; for (size_t i = 0; i < num_batches; i++) {
hash_query_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, stream>>>( size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
query_keys, this_batch_size, keys, num_sets, max_set_size_, output_indices, missing_keys, hipLaunchKernelGGL(( hash_query_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, stream,
missing_positions, num_missing_keys); query_keys, this_batch_size, keys, num_sets, max_set_size_, output_indices, missing_keys,
} missing_positions, num_missing_keys);
} }
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, int* num_keys, template <typename key_type, typename index_type>
index_type* output_indices, cudaStream_t stream) { void HashBlock<key_type, index_type>::query(const key_type* query_keys, int* num_keys,
hash_query_kernel<<<128, 64, 0, stream>>>(query_keys, num_keys, keys, num_sets, max_set_size_, index_type* output_indices, hipStream_t stream) {
output_indices); hipLaunchKernelGGL(( hash_query_kernel), dim3(128), dim3(64), 0, stream, query_keys, num_keys, keys, num_sets, max_set_size_,
} output_indices);
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::add(const key_type* new_keys, const size_t num_keys, template <typename key_type, typename index_type>
key_type* missing_keys, int* num_missing_keys, void HashBlock<key_type, index_type>::add(const key_type* new_keys, const size_t num_keys,
cudaStream_t stream) { key_type* missing_keys, int* num_missing_keys,
if (num_keys == 0) { hipStream_t stream) {
return; if (num_keys == 0) {
} return;
size_t num_batches = (num_keys - 1) / batch_size_ + 1; }
for (size_t i = 0; i < num_batches; i++) { size_t num_batches = (num_keys - 1) / batch_size_ + 1;
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_; for (size_t i = 0; i < num_batches; i++) {
hash_add_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, stream>>>( size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
new_keys + i * this_batch_size, this_batch_size, keys, num_sets, set_sizes_, max_set_size_, hipLaunchKernelGGL(( hash_add_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, stream,
missing_keys, num_missing_keys); new_keys + i * this_batch_size, this_batch_size, keys, num_sets, set_sizes_, max_set_size_,
} missing_keys, num_missing_keys);
} }
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::clear(cudaStream_t stream) { template <typename key_type, typename index_type>
CUDA_CHECK(cudaMemsetAsync(set_sizes_, 0, sizeof(*set_sizes_) * num_sets, stream)); void HashBlock<key_type, index_type>::clear(hipStream_t stream) {
} CUDA_CHECK(hipMemsetAsync(set_sizes_, 0, sizeof(*set_sizes_) * num_sets, stream));
}
template class HashBlock<int, size_t>;
template class HashBlock<int64_t, size_t>; template class HashBlock<int, size_t>;
template class HashBlock<size_t, size_t>; template class HashBlock<int64_t, size_t>;
template class HashBlock<unsigned int, size_t>; template class HashBlock<size_t, size_t>;
template class HashBlock<long long, size_t>; template class HashBlock<unsigned int, size_t>;
template class HashBlock<long long, size_t>;
template class UvmTable<int, size_t>;
template class UvmTable<int64_t, size_t>; template class UvmTable<int, size_t>;
template class UvmTable<size_t, size_t>; template class UvmTable<int64_t, size_t>;
template class UvmTable<unsigned int, size_t>; template class UvmTable<size_t, size_t>;
template class UvmTable<long long, size_t>; template class UvmTable<unsigned int, size_t>;
template class UvmTable<long long, size_t>;
} // namespace gpu_cache } // namespace gpu_cache
\ No newline at end of file
...@@ -15,14 +15,14 @@ ...@@ -15,14 +15,14 @@
cmake_minimum_required(VERSION 3.8) cmake_minimum_required(VERSION 3.8)
file(GLOB gpu_cache_test_src file(GLOB gpu_cache_test_src
cache_op_sol_test.cu cache_op_sol_test.hip
../../HugeCTR/src/hps/embedding_cache_gpu.cu ../../HugeCTR/src/hps/embedding_cache_gpu.hip
) )
add_executable(cache_op_sol_test ${gpu_cache_test_src}) add_executable(cache_op_sol_test ${gpu_cache_test_src})
target_compile_features(cache_op_sol_test PUBLIC cxx_std_17) target_compile_features(cache_op_sol_test PUBLIC cxx_std_17)
target_link_libraries(cache_op_sol_test PUBLIC gpu_cache) target_link_libraries(cache_op_sol_test PUBLIC gpu_cache)
target_link_libraries(cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX)
set_target_properties(cache_op_sol_test PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_target_properties(cache_op_sol_test PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(cache_op_sol_test PROPERTIES CUDA_ARCHITECTURES OFF) set_target_properties(cache_op_sol_test PROPERTIES HIP_ARCHITECTURES OFF)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/* /*
* Copyright (c) 2023, NVIDIA CORPORATION. * Copyright (c) 2023, NVIDIA CORPORATION.
* *
...@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_ ...@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_
template <typename T> template <typename T>
bool is_near(T a, T b) { bool is_near(T a, T b) {
double diff = abs(a - b); double diff = abs(a - b);
bool ret = diff <= std::min(a, b) * 1e-6; bool ret = diff <= std::min(a, b) * 1e-6;
if (!ret) { if (!ret) {
std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl; std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl;
} }
...@@ -224,7 +226,7 @@ int main(int argc, char** argv) { ...@@ -224,7 +226,7 @@ int main(int argc, char** argv) {
const size_t cache_type = atoi(argv[7]); const size_t cache_type = atoi(argv[7]);
// Since cache is designed for single-gpu, all threads just use GPU 0 // Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK(cudaSetDevice(0)); CUDA_CHECK(hipSetDevice(0));
// Host side buffers shared between threads // Host side buffers shared between threads
key_type* h_keys; // Buffer holding all keys in embedding table key_type* h_keys; // Buffer holding all keys in embedding table
...@@ -302,7 +304,7 @@ int main(int argc, char** argv) { ...@@ -302,7 +304,7 @@ int main(int argc, char** argv) {
int thread_id = omp_get_thread_num(); int thread_id = omp_get_thread_num();
printf("Worker %d starts testing cache.\n", thread_id); printf("Worker %d starts testing cache.\n", thread_id);
// Since cache is designed for single-gpu, all threads just use GPU 0 // Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK(cudaSetDevice(0)); CUDA_CHECK(hipSetDevice(0));
// Thread-private host side buffers // Thread-private host side buffers
size_t* h_query_keys_index; // Buffer holding index for keys to be queried size_t* h_query_keys_index; // Buffer holding index for keys to be queried
...@@ -324,32 +326,32 @@ int main(int argc, char** argv) { ...@@ -324,32 +326,32 @@ int main(int argc, char** argv) {
// host-only buffers placed in normal host memory // host-only buffers placed in normal host memory
h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t)); h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t));
// host-device interactive buffers placed in pinned memory // host-device interactive buffers placed in pinned memory
CUDA_CHECK(cudaHostAlloc((void**)&h_query_keys, query_length * sizeof(key_type), CUDA_CHECK(hipHostMalloc((void**)&h_query_keys, query_length * sizeof(key_type),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_vals_retrieved, CUDA_CHECK(hipHostMalloc((void**)&h_vals_retrieved,
query_length * embedding_vec_size * sizeof(float), query_length * embedding_vec_size * sizeof(float),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_keys, query_length * sizeof(key_type), CUDA_CHECK(hipHostMalloc((void**)&h_missing_keys, query_length * sizeof(key_type),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_vals, CUDA_CHECK(hipHostMalloc((void**)&h_missing_vals,
query_length * embedding_vec_size * sizeof(float), query_length * embedding_vec_size * sizeof(float),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_index, query_length * sizeof(uint64_t), CUDA_CHECK(hipHostMalloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
cudaHostAllocPortable)); hipHostMallocPortable));
// Allocate device side buffers // Allocate device side buffers
CUDA_CHECK(cudaMalloc((void**)&d_query_keys, query_length * sizeof(key_type))); CUDA_CHECK(hipMalloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK( CUDA_CHECK(
cudaMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float))); hipMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_keys, query_length * sizeof(key_type))); CUDA_CHECK(hipMalloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
CUDA_CHECK( CUDA_CHECK(
cudaMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float))); hipMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t))); CUDA_CHECK(hipMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t))); CUDA_CHECK(hipMalloc((void**)&d_missing_len, sizeof(size_t)));
// Thread-private CUDA stream, all threads just use the #0 device // Thread-private CUDA stream, all threads just use the #0 device
cudaStream_t stream; hipStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream)); CUDA_CHECK(hipStreamCreate(&stream));
// Timing variables // Timing variables
double time_1; double time_1;
...@@ -382,33 +384,33 @@ int main(int argc, char** argv) { ...@@ -382,33 +384,33 @@ int main(int argc, char** argv) {
std::cout << std::endl; std::cout << std::endl;
// Copy the keys to GPU memory // Copy the keys to GPU memory
CUDA_CHECK(cudaMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type), CUDA_CHECK(hipMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
cudaMemcpyHostToDevice, stream)); hipMemcpyHostToDevice, stream));
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Record time // Record time
time_1 = W_time(); time_1 = W_time();
// Get pairs from hashtable // Get pairs from hashtable
cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys, cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream); d_missing_len, stream);
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time // Elapsed wall time
time_2 = W_time() - time_1; time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n", printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n",
thread_id, i, time_2); thread_id, i, time_2);
// Copy the data back to host // Copy the data back to host
CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved, CUDA_CHECK(hipMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
query_length * embedding_vec_size * sizeof(float), query_length * embedding_vec_size * sizeof(float),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t), CUDA_CHECK(hipMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type), CUDA_CHECK(hipMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t), CUDA_CHECK(hipMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i, printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i,
h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f)); h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f));
...@@ -433,13 +435,13 @@ int main(int argc, char** argv) { ...@@ -433,13 +435,13 @@ int main(int argc, char** argv) {
thread_id, i, time_2); thread_id, i, time_2);
// Copy the missing value to device // Copy the missing value to device
CUDA_CHECK(cudaMemcpyAsync(d_missing_vals, h_missing_vals, CUDA_CHECK(hipMemcpyAsync(d_missing_vals, h_missing_vals,
query_length * embedding_vec_size * sizeof(float), query_length * embedding_vec_size * sizeof(float),
cudaMemcpyHostToDevice, stream)); hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_vals_retrieved, h_vals_retrieved, CUDA_CHECK(hipMemcpyAsync(d_vals_retrieved, h_vals_retrieved,
query_length * embedding_vec_size * sizeof(float), query_length * embedding_vec_size * sizeof(float),
cudaMemcpyHostToDevice, stream)); hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Record time // Record time
time_1 = W_time(); time_1 = W_time();
...@@ -449,7 +451,7 @@ int main(int argc, char** argv) { ...@@ -449,7 +451,7 @@ int main(int argc, char** argv) {
else else
cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream); cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream);
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time // Elapsed wall time
time_2 = W_time() - time_1; time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n", printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n",
...@@ -466,20 +468,20 @@ int main(int argc, char** argv) { ...@@ -466,20 +468,20 @@ int main(int argc, char** argv) {
printf("Worker %d : All Finished!\n", thread_id); printf("Worker %d : All Finished!\n", thread_id);
// Clean-up // Clean-up
cudaStreamDestroy(stream); hipStreamDestroy(stream);
free(h_query_keys_index); free(h_query_keys_index);
CUDA_CHECK(cudaFreeHost(h_query_keys)); CUDA_CHECK(hipHostFree(h_query_keys));
CUDA_CHECK(cudaFreeHost(h_vals_retrieved)); CUDA_CHECK(hipHostFree(h_vals_retrieved));
CUDA_CHECK(cudaFreeHost(h_missing_keys)); CUDA_CHECK(hipHostFree(h_missing_keys));
CUDA_CHECK(cudaFreeHost(h_missing_vals)); CUDA_CHECK(hipHostFree(h_missing_vals));
CUDA_CHECK(cudaFreeHost(h_missing_index)); CUDA_CHECK(hipHostFree(h_missing_index));
CUDA_CHECK(cudaFree(d_query_keys)); CUDA_CHECK(hipFree(d_query_keys));
CUDA_CHECK(cudaFree(d_vals_retrieved)); CUDA_CHECK(hipFree(d_vals_retrieved));
CUDA_CHECK(cudaFree(d_missing_keys)); CUDA_CHECK(hipFree(d_missing_keys));
CUDA_CHECK(cudaFree(d_missing_vals)); CUDA_CHECK(hipFree(d_missing_vals));
CUDA_CHECK(cudaFree(d_missing_index)); CUDA_CHECK(hipFree(d_missing_index));
CUDA_CHECK(cudaFree(d_missing_len)); CUDA_CHECK(hipFree(d_missing_len));
} }
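The worker function above exercises a query-then-replace cycle against the cache. A condensed sketch of that cycle through the gpu_cache_api interface is shown below; the function name, the include, and the step that fetches missing embeddings from a backing store are placeholders, not part of the test:

#include <hip/hip_runtime.h>
// #include "nv_gpu_cache.hpp"  // header providing gpu_cache::gpu_cache_api; exact name may differ

// Hypothetical usage sketch, not the test code itself.
template <typename key_type>
void query_then_replace(gpu_cache::gpu_cache_api<key_type>* cache, const key_type* d_keys,
                        size_t len, float* d_vals, uint64_t* d_missing_index,
                        key_type* d_missing_keys, size_t* d_missing_len, hipStream_t stream) {
  // 1. Look up all keys: hits land in d_vals, misses are reported via the d_missing_* buffers.
  cache->Query(d_keys, len, d_vals, d_missing_index, d_missing_keys, d_missing_len, stream);
  CUDA_CHECK(hipStreamSynchronize(stream));
  // 2. (Placeholder) fetch the embeddings of the missing keys from the backing store and
  //    scatter them into d_vals at the positions recorded in d_missing_index.
  // 3. Write the completed batch back so the cache keeps the most recently used entries.
  cache->Replace(d_keys, len, d_vals, stream);
  CUDA_CHECK(hipStreamSynchronize(stream));
}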
// 1st test Clean-up // 1st test Clean-up
...@@ -547,57 +549,57 @@ int main(int argc, char** argv) { ...@@ -547,57 +549,57 @@ int main(int argc, char** argv) {
key_type* d_missing_keys; key_type* d_missing_keys;
size_t* d_missing_len; size_t* d_missing_len;
CUDA_CHECK(cudaHostAlloc((void**)&h_insert_keys, CUDA_CHECK(hipHostMalloc((void**)&h_insert_keys,
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type), SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_insert_vals, CUDA_CHECK(hipHostMalloc((void**)&h_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float), SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_dump_keys, CUDA_CHECK(hipHostMalloc((void**)&h_dump_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type), SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc( CUDA_CHECK(hipHostMalloc(
(void**)&h_vals_retrieved, (void**)&h_vals_retrieved,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float), SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_acc_keys, CUDA_CHECK(hipHostMalloc((void**)&h_acc_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type), SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cudaHostAllocPortable)); hipHostMallocPortable));
CUDA_CHECK(cudaMalloc((void**)&d_keys, CUDA_CHECK(hipMalloc((void**)&d_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type))); SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * CUDA_CHECK(hipMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
embedding_vec_size * sizeof(float))); embedding_vec_size * sizeof(float)));
CUDA_CHECK( CUDA_CHECK(
cudaMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type))); hipMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_insert_vals, CUDA_CHECK(hipMalloc((void**)&d_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float))); SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_dump_keys, CUDA_CHECK(hipMalloc((void**)&d_dump_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type))); SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc( CUDA_CHECK(hipMalloc(
(void**)&d_vals_retrieved, (void**)&d_vals_retrieved,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float))); SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_dump_counter, sizeof(size_t))); CUDA_CHECK(hipMalloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_index, CUDA_CHECK(hipMalloc((void**)&d_missing_index,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t))); SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_keys, CUDA_CHECK(hipMalloc((void**)&d_missing_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type))); SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t))); CUDA_CHECK(hipMalloc((void**)&d_missing_len, sizeof(size_t)));
// CUDA stream // CUDA stream
cudaStream_t stream; hipStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream)); CUDA_CHECK(hipStreamCreate(&stream));
// Copy all keys and values from host to device // Copy all keys and values from host to device
CUDA_CHECK(cudaMemcpyAsync( CUDA_CHECK(hipMemcpyAsync(
d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type), d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cudaMemcpyHostToDevice, stream)); hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync( CUDA_CHECK(hipMemcpyAsync(
d_vals, h_new_vals, d_vals, h_new_vals,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float), SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cudaMemcpyHostToDevice, stream)); hipMemcpyHostToDevice, stream));
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Each time insert 1 slab per slabset into the cache and check result // Each time insert 1 slab per slabset into the cache and check result
for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) { for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) {
...@@ -615,17 +617,17 @@ int main(int argc, char** argv) { ...@@ -615,17 +617,17 @@ int main(int argc, char** argv) {
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)); SLAB_SIZE * cache_capacity_in_set * sizeof(key_type));
// Copy the <k,v> pairs from host to device // Copy the <k,v> pairs from host to device
CUDA_CHECK(cudaMemcpyAsync(d_insert_keys, h_insert_keys, CUDA_CHECK(hipMemcpyAsync(d_insert_keys, h_insert_keys,
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type), SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
cudaMemcpyHostToDevice, stream)); hipMemcpyHostToDevice, stream));
CUDA_CHECK( CUDA_CHECK(
cudaMemcpyAsync(d_insert_vals, h_insert_vals, hipMemcpyAsync(d_insert_vals, h_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float), SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cudaMemcpyHostToDevice, stream)); hipMemcpyHostToDevice, stream));
// Insert the <k,v> pairs into the cache // Insert the <k,v> pairs into the cache
cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream); cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream);
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Record time // Record time
time_a = W_time(); time_a = W_time();
...@@ -633,7 +635,7 @@ int main(int argc, char** argv) { ...@@ -633,7 +635,7 @@ int main(int argc, char** argv) {
cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream, cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream,
SLAB_SIZE); SLAB_SIZE);
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time // Elapsed wall time
time_b = W_time() - time_a; time_b = W_time() - time_a;
printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b); printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b);
...@@ -644,31 +646,31 @@ int main(int argc, char** argv) { ...@@ -644,31 +646,31 @@ int main(int argc, char** argv) {
// Dump the keys from the cache // Dump the keys from the cache
cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream); cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time // Elapsed wall time
time_b = W_time() - time_a; time_b = W_time() - time_a;
printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b); printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b);
// Copy the dump counter from device to host // Copy the dump counter from device to host
CUDA_CHECK(cudaMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t), CUDA_CHECK(hipMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Check the dump counter // Check the dump counter
assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1)); assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1));
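For example, with hypothetical values SLAB_SIZE = 32 and cache_capacity_in_set = 1024, each round inserts 32 * 1024 = 32768 keys (one slab per slabset), so the expected dump counter is 32768 after round i = 0, 65536 after round i = 1, and so on.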
// Query all the dumped keys from the cache // Query all the dumped keys from the cache
cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys, cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream); d_missing_len, stream);
// Copy result from device to host // Copy result from device to host
CUDA_CHECK(cudaMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type), CUDA_CHECK(hipMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved, CUDA_CHECK(hipMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
h_dump_counter * embedding_vec_size * sizeof(float), h_dump_counter * embedding_vec_size * sizeof(float),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t), CUDA_CHECK(hipMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
cudaMemcpyDeviceToHost, stream)); hipMemcpyDeviceToHost, stream));
// Wait for stream to complete // Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(hipStreamSynchronize(stream));
// Check result // Check result
assert(h_missing_len == 0); assert(h_missing_len == 0);
compare_key(h_dump_keys, h_acc_keys, h_dump_counter); compare_key(h_dump_keys, h_acc_keys, h_dump_counter);
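compare_key is a verification helper defined earlier in this test; a plausible sketch, assuming it treats the two key arrays as order-insensitive sets of equal length:

#include <algorithm>
#include <cassert>

// Hypothetical sketch; the real compare_key helper is defined elsewhere in the test source.
template <typename key_type>
void compare_key(key_type* dumped, key_type* expected, size_t len) {
  // Dump makes no ordering guarantee, so sort both sides before the element-wise check.
  std::sort(dumped, dumped + len);
  std::sort(expected, expected + len);
  for (size_t i = 0; i < len; ++i) assert(dumped[i] == expected[i]);
}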
...@@ -679,27 +681,27 @@ int main(int argc, char** argv) { ...@@ -679,27 +681,27 @@ int main(int argc, char** argv) {
printf("Update and Dump API test all finished!\n"); printf("Update and Dump API test all finished!\n");
// 2nd test clean-up // 2nd test clean-up
CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(hipStreamDestroy(stream));
free(h_keys); free(h_keys);
free(h_vals); free(h_vals);
free(h_new_vals); free(h_new_vals);
CUDA_CHECK(cudaFreeHost(h_insert_keys)); CUDA_CHECK(hipHostFree(h_insert_keys));
CUDA_CHECK(cudaFreeHost(h_insert_vals)); CUDA_CHECK(hipHostFree(h_insert_vals));
CUDA_CHECK(cudaFreeHost(h_dump_keys)); CUDA_CHECK(hipHostFree(h_dump_keys));
CUDA_CHECK(cudaFreeHost(h_vals_retrieved)); CUDA_CHECK(hipHostFree(h_vals_retrieved));
CUDA_CHECK(cudaFreeHost(h_acc_keys)); CUDA_CHECK(hipHostFree(h_acc_keys));
CUDA_CHECK(cudaFree(d_keys)); CUDA_CHECK(hipFree(d_keys));
CUDA_CHECK(cudaFree(d_vals)); CUDA_CHECK(hipFree(d_vals));
CUDA_CHECK(cudaFree(d_insert_keys)); CUDA_CHECK(hipFree(d_insert_keys));
CUDA_CHECK(cudaFree(d_insert_vals)); CUDA_CHECK(hipFree(d_insert_vals));
CUDA_CHECK(cudaFree(d_dump_keys)); CUDA_CHECK(hipFree(d_dump_keys));
CUDA_CHECK(cudaFree(d_vals_retrieved)); CUDA_CHECK(hipFree(d_vals_retrieved));
CUDA_CHECK(cudaFree(d_dump_counter)); CUDA_CHECK(hipFree(d_dump_counter));
CUDA_CHECK(cudaFree(d_missing_index)); CUDA_CHECK(hipFree(d_missing_index));
CUDA_CHECK(cudaFree(d_missing_keys)); CUDA_CHECK(hipFree(d_missing_keys));
CUDA_CHECK(cudaFree(d_missing_len)); CUDA_CHECK(hipFree(d_missing_len));
delete cache; delete cache;
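The second test boils down to an update-then-dump verification round. A condensed sketch of that pattern, assuming d_dump_keys is large enough to hold every resident key:

#include <hip/hip_runtime.h>

// Hypothetical usage sketch, not the test code itself.
template <typename key_type>
void update_then_dump(gpu_cache::gpu_cache_api<key_type>* cache, const key_type* d_keys,
                      size_t num_keys, const float* d_vals, key_type* d_dump_keys,
                      size_t* d_dump_counter, size_t cache_capacity_in_set, hipStream_t stream) {
  // Refresh the values of every key that is currently resident in the cache.
  cache->Update(d_keys, num_keys, d_vals, stream);
  // Dump the keys of all slabsets [0, cache_capacity_in_set) and read back the counter.
  cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
  size_t h_counter = 0;
  CUDA_CHECK(hipMemcpyAsync(&h_counter, d_dump_counter, sizeof(size_t),
                            hipMemcpyDeviceToHost, stream));
  CUDA_CHECK(hipStreamSynchronize(stream));
  // Querying the dumped keys back should produce zero misses (verification omitted here).
}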