Commit 1d28bf8b authored by sangwzh

update third_party/HugeCTR/gpu_cache code to HIP

parent f119ea7c
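The files below were converted with hipify, which applies a one-to-one rename of the CUDA runtime API onto HIP (cudaStream_t to hipStream_t, cudaMalloc to hipMalloc, and so on). A minimal sketch of that mapping, with an invented buffer and stream, illustrative only and not part of the committed files:

#include <hip/hip_runtime.h>
#include <vector>

int main() {
  const size_t n = 256;
  std::vector<float> h_buf(n, 1.0f);
  float* d_buf = nullptr;
  hipStream_t stream;                                   // was: cudaStream_t
  (void)hipStreamCreate(&stream);                       // was: cudaStreamCreate
  (void)hipMalloc(&d_buf, n * sizeof(float));           // was: cudaMalloc
  (void)hipMemcpyAsync(d_buf, h_buf.data(), n * sizeof(float),
                       hipMemcpyHostToDevice, stream);  // was: cudaMemcpyAsync / cudaMemcpyHostToDevice
  (void)hipStreamSynchronize(stream);                   // was: cudaStreamSynchronize
  (void)hipFree(d_buf);                                 // was: cudaFree
  (void)hipStreamDestroy(stream);                       // was: cudaStreamDestroy
  return 0;
}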
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
......@@ -31,22 +33,22 @@ class gpu_cache_api {
// Query API, i.e. A single read from the cache
virtual void Query(const key_type* d_keys, const size_t len, float* d_values,
uint64_t* d_missing_index, key_type* d_missing_keys, size_t* d_missing_len,
cudaStream_t stream,
hipStream_t stream,
const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
virtual void Replace(const key_type* d_keys, const size_t len, const float* d_values,
cudaStream_t stream,
hipStream_t stream,
const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;
// Update API, i.e. update the embeddings which exist in the cache
virtual void Update(const key_type* d_keys, const size_t len, const float* d_values,
cudaStream_t stream,
hipStream_t stream,
const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;
// Dump API, i.e. dump some slabsets' keys from the cache
virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index,
const size_t end_set_index, cudaStream_t stream) = 0;
const size_t end_set_index, hipStream_t stream) = 0;
};
} // namespace gpu_cache
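A hedged usage sketch of the interface declared above (not part of the commit): the usual cycle is Query, fill the reported misses from backing storage, then Replace so later lookups hit. The helper name, the pre-allocated device buffers, and the miss-fill step are assumptions.

#include <hip/hip_runtime.h>
#include <cstddef>
#include <cstdint>

// Sketch only: gpu_cache_api is declared in the header above; every d_* pointer
// is assumed to be a pre-allocated device buffer of length >= len.
template <typename key_type>
void query_then_replace(gpu_cache::gpu_cache_api<key_type>* cache,
                        const key_type* d_keys, size_t len, float* d_values,
                        uint64_t* d_missing_index, key_type* d_missing_keys,
                        size_t* d_missing_len, hipStream_t stream) {
  // Single read from the cache; misses come back via the d_missing_* buffers.
  cache->Query(d_keys, len, d_values, d_missing_index, d_missing_keys, d_missing_len, stream);
  // ... fetch embeddings for d_missing_keys from backing storage into d_values ...
  // Promote the batch to most-recent so subsequent queries hit.
  cache->Replace(d_keys, len, d_values, stream);
}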
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
......@@ -61,20 +63,20 @@ class gpu_cache : public gpu_cache_api<key_type> {
// Query API, i.e. A single read from the cache
void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index,
key_type* d_missing_keys, size_t* d_missing_len, cudaStream_t stream,
key_type* d_missing_keys, size_t* d_missing_len, hipStream_t stream,
const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
void Replace(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream,
void Replace(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream,
const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;
// Update API, i.e. update the embeddings which exist in the cache
void Update(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream,
void Update(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream,
const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;
// Dump API, i.e. dump some slabsets' keys from the cache
void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index,
const size_t end_set_index, cudaStream_t stream) override;
const size_t end_set_index, hipStream_t stream) override;
public:
using slabset = slab_set<set_associativity, key_type, warp_size>;
......
// !!! This is a file automatically generated by hipify!!!
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
......@@ -15,7 +16,7 @@
*/
#pragma once
#include <cuda_runtime_api.h>
#include <hip/hip_runtime_api.h>
#include <stdexcept>
#include <string>
......@@ -30,17 +31,17 @@ class CudaException : public std::runtime_error {
CudaException(const std::string& what) : runtime_error(what) {}
};
inline void cuda_check_(cudaError_t val, const char* file, int line) {
if (val != cudaSuccess) {
inline void cuda_check_(hipError_t val, const char* file, int line) {
if (val != hipSuccess) {
throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " +
std::to_string(val) + ": " + cudaGetErrorString(val));
std::to_string(val) + ": " + hipGetErrorString(val));
}
}
class CudaDeviceRestorer {
public:
CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); }
~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); }
CudaDeviceRestorer() { CUDA_CHECK(hipGetDevice(&dev_)); }
~CudaDeviceRestorer() { CUDA_CHECK(hipSetDevice(dev_)); }
void check_device(int device) const {
if (device != dev_) {
throw std::runtime_error(
......@@ -54,14 +55,14 @@ class CudaDeviceRestorer {
};
inline int get_dev(const void* ptr) {
cudaPointerAttributes attr;
CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
hipPointerAttribute_t attr;
CUDA_CHECK(hipPointerGetAttributes(&attr, ptr));
int dev = -1;
#if CUDART_VERSION >= 10000
if (attr.type == cudaMemoryTypeDevice)
#if DTKRT_VERSION >= 10000
if (attr.type == hipMemoryTypeDevice)
#else
if (attr.memoryType == cudaMemoryTypeDevice)
if (attr.memoryType == hipMemoryTypeDevice)
#endif
{
dev = attr.device;
......@@ -72,7 +73,7 @@ inline int get_dev(const void* ptr) {
inline void switch_to_dev(const void* ptr) {
int dev = get_dev(ptr);
if (dev >= 0) {
CUDA_CHECK(cudaSetDevice(dev));
CUDA_CHECK(hipSetDevice(dev));
}
}
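CUDA_CHECK is used throughout but its definition is elided from this excerpt; it presumably forwards the error code and the call site to cuda_check_, which now takes a hipError_t after the conversion. An assumed definition, for reference only:

// Assumed definition (not shown in this diff): pass the call site to cuda_check_.
#define CUDA_CHECK(val) cuda_check_((val), __FILE__, __LINE__)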
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
......@@ -50,17 +52,17 @@ class StaticHashTable {
return keys_bytes + indices_bytes + values_bytes;
}
void clear(cudaStream_t stream = 0);
void clear(hipStream_t stream = 0);
// Note:
// 1. Please make sure the key to be inserted is not duplicated.
// 2. Please make sure the key to be inserted does not exist in the table.
// 3. Please make sure (size() + num_keys) <= capacity().
void insert(const key_type *keys, const value_type *values, size_type num_keys,
cudaStream_t stream = 0);
hipStream_t stream = 0);
void lookup(const key_type *keys, value_type *values, int num_keys, value_type default_value = 0,
cudaStream_t stream = 0);
hipStream_t stream = 0);
private:
key_type *table_keys_;
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
......@@ -36,12 +38,12 @@ class static_table {
~static_table(){};
// Query API, i.e. A single read from the cache
void Query(const key_type* d_keys, const size_t len, float* d_values, cudaStream_t stream);
void Query(const key_type* d_keys, const size_t len, float* d_values, hipStream_t stream);
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
void Init(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream);
void Init(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream);
void Clear(cudaStream_t stream);
void Clear(hipStream_t stream);
private:
StaticHashTable<key_type, float> static_hash_table_;
......
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nv_util.h>
#include <thread>
#include <unordered_map>
#include <vector>
namespace gpu_cache {
template <typename key_type, typename index_type>
class HashBlock {
public:
key_type* keys;
size_t num_sets;
size_t capacity;
HashBlock(size_t expected_capacity, int set_size, int batch_size);
~HashBlock();
void add(const key_type* new_keys, const size_t num_keys, key_type* missing_keys,
int* num_missing_keys, cudaStream_t stream);
void query(const key_type* query_keys, const size_t num_keys, index_type* output_indices,
key_type* missing_keys, int* missing_positions, int* num_missing_keys,
cudaStream_t stream);
void query(const key_type* query_keys, int* num_keys, index_type* output_indices,
cudaStream_t stream);
void clear(cudaStream_t stream);
private:
int max_set_size_;
int batch_size_;
int* set_sizes_;
};
template <typename vec_type>
class H2HCopy {
public:
H2HCopy(int num_threads) : num_threads_(num_threads), working_(num_threads) {
for (int i = 0; i < num_threads_; i++) {
threads_.emplace_back(
[&](int idx) {
while (!terminate_) {
if (working_[idx].load(std::memory_order_relaxed)) {
working_[idx].store(false, std::memory_order_relaxed);
if (num_keys_ == 0) continue;
size_t num_keys_this_thread = (num_keys_ - 1) / num_threads_ + 1;
size_t begin = idx * num_keys_this_thread;
if (idx == num_threads_ - 1) {
num_keys_this_thread = num_keys_ - num_keys_this_thread * idx;
}
size_t end = begin + num_keys_this_thread;
for (size_t i = begin; i < end; i++) {
size_t idx_vec = get_index_(i);
if (idx_vec == std::numeric_limits<size_t>::max()) {
continue;
}
memcpy(dst_data_ptr_ + i * vec_size_, src_data_ptr_ + idx_vec * vec_size_,
sizeof(vec_type) * vec_size_);
}
num_finished_workers_++;
}
}
std::this_thread::sleep_for(std::chrono::microseconds(1));
},
i);
}
};
void copy(vec_type* dst_data_ptr, vec_type* src_data_ptr, size_t num_keys, int vec_size,
std::function<size_t(size_t)> get_index_func) {
std::lock_guard<std::mutex> guard(submit_mutex_);
dst_data_ptr_ = dst_data_ptr;
src_data_ptr_ = src_data_ptr;
get_index_ = get_index_func;
num_keys_ = num_keys;
vec_size_ = vec_size;
num_finished_workers_.store(0, std::memory_order_acquire);
for (auto& working : working_) {
working.store(true, std::memory_order_relaxed);
}
while (num_finished_workers_ != num_threads_) {
continue;
}
}
~H2HCopy() {
terminate_ = true;
for (auto& t : threads_) {
t.join();
}
}
private:
vec_type* src_data_ptr_;
vec_type* dst_data_ptr_;
std::function<size_t(size_t)> get_index_;
size_t num_keys_;
int vec_size_;
std::mutex submit_mutex_;
const int num_threads_;
std::vector<std::thread> threads_;
std::vector<std::atomic<bool>> working_;
volatile bool terminate_{false};
std::atomic<int> num_finished_workers_{0};
};
template <typename key_type, typename index_type, typename vec_type = float>
class UvmTable {
public:
UvmTable(const size_t device_table_capacity, const size_t host_table_capacity,
const int max_batch_size, const int vec_size,
const vec_type default_value = (vec_type)0);
~UvmTable();
void query(const key_type* d_keys, const int len, vec_type* d_vectors, cudaStream_t stream = 0);
void add(const key_type* h_keys, const vec_type* h_vectors, const size_t len);
void clear(cudaStream_t stream = 0);
private:
static constexpr int num_buffers_ = 2;
key_type* d_keys_buffer_;
vec_type* d_vectors_buffer_;
vec_type* d_vectors_;
index_type* d_output_indices_;
index_type* d_output_host_indices_;
index_type* h_output_host_indices_;
key_type* d_missing_keys_;
int* d_missing_positions_;
int* d_missing_count_;
std::vector<vec_type> h_vectors_;
key_type* h_missing_keys_;
cudaStream_t query_stream_;
cudaEvent_t query_event_;
vec_type* h_cpy_buffers_[num_buffers_];
vec_type* d_cpy_buffers_[num_buffers_];
cudaStream_t cpy_streams_[num_buffers_];
cudaEvent_t cpy_events_[num_buffers_];
std::unordered_map<key_type, index_type> h_final_missing_items_;
int max_batch_size_;
int vec_size_;
size_t num_set_;
size_t num_host_set_;
size_t table_capacity_;
std::vector<vec_type> default_vector_;
HashBlock<key_type, index_type> device_table_;
HashBlock<key_type, index_type> host_table_;
};
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nv_util.h>
#include <thread>
#include <unordered_map>
#include <vector>
namespace gpu_cache {
template <typename key_type, typename index_type>
class HashBlock {
public:
key_type* keys;
size_t num_sets;
size_t capacity;
HashBlock(size_t expected_capacity, int set_size, int batch_size);
~HashBlock();
void add(const key_type* new_keys, const size_t num_keys, key_type* missing_keys,
int* num_missing_keys, hipStream_t stream);
void query(const key_type* query_keys, const size_t num_keys, index_type* output_indices,
key_type* missing_keys, int* missing_positions, int* num_missing_keys,
hipStream_t stream);
void query(const key_type* query_keys, int* num_keys, index_type* output_indices,
hipStream_t stream);
void clear(hipStream_t stream);
private:
int max_set_size_;
int batch_size_;
int* set_sizes_;
};
template <typename vec_type>
class H2HCopy {
public:
H2HCopy(int num_threads) : num_threads_(num_threads), working_(num_threads) {
for (int i = 0; i < num_threads_; i++) {
threads_.emplace_back(
[&](int idx) {
while (!terminate_) {
if (working_[idx].load(std::memory_order_relaxed)) {
working_[idx].store(false, std::memory_order_relaxed);
if (num_keys_ == 0) continue;
size_t num_keys_this_thread = (num_keys_ - 1) / num_threads_ + 1;
size_t begin = idx * num_keys_this_thread;
if (idx == num_threads_ - 1) {
num_keys_this_thread = num_keys_ - num_keys_this_thread * idx;
}
size_t end = begin + num_keys_this_thread;
for (size_t i = begin; i < end; i++) {
size_t idx_vec = get_index_(i);
if (idx_vec == std::numeric_limits<size_t>::max()) {
continue;
}
memcpy(dst_data_ptr_ + i * vec_size_, src_data_ptr_ + idx_vec * vec_size_,
sizeof(vec_type) * vec_size_);
}
num_finished_workers_++;
}
}
std::this_thread::sleep_for(std::chrono::microseconds(1));
},
i);
}
};
void copy(vec_type* dst_data_ptr, vec_type* src_data_ptr, size_t num_keys, int vec_size,
std::function<size_t(size_t)> get_index_func) {
std::lock_guard<std::mutex> guard(submit_mutex_);
dst_data_ptr_ = dst_data_ptr;
src_data_ptr_ = src_data_ptr;
get_index_ = get_index_func;
num_keys_ = num_keys;
vec_size_ = vec_size;
num_finished_workers_.store(0, std::memory_order_acquire);
for (auto& working : working_) {
working.store(true, std::memory_order_relaxed);
}
while (num_finished_workers_ != num_threads_) {
continue;
}
}
~H2HCopy() {
terminate_ = true;
for (auto& t : threads_) {
t.join();
}
}
private:
vec_type* src_data_ptr_;
vec_type* dst_data_ptr_;
std::function<size_t(size_t)> get_index_;
size_t num_keys_;
int vec_size_;
std::mutex submit_mutex_;
const int num_threads_;
std::vector<std::thread> threads_;
std::vector<std::atomic<bool>> working_;
volatile bool terminate_{false};
std::atomic<int> num_finished_workers_{0};
};
template <typename key_type, typename index_type, typename vec_type = float>
class UvmTable {
public:
UvmTable(const size_t device_table_capacity, const size_t host_table_capacity,
const int max_batch_size, const int vec_size,
const vec_type default_value = (vec_type)0);
~UvmTable();
void query(const key_type* d_keys, const int len, vec_type* d_vectors, hipStream_t stream = 0);
void add(const key_type* h_keys, const vec_type* h_vectors, const size_t len);
void clear(hipStream_t stream = 0);
private:
static constexpr int num_buffers_ = 2;
key_type* d_keys_buffer_;
vec_type* d_vectors_buffer_;
vec_type* d_vectors_;
index_type* d_output_indices_;
index_type* d_output_host_indices_;
index_type* h_output_host_indices_;
key_type* d_missing_keys_;
int* d_missing_positions_;
int* d_missing_count_;
std::vector<vec_type> h_vectors_;
key_type* h_missing_keys_;
hipStream_t query_stream_;
hipEvent_t query_event_;
vec_type* h_cpy_buffers_[num_buffers_];
vec_type* d_cpy_buffers_[num_buffers_];
hipStream_t cpy_streams_[num_buffers_];
hipEvent_t cpy_events_[num_buffers_];
std::unordered_map<key_type, index_type> h_final_missing_items_;
int max_batch_size_;
int vec_size_;
size_t num_set_;
size_t num_host_set_;
size_t table_capacity_;
std::vector<vec_type> default_vector_;
HashBlock<key_type, index_type> device_table_;
HashBlock<key_type, index_type> host_table_;
};
} // namespace gpu_cache
\ No newline at end of file
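A hedged usage sketch of H2HCopy::copy declared above (not from the commit): the caller supplies an index-mapping functor, and rows mapped to std::numeric_limits<size_t>::max() are skipped, as in the worker loop. The row-gather helper and its buffers below are invented for illustration.

#include <cstddef>
#include <vector>

// Sketch only: H2HCopy comes from the header above; 'indices' maps each output
// row to a source row, with SIZE_MAX marking rows to leave untouched.
void gather_rows(gpu_cache::H2HCopy<float>& pool, float* dst, float* src,
                 const std::vector<size_t>& indices, int vec_size) {
  pool.copy(dst, src, indices.size(), vec_size,
            [&](size_t i) { return indices[i]; });  // copy() blocks until every worker finishes
}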
......@@ -15,15 +15,14 @@
cmake_minimum_required(VERSION 3.8)
file(GLOB gpu_cache_src
nv_gpu_cache.cu
static_table.cu
static_hash_table.cu
uvm_table.cu
nv_gpu_cache.hip
static_table.hip
static_hash_table.hip
uvm_table.hip
)
add_library(gpu_cache SHARED ${gpu_cache_src})
target_compile_features(gpu_cache PUBLIC cxx_std_11)
set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
set_target_properties(gpu_cache PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON)
# set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
// !!! This is a file automatically generated by hipify!!!
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
......@@ -14,8 +15,8 @@
* limitations under the License.
*/
#include <cooperative_groups.h>
#include <cuda.h>
#include <hip/hip_cooperative_groups.h>
#include <hip/hip_runtime.h>
#include <stdint.h>
#include <stdio.h>
......@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c
// otherwise return invalid_slot.
const size_type num_groups = capacity / group_size;
#if (CUDA_VERSION < 11060)
#if (DTK_VERSION < 11060)
unsigned long long num_threads_per_group = cg.size();
#else
unsigned long long num_threads_per_group = cg.num_threads();
......@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c
const size_type num_groups = capacity / group_size;
#if (CUDA_VERSION < 11060)
#if (DTK_VERSION < 11060)
unsigned long long num_threads_per_group = cg.size();
#else
unsigned long long num_threads_per_group = cg.num_threads();
......@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash
size_t align_m = 16;
size_t num_keys = key_capacity_ + 1;
size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m;
CUDA_CHECK(cudaMalloc(&table_keys_, sizeof(key_type) * num_keys));
CUDA_CHECK(cudaMalloc(&table_indices_, sizeof(size_type) * num_keys));
CUDA_CHECK(cudaMalloc(&table_values_, sizeof(value_type) * num_values));
CUDA_CHECK(hipMalloc(&table_keys_, sizeof(key_type) * num_keys));
CUDA_CHECK(hipMalloc(&table_indices_, sizeof(size_type) * num_keys));
CUDA_CHECK(hipMalloc(&table_values_, sizeof(value_type) * num_values));
// Initialize table_keys_
CUDA_CHECK(cudaMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
CUDA_CHECK(cudaMemset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
CUDA_CHECK(hipMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
CUDA_CHECK(hipMemset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert(
const key_type *keys, const value_type *values, size_type num_keys, cudaStream_t stream) {
const key_type *keys, const value_type *values, size_type num_keys, hipStream_t stream) {
if (num_keys == 0) {
return;
}
......@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
// Insert keys
constexpr int block = 256;
int grid = (num_keys - 1) / block + 1;
InsertKeyKernel<tile_size, group_size>
<<<grid, block, 0, stream>>>(table_keys_, table_indices_, key_capacity_, keys, num_keys,
hipLaunchKernelGGL(( InsertKeyKernel<tile_size, group_size>)
, dim3(grid), dim3(block), 0, stream, table_keys_, table_indices_, key_capacity_, keys, num_keys,
size_, hash_, empty_key, invalid_slot);
// Copy values
CUDA_CHECK(cudaMemcpyAsync(table_values_ + size_ * value_dim_, values,
sizeof(value_type) * num_keys * value_dim_, cudaMemcpyDeviceToDevice,
CUDA_CHECK(hipMemcpyAsync(table_values_ + size_ * value_dim_, values,
sizeof(value_type) * num_keys * value_dim_, hipMemcpyDeviceToDevice,
stream));
size_ += num_keys;
}
......@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear(
cudaStream_t stream) {
CUDA_CHECK(cudaMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
CUDA_CHECK(cudaMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
hipStream_t stream) {
CUDA_CHECK(hipMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
CUDA_CHECK(hipMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
size_ = 0;
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() {
CUDA_CHECK(cudaFree(table_keys_));
CUDA_CHECK(cudaFree(table_indices_));
CUDA_CHECK(cudaFree(table_values_));
CUDA_CHECK(hipFree(table_keys_));
CUDA_CHECK(hipFree(table_indices_));
CUDA_CHECK(hipFree(table_values_));
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup(
const key_type *keys, value_type *values, int num_keys, value_type default_value,
cudaStream_t stream) {
hipStream_t stream) {
if (num_keys == 0) {
return;
}
......@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku
constexpr int block = 256;
const int grid = (num_keys - 1) / block + 1;
// Lookup keys
LookupKernel<tile_size, group_size><<<grid, block, 0, stream>>>(
hipLaunchKernelGGL(( LookupKernel<tile_size, group_size>), dim3(grid), dim3(block), 0, stream,
table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values,
hash_, empty_key, default_value, invalid_slot);
}
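The launch rewrite above is the one place hipify changes syntax rather than just names: triple-chevron launches become hipLaunchKernelGGL, and a templated kernel name is wrapped in parentheses so the comma between template arguments is not parsed as a macro-argument separator. A minimal standalone sketch, with the kernel and its parameters invented for illustration:

#include <hip/hip_runtime.h>

template <int TILE, int GROUP>
__global__ void FillKernel(int* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = TILE * GROUP;
}

void launch(int* d_out, int n, hipStream_t stream) {
  constexpr int block = 256;
  const int grid = (n - 1) / block + 1;
  // CUDA form:  FillKernel<4, 8><<<grid, block, 0, stream>>>(d_out, n);
  hipLaunchKernelGGL((FillKernel<4, 8>), dim3(grid), dim3(block), 0, stream, d_out, n);
}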
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
......@@ -14,7 +16,7 @@
* limitations under the License.
*/
#include <cooperative_groups.h>
#include <hip/hip_cooperative_groups.h>
#include <nv_util.h>
#include <iostream>
......@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed
template <typename key_type>
void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values,
cudaStream_t stream) {
hipStream_t stream) {
static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream);
}
template <typename key_type>
void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values,
cudaStream_t stream) {
hipStream_t stream) {
static_hash_table_.insert(d_keys, d_values, len, stream);
}
template <typename key_type>
void static_table<key_type>::Clear(cudaStream_t stream) {
void static_table<key_type>::Clear(hipStream_t stream) {
static_hash_table_.clear(stream);
}
......
......@@ -15,14 +15,14 @@
cmake_minimum_required(VERSION 3.8)
file(GLOB gpu_cache_test_src
cache_op_sol_test.cu
../../HugeCTR/src/hps/embedding_cache_gpu.cu
cache_op_sol_test.hip
../../HugeCTR/src/hps/embedding_cache_gpu.hip
)
add_executable(cache_op_sol_test ${gpu_cache_test_src})
target_compile_features(cache_op_sol_test PUBLIC cxx_std_17)
target_link_libraries(cache_op_sol_test PUBLIC gpu_cache)
target_link_libraries(cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX)
set_target_properties(cache_op_sol_test PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(cache_op_sol_test PROPERTIES CUDA_ARCHITECTURES OFF)
set_target_properties(cache_op_sol_test PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(cache_op_sol_test PROPERTIES HIP_ARCHITECTURES OFF)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
......@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_
template <typename T>
bool is_near(T a, T b) {
double diff = abs(a - b);
bool ret = diff <= std::min(a, b) * 1e-6;
bool ret = diff <= ::min(a, b) * 1e-6;
if (!ret) {
std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl;
}
......@@ -224,7 +226,7 @@ int main(int argc, char** argv) {
const size_t cache_type = atoi(argv[7]);
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK(cudaSetDevice(0));
CUDA_CHECK(hipSetDevice(0));
// Host side buffers shared between threads
key_type* h_keys; // Buffer holding all keys in embedding table
......@@ -302,7 +304,7 @@ int main(int argc, char** argv) {
int thread_id = omp_get_thread_num();
printf("Worker %d starts testing cache.\n", thread_id);
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK(cudaSetDevice(0));
CUDA_CHECK(hipSetDevice(0));
// Thread-private host side buffers
size_t* h_query_keys_index; // Buffer holding index for keys to be queried
......@@ -324,32 +326,32 @@ int main(int argc, char** argv) {
// host-only buffers placed in normal host memory
h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t));
// host-device interactive buffers placed in pinned memory
CUDA_CHECK(cudaHostAlloc((void**)&h_query_keys, query_length * sizeof(key_type),
cudaHostAllocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_vals_retrieved,
CUDA_CHECK(hipHostMalloc((void**)&h_query_keys, query_length * sizeof(key_type),
hipHostMallocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
cudaHostAllocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_keys, query_length * sizeof(key_type),
cudaHostAllocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_vals,
hipHostMallocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_keys, query_length * sizeof(key_type),
hipHostMallocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_vals,
query_length * embedding_vec_size * sizeof(float),
cudaHostAllocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
cudaHostAllocPortable));
hipHostMallocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
hipHostMallocPortable));
// Allocate device side buffers
CUDA_CHECK(cudaMalloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK(
cudaMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
hipMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
CUDA_CHECK(
cudaMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t)));
hipMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_len, sizeof(size_t)));
// Thread-private CUDA stream, all threads just use the #0 device
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
hipStream_t stream;
CUDA_CHECK(hipStreamCreate(&stream));
// Timing variables
double time_1;
......@@ -382,33 +384,33 @@ int main(int argc, char** argv) {
std::cout << std::endl;
// Copy the keys to GPU memory
CUDA_CHECK(cudaMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
hipMemcpyHostToDevice, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_1 = W_time();
// Get pairs from hashtable
cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n",
thread_id, i, time_2);
// Copy the data back to host
CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
CUDA_CHECK(hipMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
hipMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
hipMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
hipMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
hipMemcpyDeviceToHost, stream));
CUDA_CHECK(hipStreamSynchronize(stream));
printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i,
h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f));
......@@ -433,13 +435,13 @@ int main(int argc, char** argv) {
thread_id, i, time_2);
// Copy the missing value to device
CUDA_CHECK(cudaMemcpyAsync(d_missing_vals, h_missing_vals,
CUDA_CHECK(hipMemcpyAsync(d_missing_vals, h_missing_vals,
query_length * embedding_vec_size * sizeof(float),
cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_vals_retrieved, h_vals_retrieved,
hipMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_vals_retrieved, h_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
hipMemcpyHostToDevice, stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_1 = W_time();
......@@ -449,7 +451,7 @@ int main(int argc, char** argv) {
else
cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n",
......@@ -466,20 +468,20 @@ int main(int argc, char** argv) {
printf("Worker %d : All Finished!\n", thread_id);
// Clean-up
cudaStreamDestroy(stream);
hipStreamDestroy(stream);
free(h_query_keys_index);
CUDA_CHECK(cudaFreeHost(h_query_keys));
CUDA_CHECK(cudaFreeHost(h_vals_retrieved));
CUDA_CHECK(cudaFreeHost(h_missing_keys));
CUDA_CHECK(cudaFreeHost(h_missing_vals));
CUDA_CHECK(cudaFreeHost(h_missing_index));
CUDA_CHECK(cudaFree(d_query_keys));
CUDA_CHECK(cudaFree(d_vals_retrieved));
CUDA_CHECK(cudaFree(d_missing_keys));
CUDA_CHECK(cudaFree(d_missing_vals));
CUDA_CHECK(cudaFree(d_missing_index));
CUDA_CHECK(cudaFree(d_missing_len));
CUDA_CHECK(hipHostFree(h_query_keys));
CUDA_CHECK(hipHostFree(h_vals_retrieved));
CUDA_CHECK(hipHostFree(h_missing_keys));
CUDA_CHECK(hipHostFree(h_missing_vals));
CUDA_CHECK(hipHostFree(h_missing_index));
CUDA_CHECK(hipFree(d_query_keys));
CUDA_CHECK(hipFree(d_vals_retrieved));
CUDA_CHECK(hipFree(d_missing_keys));
CUDA_CHECK(hipFree(d_missing_vals));
CUDA_CHECK(hipFree(d_missing_index));
CUDA_CHECK(hipFree(d_missing_len));
}
// 1st test Clean-up
......@@ -547,57 +549,57 @@ int main(int argc, char** argv) {
key_type* d_missing_keys;
size_t* d_missing_len;
CUDA_CHECK(cudaHostAlloc((void**)&h_insert_keys,
CUDA_CHECK(hipHostMalloc((void**)&h_insert_keys,
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
cudaHostAllocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_insert_vals,
hipHostMallocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cudaHostAllocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_dump_keys,
hipHostMallocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_dump_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cudaHostAllocPortable));
CUDA_CHECK(cudaHostAlloc(
hipHostMallocPortable));
CUDA_CHECK(hipHostMalloc(
(void**)&h_vals_retrieved,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cudaHostAllocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_acc_keys,
hipHostMallocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_acc_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cudaHostAllocPortable));
hipHostMallocPortable));
CUDA_CHECK(cudaMalloc((void**)&d_keys,
CUDA_CHECK(hipMalloc((void**)&d_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
CUDA_CHECK(hipMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
embedding_vec_size * sizeof(float)));
CUDA_CHECK(
cudaMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_insert_vals,
hipMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_dump_keys,
CUDA_CHECK(hipMalloc((void**)&d_dump_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc(
CUDA_CHECK(hipMalloc(
(void**)&d_vals_retrieved,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_index,
CUDA_CHECK(hipMalloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_index,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_keys,
CUDA_CHECK(hipMalloc((void**)&d_missing_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_len, sizeof(size_t)));
// CUDA stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
hipStream_t stream;
CUDA_CHECK(hipStreamCreate(&stream));
// Copy all keys and values from host to device
CUDA_CHECK(cudaMemcpyAsync(
CUDA_CHECK(hipMemcpyAsync(
d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(
hipMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(
d_vals, h_new_vals,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cudaMemcpyHostToDevice, stream));
hipMemcpyHostToDevice, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Each time insert 1 slab per slabset into the cache and check result
for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) {
......@@ -615,17 +617,17 @@ int main(int argc, char** argv) {
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type));
// Copy the <k,v> pairs from host to device
CUDA_CHECK(cudaMemcpyAsync(d_insert_keys, h_insert_keys,
CUDA_CHECK(hipMemcpyAsync(d_insert_keys, h_insert_keys,
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
cudaMemcpyHostToDevice, stream));
hipMemcpyHostToDevice, stream));
CUDA_CHECK(
cudaMemcpyAsync(d_insert_vals, h_insert_vals,
hipMemcpyAsync(d_insert_vals, h_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cudaMemcpyHostToDevice, stream));
hipMemcpyHostToDevice, stream));
// Insert the <k,v> pairs into the cache
cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_a = W_time();
......@@ -633,7 +635,7 @@ int main(int argc, char** argv) {
cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream,
SLAB_SIZE);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b);
......@@ -644,31 +646,31 @@ int main(int argc, char** argv) {
// Dump the keys from the cache
cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b);
// Copy the dump counter from device to host
CUDA_CHECK(cudaMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
hipMemcpyDeviceToHost, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Check the dump counter
assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1));
// Query all the dumped keys from the cache
cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
// Copy result from device to host
CUDA_CHECK(cudaMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
CUDA_CHECK(hipMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
hipMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
h_dump_counter * embedding_vec_size * sizeof(float),
cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
cudaMemcpyDeviceToHost, stream));
hipMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
hipMemcpyDeviceToHost, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Check result
assert(h_missing_len == 0);
compare_key(h_dump_keys, h_acc_keys, h_dump_counter);
......@@ -679,27 +681,27 @@ int main(int argc, char** argv) {
printf("Update and Dump API test all finished!\n");
// 2nd test clean-up
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(hipStreamDestroy(stream));
free(h_keys);
free(h_vals);
free(h_new_vals);
CUDA_CHECK(cudaFreeHost(h_insert_keys));
CUDA_CHECK(cudaFreeHost(h_insert_vals));
CUDA_CHECK(cudaFreeHost(h_dump_keys));
CUDA_CHECK(cudaFreeHost(h_vals_retrieved));
CUDA_CHECK(cudaFreeHost(h_acc_keys));
CUDA_CHECK(cudaFree(d_keys));
CUDA_CHECK(cudaFree(d_vals));
CUDA_CHECK(cudaFree(d_insert_keys));
CUDA_CHECK(cudaFree(d_insert_vals));
CUDA_CHECK(cudaFree(d_dump_keys));
CUDA_CHECK(cudaFree(d_vals_retrieved));
CUDA_CHECK(cudaFree(d_dump_counter));
CUDA_CHECK(cudaFree(d_missing_index));
CUDA_CHECK(cudaFree(d_missing_keys));
CUDA_CHECK(cudaFree(d_missing_len));
CUDA_CHECK(hipHostFree(h_insert_keys));
CUDA_CHECK(hipHostFree(h_insert_vals));
CUDA_CHECK(hipHostFree(h_dump_keys));
CUDA_CHECK(hipHostFree(h_vals_retrieved));
CUDA_CHECK(hipHostFree(h_acc_keys));
CUDA_CHECK(hipFree(d_keys));
CUDA_CHECK(hipFree(d_vals));
CUDA_CHECK(hipFree(d_insert_keys));
CUDA_CHECK(hipFree(d_insert_vals));
CUDA_CHECK(hipFree(d_dump_keys));
CUDA_CHECK(hipFree(d_vals_retrieved));
CUDA_CHECK(hipFree(d_dump_counter));
CUDA_CHECK(hipFree(d_missing_index));
CUDA_CHECK(hipFree(d_missing_keys));
CUDA_CHECK(hipFree(d_missing_len));
delete cache;
......
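For reference, the pinned-host-memory renames used in the test above follow the same one-to-one pattern: cudaHostAlloc and cudaFreeHost become hipHostMalloc and hipHostFree, and cudaHostAllocPortable becomes hipHostMallocPortable. A minimal sketch, with the buffer size invented:

#include <hip/hip_runtime.h>

int main() {
  float* h_buf = nullptr;
  // was: cudaHostAlloc((void**)&h_buf, 1024 * sizeof(float), cudaHostAllocPortable);
  (void)hipHostMalloc((void**)&h_buf, 1024 * sizeof(float), hipHostMallocPortable);
  // was: cudaFreeHost(h_buf);
  (void)hipHostFree(h_buf);
  return 0;
}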