Commit 1d28bf8b
Authored Sep 23, 2024 by sangwzh
Parent: f119ea7c

    update third_party/HugeCTR/gpu_cache codes to hip

Showing 13 changed files with 1049 additions and 1029 deletions (+1049, -1029)
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp      +6    -4
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp       +6    -4
third_party/HugeCTR/gpu_cache/include/nv_util.h              +13   -12
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp  +5    -3
third_party/HugeCTR/gpu_cache/include/static_table.hpp       +5    -3
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp          +176  -174
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt             +6    -7
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip           +76   -72
third_party/HugeCTR/gpu_cache/src/static_hash_table.hip      +23   -22
third_party/HugeCTR/gpu_cache/src/static_table.hip           +6    -4
third_party/HugeCTR/gpu_cache/src/uvm_table.hip              +607  -606
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt            +4    -4
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.hip     +116  -114
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2021, NVIDIA CORPORATION.
  *
 ...
@@ -31,22 +33,22 @@ class gpu_cache_api {
   // Query API, i.e. A single read from the cache
   virtual void Query(const key_type* d_keys, const size_t len, float* d_values,
                      uint64_t* d_missing_index, key_type* d_missing_keys, size_t* d_missing_len,
-                     cudaStream_t stream,
+                     hipStream_t stream,
                      const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;

   // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
   virtual void Replace(const key_type* d_keys, const size_t len, const float* d_values,
-                       cudaStream_t stream,
+                       hipStream_t stream,
                        const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;

   // Update API, i.e. update the embeddings which exist in the cache
   virtual void Update(const key_type* d_keys, const size_t len, const float* d_values,
-                      cudaStream_t stream,
+                      hipStream_t stream,
                       const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;

   // Dump API, i.e. dump some slabsets' keys from the cache
   virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index,
-                    const size_t end_set_index, cudaStream_t stream) = 0;
+                    const size_t end_set_index, hipStream_t stream) = 0;
 };
 }  // namespace gpu_cache
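The change to this header is representative of the whole commit: every CUDA runtime type in a public signature is swapped one-for-one for its HIP counterpart, so callers now pass hipStream_t handles. A minimal sketch of that substitution pattern, using only standard HIP runtime calls (the function below is illustrative and not part of gpu_cache_api.hpp):

#include <hip/hip_runtime.h>

// Illustrative only: shows the cudaX -> hipX renames this commit applies to stream handling.
void stream_roundtrip(float* d_values, size_t len) {
  hipStream_t stream;                                              // was cudaStream_t
  (void)hipStreamCreate(&stream);                                  // was cudaStreamCreate
  (void)hipMemsetAsync(d_values, 0, sizeof(float) * len, stream);  // was cudaMemsetAsync
  (void)hipStreamSynchronize(stream);                              // was cudaStreamSynchronize
  (void)hipStreamDestroy(stream);                                  // was cudaStreamDestroy
}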
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -61,20 +63,20 @@ class gpu_cache : public gpu_cache_api<key_type> {
   // Query API, i.e. A single read from the cache
   void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index,
-             key_type* d_missing_keys, size_t* d_missing_len, cudaStream_t stream,
+             key_type* d_missing_keys, size_t* d_missing_len, hipStream_t stream,
              const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;

   // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
-  void Replace(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream,
+  void Replace(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream,
                const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;

   // Update API, i.e. update the embeddings which exist in the cache
-  void Update(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream,
+  void Update(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream,
               const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;

   // Dump API, i.e. dump some slabsets' keys from the cache
   void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index,
-            const size_t end_set_index, cudaStream_t stream) override;
+            const size_t end_set_index, hipStream_t stream) override;

  public:
   using slabset = slab_set<set_associativity, key_type, warp_size>;
 ...
third_party/HugeCTR/gpu_cache/include/nv_util.h

+// !!! This is a file automatically generated by hipify!!!
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -15,7 +16,7 @@
  */
 #pragma once

-#include <cuda_runtime_api.h>
+#include <hip/hip_runtime_api.h>
 #include <stdexcept>
 #include <string>
 ...
@@ -30,17 +31,17 @@ class CudaException : public std::runtime_error {
   CudaException(const std::string& what) : runtime_error(what) {}
 };

-inline void cuda_check_(cudaError_t val, const char* file, int line) {
-  if (val != cudaSuccess) {
+inline void cuda_check_(hipError_t val, const char* file, int line) {
+  if (val != hipSuccess) {
     throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " +
-                        std::to_string(val) + ": " + cudaGetErrorString(val));
+                        std::to_string(val) + ": " + hipGetErrorString(val));
   }
 }

 class CudaDeviceRestorer {
  public:
-  CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); }
-  ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); }
+  CudaDeviceRestorer() { CUDA_CHECK(hipGetDevice(&dev_)); }
+  ~CudaDeviceRestorer() { CUDA_CHECK(hipSetDevice(dev_)); }
   void check_device(int device) const {
     if (device != dev_) {
       throw std::runtime_error(
 ...
@@ -54,14 +55,14 @@ class CudaDeviceRestorer {
 };

 inline int get_dev(const void* ptr) {
-  cudaPointerAttributes attr;
-  CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
+  hipPointerAttribute_t attr;
+  CUDA_CHECK(hipPointerGetAttributes(&attr, ptr));
   int dev = -1;
-#if CUDART_VERSION >= 10000
-  if (attr.type == cudaMemoryTypeDevice)
+#if DTKRT_VERSION >= 10000
+  if (attr.type == hipMemoryTypeDevice)
 #else
-  if (attr.memoryType == cudaMemoryTypeDevice)
+  if (attr.memoryType == hipMemoryTypeDevice)
 #endif
   {
     dev = attr.device;
 ...
@@ -72,7 +73,7 @@ inline int get_dev(const void* ptr) {
 inline void switch_to_dev(const void* ptr) {
   int dev = get_dev(ptr);
   if (dev >= 0) {
-    CUDA_CHECK(cudaSetDevice(dev));
+    CUDA_CHECK(hipSetDevice(dev));
   }
 }
 ...
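The CUDA_CHECK macro used throughout these hunks is defined elsewhere in nv_util.h and is not visible here. The sketch below shows one common way such a wrapper is defined around the hipified cuda_check_ shown above; treat it as an assumption for illustration, not the file's actual definition:

#include <hip/hip_runtime_api.h>
#include <stdexcept>
#include <string>

// Hypothetical wrapper: forwards the call site's file/line into a checker like the one above.
#define CUDA_CHECK(val) cuda_check_((val), __FILE__, __LINE__)

inline void cuda_check_(hipError_t val, const char* file, int line) {
  if (val != hipSuccess) {
    throw std::runtime_error(std::string(file) + ":" + std::to_string(line) + ": " +
                             hipGetErrorString(val));
  }
}

int main() {
  void* ptr = nullptr;
  CUDA_CHECK(hipMalloc(&ptr, 1024));  // throws with file:line context if the allocation fails
  CUDA_CHECK(hipFree(ptr));
  return 0;
}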
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -50,17 +52,17 @@ class StaticHashTable {
     return keys_bytes + indices_bytes + values_bytes;
   }

-  void clear(cudaStream_t stream = 0);
+  void clear(hipStream_t stream = 0);

   // Note:
   // 1. Please make sure the key to be inserted is not duplicated.
   // 2. Please make sure the key to be inserted does not exist in the table.
   // 3. Please make sure (size() + num_keys) <= capacity().
   void insert(const key_type* keys, const value_type* values, size_type num_keys,
-              cudaStream_t stream = 0);
+              hipStream_t stream = 0);

   void lookup(const key_type* keys, value_type* values, int num_keys, value_type default_value = 0,
-              cudaStream_t stream = 0);
+              hipStream_t stream = 0);

  private:
   key_type* table_keys_;
 ...
third_party/HugeCTR/gpu_cache/include/static_table.hpp

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -36,12 +38,12 @@ class static_table {
   ~static_table(){};

   // Query API, i.e. A single read from the cache
-  void Query(const key_type* d_keys, const size_t len, float* d_values, cudaStream_t stream);
+  void Query(const key_type* d_keys, const size_t len, float* d_values, hipStream_t stream);

   // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
-  void Init(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream);
+  void Init(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream);

-  void Clear(cudaStream_t stream);
+  void Clear(hipStream_t stream);

  private:
   StaticHashTable<key_type, float> static_hash_table_;
 ...
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp

(Whole-file rewrite. The old and new versions differ only in that the hipify banner and the "hip/hip_runtime.h" include are added and every cudaStream_t/cudaEvent_t becomes hipStream_t/hipEvent_t. Reconstructed new version:)

// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <nv_util.h>

#include <thread>
#include <unordered_map>
#include <vector>

namespace gpu_cache {

template <typename key_type, typename index_type>
class HashBlock {
 public:
  key_type* keys;
  size_t num_sets;
  size_t capacity;

  HashBlock(size_t expected_capacity, int set_size, int batch_size);
  ~HashBlock();
  void add(const key_type* new_keys, const size_t num_keys, key_type* missing_keys,
           int* num_missing_keys, hipStream_t stream);
  void query(const key_type* query_keys, const size_t num_keys, index_type* output_indices,
             key_type* missing_keys, int* missing_positions, int* num_missing_keys,
             hipStream_t stream);
  void query(const key_type* query_keys, int* num_keys, index_type* output_indices,
             hipStream_t stream);
  void clear(hipStream_t stream);

 private:
  int max_set_size_;
  int batch_size_;
  int* set_sizes_;
};

template <typename vec_type>
class H2HCopy {
 public:
  H2HCopy(int num_threads) : num_threads_(num_threads), working_(num_threads) {
    for (int i = 0; i < num_threads_; i++) {
      threads_.emplace_back(
          [&](int idx) {
            while (!terminate_) {
              if (working_[idx].load(std::memory_order_relaxed)) {
                working_[idx].store(false, std::memory_order_relaxed);
                if (num_keys_ == 0) continue;
                size_t num_keys_this_thread = (num_keys_ - 1) / num_threads_ + 1;
                size_t begin = idx * num_keys_this_thread;
                if (idx == num_threads_ - 1) {
                  num_keys_this_thread = num_keys_ - num_keys_this_thread * idx;
                }
                size_t end = begin + num_keys_this_thread;
                for (size_t i = begin; i < end; i++) {
                  size_t idx_vec = get_index_(i);
                  if (idx_vec == std::numeric_limits<size_t>::max()) {
                    continue;
                  }
                  memcpy(dst_data_ptr_ + i * vec_size_, src_data_ptr_ + idx_vec * vec_size_,
                         sizeof(vec_type) * vec_size_);
                }
                num_finished_workers_++;
              }
            }
            std::this_thread::sleep_for(std::chrono::microseconds(1));
          },
          i);
    }
  };

  void copy(vec_type* dst_data_ptr, vec_type* src_data_ptr, size_t num_keys, int vec_size,
            std::function<size_t(size_t)> get_index_func) {
    std::lock_guard<std::mutex> guard(submit_mutex_);
    dst_data_ptr_ = dst_data_ptr;
    src_data_ptr_ = src_data_ptr;
    get_index_ = get_index_func;
    num_keys_ = num_keys;
    vec_size_ = vec_size;
    num_finished_workers_.store(0, std::memory_order_acquire);
    for (auto& working : working_) {
      working.store(true, std::memory_order_relaxed);
    }
    while (num_finished_workers_ != num_threads_) {
      continue;
    }
  }

  ~H2HCopy() {
    terminate_ = true;
    for (auto& t : threads_) {
      t.join();
    }
  }

 private:
  vec_type* src_data_ptr_;
  vec_type* dst_data_ptr_;
  std::function<size_t(size_t)> get_index_;
  size_t num_keys_;
  int vec_size_;
  std::mutex submit_mutex_;
  const int num_threads_;
  std::vector<std::thread> threads_;
  std::vector<std::atomic<bool>> working_;
  volatile bool terminate_{false};
  std::atomic<int> num_finished_workers_{0};
};

template <typename key_type, typename index_type, typename vec_type = float>
class UvmTable {
 public:
  UvmTable(const size_t device_table_capacity, const size_t host_table_capacity,
           const int max_batch_size, const int vec_size,
           const vec_type default_value = (vec_type)0);
  ~UvmTable();
  void query(const key_type* d_keys, const int len, vec_type* d_vectors, hipStream_t stream = 0);
  void add(const key_type* h_keys, const vec_type* h_vectors, const size_t len);
  void clear(hipStream_t stream = 0);

 private:
  static constexpr int num_buffers_ = 2;

  key_type* d_keys_buffer_;
  vec_type* d_vectors_buffer_;
  vec_type* d_vectors_;
  index_type* d_output_indices_;
  index_type* d_output_host_indices_;
  index_type* h_output_host_indices_;

  key_type* d_missing_keys_;
  int* d_missing_positions_;
  int* d_missing_count_;

  std::vector<vec_type> h_vectors_;
  key_type* h_missing_keys_;

  hipStream_t query_stream_;
  hipEvent_t query_event_;

  vec_type* h_cpy_buffers_[num_buffers_];
  vec_type* d_cpy_buffers_[num_buffers_];
  hipStream_t cpy_streams_[num_buffers_];
  hipEvent_t cpy_events_[num_buffers_];

  std::unordered_map<key_type, index_type> h_final_missing_items_;

  int max_batch_size_;
  int vec_size_;
  size_t num_set_;
  size_t num_host_set_;
  size_t table_capacity_;
  std::vector<vec_type> default_vector_;

  HashBlock<key_type, index_type> device_table_;
  HashBlock<key_type, index_type> host_table_;
};

}  // namespace gpu_cache
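H2HCopy above implements a small host-side thread pool: copy() publishes a destination, a source, a row count and an index-mapping function, flips every worker's flag, and spin-waits until all workers report done, with each worker memcpy'ing its slice of rows. A standalone, single-threaded sketch of that gather step, independent of the class and for illustration only:

#include <cstring>
#include <functional>
#include <limits>

// Gathers num_keys rows of vec_size floats from src into dst, skipping rows whose mapped
// index is the "missing" sentinel. This mirrors the per-thread loop inside H2HCopy::copy.
void gather_rows(float* dst, const float* src, size_t num_keys, int vec_size,
                 const std::function<size_t(size_t)>& get_index) {
  for (size_t i = 0; i < num_keys; i++) {
    size_t idx_vec = get_index(i);
    if (idx_vec == std::numeric_limits<size_t>::max()) continue;  // missing row: leave dst as-is
    std::memcpy(dst + i * vec_size, src + idx_vec * vec_size, sizeof(float) * vec_size);
  }
}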
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt

@@ -15,15 +15,14 @@
 cmake_minimum_required(VERSION 3.8)

 file(GLOB gpu_cache_src
-  nv_gpu_cache.cu
-  static_table.cu
-  static_hash_table.cu
-  uvm_table.cu
+  nv_gpu_cache.hip
+  static_table.hip
+  static_hash_table.hip
+  uvm_table.hip
 )

 add_library(gpu_cache SHARED ${gpu_cache_src})
 target_compile_features(gpu_cache PUBLIC cxx_std_11)
-set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
+set_target_properties(gpu_cache PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON)
+# set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu → third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -14,7 +16,7 @@
  * limitations under the License.
  */

-#include <cooperative_groups.h>
+#include <hip/hip_cooperative_groups.h>
 #include <nv_gpu_cache.hpp>
 ...
@@ -29,9 +31,11 @@ __forceinline__ __device__ long long atomicAdd(long long* address, long long val
   return (long long)atomicAdd((unsigned long long*)address, (unsigned long long)val);
 }

+#ifndef __HIPCC__
 __forceinline__ __device__ unsigned long atomicAdd(unsigned long* address, unsigned long val) {
   return (unsigned long)atomicAdd((unsigned long long*)address, (unsigned long long)val);
 }
+#endif

 namespace gpu_cache {
 ...
@@ -1253,27 +1257,27 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
   }
   // Get the current CUDA dev
-  CUDA_CHECK(cudaGetDevice(&dev_));
+  CUDA_CHECK(hipGetDevice(&dev_));
   // Calculate # of slot
   num_slot_ = capacity_in_set_ * set_associativity * warp_size;
   // Allocate GPU memory for cache
-  CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
-  CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(atomic_ref_counter_type)));
+  CUDA_CHECK(hipMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&global_counter_, sizeof(atomic_ref_counter_type)));
   // Allocate GPU memory for set mutex
-  CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(mutex) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&set_mutex_, sizeof(mutex) * capacity_in_set_));
   // Initialize the cache, set all entry to unused <K,V>
-  init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL((init_cache), dim3(((num_slot_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
       keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_);
   // Wait for initialization to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipStreamSynchronize(0));
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
 ...
@@ -1301,27 +1305,27 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
   }
   // Get the current CUDA dev
-  CUDA_CHECK(cudaGetDevice(&dev_));
+  CUDA_CHECK(hipGetDevice(&dev_));
   // Calculate # of slot
   num_slot_ = capacity_in_set_ * set_associativity * warp_size;
   // Allocate GPU memory for cache
-  CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
-  CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(ref_counter_type)));
+  CUDA_CHECK(hipMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&global_counter_, sizeof(ref_counter_type)));
   // Allocate GPU memory for set mutex
-  CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(int) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&set_mutex_, sizeof(int) * capacity_in_set_));
   // Initialize the cache, set all entry to unused <K,V>
-  init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL((init_cache), dim3(((num_slot_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
       keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_);
   // Wait for initialization to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipStreamSynchronize(0));
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
@@ -1337,18 +1341,18 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
   dev_restorer.check_device(dev_);
   // Destruct CUDA std object
-  destruct_kernel<<<((capacity_in_set_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL((destruct_kernel), dim3(((capacity_in_set_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
       global_counter_, set_mutex_, capacity_in_set_);
   // Wait for destruction to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
+  CUDA_CHECK(hipStreamSynchronize(0));
   // Free GPU memory for cache
-  CUDA_CHECK(cudaFree(keys_));
-  CUDA_CHECK(cudaFree(vals_));
-  CUDA_CHECK(cudaFree(slot_counter_));
-  CUDA_CHECK(cudaFree(global_counter_));
+  CUDA_CHECK(hipFree(keys_));
+  CUDA_CHECK(hipFree(vals_));
+  CUDA_CHECK(hipFree(slot_counter_));
+  CUDA_CHECK(hipFree(global_counter_));
   // Free GPU memory for set mutex
-  CUDA_CHECK(cudaFree(set_mutex_));
+  CUDA_CHECK(hipFree(set_mutex_));
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
 ...
@@ -1362,12 +1366,12 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
   dev_restorer.check_device(dev_);
   // Free GPU memory for cache
-  CUDA_CHECK(cudaFree(keys_));
-  CUDA_CHECK(cudaFree(vals_));
-  CUDA_CHECK(cudaFree(slot_counter_));
-  CUDA_CHECK(cudaFree(global_counter_));
+  CUDA_CHECK(hipFree(keys_));
+  CUDA_CHECK(hipFree(vals_));
+  CUDA_CHECK(hipFree(slot_counter_));
+  CUDA_CHECK(hipFree(global_counter_));
   // Free GPU memory for set mutex
-  CUDA_CHECK(cudaFree(set_mutex_));
+  CUDA_CHECK(hipFree(set_mutex_));
 }
 #endif
 ...
@@ -1377,7 +1381,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Query(const key_type* d_keys, const size_t len, float* d_values,
                                    uint64_t* d_missing_index, key_type* d_missing_keys,
-                                   size_t* d_missing_len, cudaStream_t stream,
+                                   size_t* d_missing_len, hipStream_t stream,
                                    const size_t task_per_warp_tile) {
   // Device Restorer
   nv::CudaDeviceRestorer dev_restorer;
 ...
@@ -1387,27 +1391,27 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Check if it is a valid query
   if (len == 0) {
     // Set the d_missing_len to 0 before return
-    CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
+    CUDA_CHECK(hipMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
     return;
   }
   // Update the global counter as user perform a new(most recent) read operation to the cache
   // Resolve distance overflow issue as well.
-  update_kernel_overflow_ignore<atomic_ref_counter_type>
-      <<<1, 1, 0, stream>>>(global_counter_, d_missing_len);
+  hipLaunchKernelGGL((update_kernel_overflow_ignore<atomic_ref_counter_type>), dim3(1), dim3(1),
+                     0, stream, global_counter_, d_missing_len);
   // Read from the cache
   // Touch and refresh the hitting slot
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  get_kernel<key_type, ref_counter_type, atomic_ref_counter_type, slabset, set_hasher, slab_hasher,
-             mutex, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL((get_kernel<key_type, ref_counter_type, atomic_ref_counter_type, slabset, set_hasher,
+                                 slab_hasher, mutex, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
       d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len,
       global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_,
       task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
 ...
@@ -1415,7 +1419,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Query(const key_type* d_keys, const size_t len, float* d_values,
                                    uint64_t* d_missing_index, key_type* d_missing_keys,
-                                   size_t* d_missing_len, cudaStream_t stream,
+                                   size_t* d_missing_len, hipStream_t stream,
                                    const size_t task_per_warp_tile) {
   // Device Restorer
   nv::CudaDeviceRestorer dev_restorer;
 ...
@@ -1425,27 +1429,27 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Check if it is a valid query
   if (len == 0) {
     // Set the d_missing_len to 0 before return
-    CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
+    CUDA_CHECK(hipMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
     return;
   }
   // Update the global counter as user perform a new(most recent) read operation to the cache
   // Resolve distance overflow issue as well.
-  update_kernel_overflow_ignore<ref_counter_type>
-      <<<1, 1, 0, stream>>>(global_counter_, d_missing_len);
+  hipLaunchKernelGGL((update_kernel_overflow_ignore<ref_counter_type>), dim3(1), dim3(1),
+                     0, stream, global_counter_, d_missing_len);
   // Read from the cache
   // Touch and refresh the hitting slot
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  get_kernel<key_type, ref_counter_type, slabset, set_hasher, slab_hasher, empty_key,
-             set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL((get_kernel<key_type, ref_counter_type, slabset, set_hasher, slab_hasher, empty_key,
+                                 set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
       d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len,
       global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_,
       task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
@@ -1454,7 +1458,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
           int warp_size, typename set_hasher, typename slab_hasher>
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Replace(const key_type* d_keys, const size_t len,
-                                     const float* d_values, cudaStream_t stream,
+                                     const float* d_values, hipStream_t stream,
                                      const size_t task_per_warp_tile) {
   // Check if it is a valid replacement
   if (len == 0) {
 ...
@@ -1470,21 +1474,21 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Then replace the <k,v> pairs into the cache
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  insert_replace_kernel<key_type, slabset, ref_counter_type, mutex, atomic_ref_counter_type,
-                        set_hasher, slab_hasher, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_values, embedding_vec_size_, len, keys_,
+  hipLaunchKernelGGL((insert_replace_kernel<key_type, slabset, ref_counter_type, mutex, atomic_ref_counter_type,
+                                            set_hasher, slab_hasher, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_values, embedding_vec_size_, len, keys_,
                      vals_, slot_counter_, set_mutex_, global_counter_,
                      capacity_in_set_, task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
           int warp_size, typename set_hasher, typename slab_hasher>
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Replace(const key_type* d_keys, const size_t len,
-                                     const float* d_values, cudaStream_t stream,
+                                     const float* d_values, hipStream_t stream,
                                      const size_t task_per_warp_tile) {
   // Check if it is a valid replacement
   if (len == 0) {
 ...
@@ -1500,13 +1504,13 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Then replace the <k,v> pairs into the cache
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  insert_replace_kernel<key_type, slabset, ref_counter_type, set_hasher, slab_hasher, empty_key,
-                        set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL((insert_replace_kernel<key_type, slabset, ref_counter_type, set_hasher, slab_hasher, empty_key,
+                                            set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
       d_keys, d_values, embedding_vec_size_, len, keys_, vals_, slot_counter_, set_mutex_,
       global_counter_, capacity_in_set_, task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
@@ -1515,7 +1519,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
           int warp_size, typename set_hasher, typename slab_hasher>
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Update(const key_type* d_keys, const size_t len, const float* d_values,
-                                    cudaStream_t stream, const size_t task_per_warp_tile) {
+                                    hipStream_t stream, const size_t task_per_warp_tile) {
   // Check if it is a valid update request
   if (len == 0) {
     return;
 ...
@@ -1529,20 +1533,20 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Update the value of input keys that are existed in the cache
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  update_kernel<key_type, slabset, set_hasher, slab_hasher, mutex, empty_key, set_associativity,
-                warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL((update_kernel<key_type, slabset, set_hasher, slab_hasher, mutex, empty_key,
+                                    set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
       d_keys, len, d_values, embedding_vec_size_, capacity_in_set_, keys_, vals_, set_mutex_,
       task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
           int warp_size, typename set_hasher, typename slab_hasher>
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Update(const key_type* d_keys, const size_t len, const float* d_values,
-                                    cudaStream_t stream, const size_t task_per_warp_tile) {
+                                    hipStream_t stream, const size_t task_per_warp_tile) {
   // Check if it is a valid update request
   if (len == 0) {
     return;
 ...
@@ -1556,13 +1560,13 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Update the value of input keys that are existed in the cache
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  update_kernel<key_type, slabset, set_hasher, slab_hasher, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, len, d_values, embedding_vec_size_,
+  hipLaunchKernelGGL((update_kernel<key_type, slabset, set_hasher, slab_hasher, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, len, d_values, embedding_vec_size_,
                                               capacity_in_set_, keys_, vals_, set_mutex_,
                                               task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
@@ -1572,7 +1576,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Dump(key_type* d_keys, size_t* d_dump_counter,
                                   const size_t start_set_index, const size_t end_set_index,
-                                  cudaStream_t stream) {
+                                  hipStream_t stream) {
   // Check if it is a valid dump request
   if (start_set_index >= capacity_in_set_) {
     printf("Error: Invalid value for start_set_index. Nothing dumped.\n");
 ...
@@ -1589,17 +1593,17 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   dev_restorer.check_device(dev_);
   // Set the global counter to 0 first
-  CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
+  CUDA_CHECK(hipMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
   // Dump keys from the cache
   const size_t grid_size =
       (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1;
-  dump_kernel<key_type, slabset, mutex, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_dump_counter, keys_, set_mutex_,
+  hipLaunchKernelGGL((dump_kernel<key_type, slabset, mutex, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_dump_counter, keys_, set_mutex_,
                                               start_set_index, end_set_index);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
 ...
@@ -1607,7 +1611,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Dump(key_type* d_keys, size_t* d_dump_counter,
                                   const size_t start_set_index, const size_t end_set_index,
-                                  cudaStream_t stream) {
+                                  hipStream_t stream) {
   // Check if it is a valid dump request
   if (start_set_index >= capacity_in_set_) {
     printf("Error: Invalid value for start_set_index. Nothing dumped.\n");
 ...
@@ -1624,17 +1628,17 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   dev_restorer.check_device(dev_);
   // Set the global counter to 0 first
-  CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
+  CUDA_CHECK(hipMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
   // Dump keys from the cache
   const size_t grid_size =
       (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1;
-  dump_kernel<key_type, slabset, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_dump_counter, keys_, set_mutex_,
+  hipLaunchKernelGGL((dump_kernel<key_type, slabset, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_dump_counter, keys_, set_mutex_,
                                               start_set_index, end_set_index);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
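Every kernel launch in this file follows the same rewrite: the CUDA triple-chevron syntax becomes hipLaunchKernelGGL with explicit dim3 grid and block dimensions, a shared-memory byte count, and a stream argument. A minimal self-contained sketch of that conversion (the kernel below is illustrative, not one of the cache kernels):

#include <hip/hip_runtime.h>

__global__ void fill_kernel(int* data, int n, int value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

int main() {
  const int n = 1024;
  int* d_data = nullptr;
  hipMalloc(&d_data, sizeof(int) * n);
  // CUDA form: fill_kernel<<<(n - 1) / 256 + 1, 256, 0, 0>>>(d_data, n, 42);
  // Equivalent HIP form produced by hipify:
  hipLaunchKernelGGL(fill_kernel, dim3((n - 1) / 256 + 1), dim3(256), 0, 0, d_data, n, 42);
  hipStreamSynchronize(0);
  hipFree(d_data);
  return 0;
}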
third_party/HugeCTR/gpu_cache/src/static_hash_table.cu → third_party/HugeCTR/gpu_cache/src/static_hash_table.hip

+// !!! This is a file automatically generated by hipify!!!
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -14,8 +15,8 @@
  * limitations under the License.
  */

-#include <cooperative_groups.h>
-#include <cuda.h>
+#include <hip/hip_cooperative_groups.h>
+#include <hip/hip_runtime.h>
 #include <stdint.h>
 #include <stdio.h>
 ...
@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c
   // otherwise return invalid_slot.
   const size_type num_groups = capacity / group_size;

-#if (CUDA_VERSION < 11060)
+#if (DTK_VERSION < 11060)
   unsigned long long num_threads_per_group = cg.size();
 #else
   unsigned long long num_threads_per_group = cg.num_threads();
 ...
@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c
   const size_type num_groups = capacity / group_size;

-#if (CUDA_VERSION < 11060)
+#if (DTK_VERSION < 11060)
   unsigned long long num_threads_per_group = cg.size();
 #else
   unsigned long long num_threads_per_group = cg.num_threads();
 ...
@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash
   size_t align_m = 16;
   size_t num_keys = key_capacity_ + 1;
   size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m;
-  CUDA_CHECK(cudaMalloc(&table_keys_, sizeof(key_type) * num_keys));
-  CUDA_CHECK(cudaMalloc(&table_indices_, sizeof(size_type) * num_keys));
-  CUDA_CHECK(cudaMalloc(&table_values_, sizeof(value_type) * num_values));
+  CUDA_CHECK(hipMalloc(&table_keys_, sizeof(key_type) * num_keys));
+  CUDA_CHECK(hipMalloc(&table_indices_, sizeof(size_type) * num_keys));
+  CUDA_CHECK(hipMalloc(&table_values_, sizeof(value_type) * num_values));

   // Initialize table_keys_
-  CUDA_CHECK(cudaMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
-  CUDA_CHECK(cudaMemset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
+  CUDA_CHECK(hipMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
+  CUDA_CHECK(hipMemset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
 }

 template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
           typename hasher>
 void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert(
-    const key_type *keys, const value_type *values, size_type num_keys, cudaStream_t stream) {
+    const key_type *keys, const value_type *values, size_type num_keys, hipStream_t stream) {
   if (num_keys == 0) {
     return;
   }
 ...
@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
   // Insert keys
   constexpr int block = 256;
   int grid = (num_keys - 1) / block + 1;
-  InsertKeyKernel<tile_size, group_size>
-      <<<grid, block, 0, stream>>>(table_keys_, table_indices_, key_capacity_, keys, num_keys,
+  hipLaunchKernelGGL((InsertKeyKernel<tile_size, group_size>), dim3(grid), dim3(block), 0, stream,
+                     table_keys_, table_indices_, key_capacity_, keys, num_keys,
                      size_, hash_, empty_key, invalid_slot);
   // Copy values
-  CUDA_CHECK(cudaMemcpyAsync(table_values_ + size_ * value_dim_, values,
-                             sizeof(value_type) * num_keys * value_dim_, cudaMemcpyDeviceToDevice,
+  CUDA_CHECK(hipMemcpyAsync(table_values_ + size_ * value_dim_, values,
+                            sizeof(value_type) * num_keys * value_dim_, hipMemcpyDeviceToDevice,
                             stream));
   size_ += num_keys;
 }
 ...
@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
 template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
           typename hasher>
 void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear(
-    cudaStream_t stream) {
-  CUDA_CHECK(cudaMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
-  CUDA_CHECK(cudaMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
+    hipStream_t stream) {
+  CUDA_CHECK(hipMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
+  CUDA_CHECK(hipMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
   size_ = 0;
 }

 template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
           typename hasher>
 StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() {
-  CUDA_CHECK(cudaFree(table_keys_));
-  CUDA_CHECK(cudaFree(table_indices_));
-  CUDA_CHECK(cudaFree(table_values_));
+  CUDA_CHECK(hipFree(table_keys_));
+  CUDA_CHECK(hipFree(table_indices_));
+  CUDA_CHECK(hipFree(table_values_));
 }

 template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
           typename hasher>
 void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup(
     const key_type *keys, value_type *values, int num_keys, value_type default_value,
-    cudaStream_t stream) {
+    hipStream_t stream) {
   if (num_keys == 0) {
     return;
   }
 ...
@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku
   constexpr int block = 256;
   const int grid = (num_keys - 1) / block + 1;
   // Lookup keys
-  LookupKernel<tile_size, group_size><<<grid, block, 0, stream>>>(
+  hipLaunchKernelGGL((LookupKernel<tile_size, group_size>), dim3(grid), dim3(block), 0, stream,
       table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values,
       hash_, empty_key, default_value, invalid_slot);
 }
 ...
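Besides the key-insertion kernel, the only data movement in insert() is a device-to-device copy that appends the new value rows behind the rows already stored, issued asynchronously on the caller's stream. A small sketch of that pattern with hipMemcpyAsync (the function and buffer names here are illustrative, not from the file):

#include <hip/hip_runtime.h>

// Appends new_rows rows of value_dim floats behind the stored_rows already held in table_values.
void append_value_rows(float* table_values, size_t stored_rows, const float* new_values,
                       size_t new_rows, int value_dim, hipStream_t stream) {
  hipMemcpyAsync(table_values + stored_rows * value_dim, new_values,
                 sizeof(float) * new_rows * value_dim, hipMemcpyDeviceToDevice, stream);
}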
third_party/HugeCTR/gpu_cache/src/static_table.cu → third_party/HugeCTR/gpu_cache/src/static_table.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -14,7 +16,7 @@
  * limitations under the License.
  */

-#include <cooperative_groups.h>
+#include <hip/hip_cooperative_groups.h>
 #include <nv_util.h>
 #include <iostream>
 ...
@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed
 template <typename key_type>
 void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values,
-                                   cudaStream_t stream) {
+                                   hipStream_t stream) {
   static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream);
 }

 template <typename key_type>
 void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values,
-                                  cudaStream_t stream) {
+                                  hipStream_t stream) {
   static_hash_table_.insert(d_keys, d_values, len, stream);
 }

 template <typename key_type>
-void static_table<key_type>::Clear(cudaStream_t stream) {
+void static_table<key_type>::Clear(hipStream_t stream) {
   static_hash_table_.clear(stream);
 }
 ...
third_party/HugeCTR/gpu_cache/src/uvm_table.cu → third_party/HugeCTR/gpu_cache/src/uvm_table.hip

(Whole-file rewrite. The listing below reconstructs the original CUDA version shown as the removed side of the diff.)

/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cooperative_groups.h>
#include <cuda_runtime_api.h>
#include <immintrin.h>

#include <atomic>
#include <iostream>
#include <limits>
#include <mutex>
#include <uvm_table.hpp>

namespace cg = cooperative_groups;

namespace {
constexpr int set_size = 4;
constexpr int block_size = 256;

template <typename key_type>
__host__ __device__ key_type hash(key_type key) {
  return key;
}

template <typename key_type>
__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys,
                                const int num_sets, int* set_sizes, const int max_set_size,
                                key_type* missing_keys, int* num_missing_keys) {
  __shared__ key_type s_missing_keys[block_size];
  __shared__ int s_missing_count;
  __shared__ size_t s_missing_idx;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  if (block.thread_rank() == 0) {
    s_missing_count = 0;
  }
  block.sync();
  size_t idx = grid.thread_rank();
  if (idx < num_keys) {
    auto key = new_keys[idx];
    size_t idx_set = hash(key) % num_sets;
    int prev_set_size = atomicAdd(&set_sizes[idx_set], 1);
    if (prev_set_size < max_set_size) {
      keys[idx_set * max_set_size + prev_set_size] = key;
    } else {
      int count = atomicAdd(&s_missing_count, 1);
      s_missing_keys[count] = key;
    }
  }
  block.sync();
  if (block.thread_rank() == 0) {
    s_missing_idx = atomicAdd(num_missing_keys, s_missing_count);
  }
  block.sync();
  for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
    missing_keys[s_missing_idx + i] = s_missing_keys[i];
  }
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr,
                                  const key_type* keys, const size_t num_sets,
                                  const int max_set_size, index_type* output_indices) {
  constexpr int tile_size = set_size;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<tile_size>(block);
  int num_keys = *num_keys_ptr;
  if (num_keys == 0) return;

#if (CUDA_VERSION < 11060)
  size_t num_threads_per_grid = grid.size();
#else
  size_t num_threads_per_grid = grid.num_threads();
#endif
  size_t step = (num_keys - 1) / num_threads_per_grid + 1;
  for (size_t i = 0; i < step; i++) {
    size_t idx = i * num_threads_per_grid + grid.thread_rank();
    key_type query_key = std::numeric_limits<key_type>::max();
    if (idx < num_keys) {
      query_key = query_keys[idx];
    }
    auto idx_set = hash(query_key) % num_sets;
    for (int j = 0; j < tile_size; j++) {
      auto current_idx_set = tile.shfl(idx_set, j);
      auto current_query_key = tile.shfl(query_key, j);
      if (current_query_key == std::numeric_limits<key_type>::max()) {
        continue;
      }
      auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
      int existed = tile.ballot(current_query_key == candidate_key);
      auto current_idx = tile.shfl(idx, 0) + j;
      if (existed) {
        int src_lane = __ffs(existed) - 1;
        size_t found_idx = current_idx_set * set_size + src_lane;
        output_indices[current_idx] = num_sets * src_lane + current_idx_set;
      } else {
        output_indices[current_idx] = std::numeric_limits<index_type>::max();
      }
    }
  }
}

template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys,
                                  const key_type* keys, const size_t num_sets,
                                  const int max_set_size, index_type* output_indices,
                                  key_type* missing_keys, int* missing_positions,
                                  int* missing_count) {
  __shared__ key_type s_missing_keys[block_size];
  __shared__ key_type s_missing_positions[block_size];
  __shared__ int s_missing_count;
  __shared__ int s_missing_idx;
  constexpr int tile_size = set_size;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<tile_size>(block);
  if (block.thread_rank() == 0) {
    s_missing_count = 0;
  }
  block.sync();
  size_t idx = grid.thread_rank();
  key_type query_key = std::numeric_limits<key_type>::max();
  if (idx < num_keys) {
    query_key = query_keys[idx];
  }
  auto idx_set = hash(query_key) % num_sets;
  for (int j = 0; j < tile_size; j++) {
    auto current_idx_set = tile.shfl(idx_set, j);
    auto current_query_key = tile.shfl(query_key, j);
    if (current_query_key == std::numeric_limits<key_type>::max()) {
      continue;
    }
    auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
    int existed = tile.ballot(current_query_key == candidate_key);
    if (existed) {
      int src_lane = __ffs(existed) - 1;
      size_t found_idx = current_idx_set * set_size + src_lane;
      output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set;
    } else {
      auto current_idx = tile.shfl(idx, 0) + j;
      output_indices[current_idx] = std::numeric_limits<index_type>::max();
      if (tile.thread_rank() == 0) {
        int s_count = atomicAdd(&s_missing_count, 1);
        s_missing_keys[s_count] = current_query_key;
        s_missing_positions[s_count] = current_idx;
      }
    }
  }
  if (missing_keys == nullptr) {
    if (grid.thread_rank() == 0 && missing_count) {
      *missing_count = 0;
    }
    return;
  }
  block.sync();
  if (block.thread_rank() == 0) {
    s_missing_idx = atomicAdd(missing_count, s_missing_count);
  }
  block.sync();
  for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
    missing_keys[s_missing_idx + i] = s_missing_keys[i];
    missing_positions[s_missing_idx + i] = s_missing_positions[i];
  }
}
template <int warp_size>
__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx,
                                               const size_t emb_vec_size_in_float,
                                               volatile float* d_dst, const float* d_src) {
  // 16 bytes align
  if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) {
#pragma unroll
    for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) {
      d_dst[i] = d_src[i];
    }
  } else {
#pragma unroll
    for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) {
      *(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4));
    }
  }
}

template <typename index_type, typename vec_type>
__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys,
                                    const vec_type* vectors, const int vec_size,
                                    vec_type* output_vectors) {
  constexpr int warp_size = 32;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
  auto num_threads_per_grid = grid.size();
#else
  auto num_threads_per_grid = grid.num_threads();
#endif
  for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
    int key_num = step * num_threads_per_grid + grid.thread_rank();
    index_type idx = std::numeric_limits<index_type>::max();
    if (key_num < num_keys) {
      idx = query_indices[key_num];
    }
#pragma unroll 4
    for (size_t j = 0; j < warp_size; j++) {
      index_type current_idx = tile.shfl(idx, j);
      index_type idx_write = tile.shfl(key_num, 0) + j;
      if (current_idx == std::numeric_limits<index_type>::max()) continue;
      warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
                                output_vectors + idx_write * vec_size,
                                vectors + current_idx * vec_size);
    }
  }
}

template <typename index_type, typename vec_type>
__global__ void distribute_vectors_kernel(const index_type* postions, const size_t num_keys,
                                          const vec_type* vectors, const int vec_size,
                                          vec_type* output_vectors) {
  constexpr int warp_size = 32;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
  auto num_threads_per_grid = grid.size();
#else
  auto num_threads_per_grid = grid.num_threads();
#endif
  for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
    size_t key_num = step * num_threads_per_grid + grid.thread_rank();
    index_type idx = std::numeric_limits<index_type>::max();
    if (key_num < num_keys) {
      idx = postions[key_num];
    }
#pragma unroll 4
    for (size_t j = 0; j < warp_size; j++) {
      size_t idx_write = tile.shfl(idx, j);
      size_t idx_read = tile.shfl(key_num, 0) + j;
      if (idx_write == std::numeric_limits<index_type>::max()) continue;
      warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
                                output_vectors + (size_t)idx_write * vec_size,
                                vectors + (size_t)idx_read * vec_size);
    }
  }
}

}  // namespace
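Before the gpu_cache implementation below, it is worth spelling out what hash_add_kernel above does: each key hashes to one of num_sets fixed-size sets, claims a slot in that set with an atomic counter, and is reported back as missing if the set is already full. A standalone host-side mirror of that placement rule, for illustration only and not part of the file:

#include <cstdint>
#include <vector>

struct SetAssocTable {
  std::vector<int64_t> keys;   // num_sets * max_set_size slots
  std::vector<int> set_sizes;  // current fill of each set
  size_t num_sets;
  int max_set_size;

  SetAssocTable(size_t num_sets, int max_set_size)
      : keys(num_sets * max_set_size), set_sizes(num_sets, 0),
        num_sets(num_sets), max_set_size(max_set_size) {}

  // Returns true if the key found a slot, false if its set was full (the key is "missing").
  bool add(int64_t key) {
    size_t idx_set = static_cast<size_t>(key) % num_sets;  // hash(key) is the identity here
    int prev = set_sizes[idx_set]++;
    if (prev < max_set_size) {
      keys[idx_set * max_set_size + prev] = key;
      return true;
    }
    return false;
  }
};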
namespace gpu_cache {

template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity,
                                                   const size_t host_table_capacity,
                                                   const int max_batch_size, const int vec_size,
                                                   const vec_type default_value)
    : max_batch_size_(std::max(100000, max_batch_size)),
      vec_size_(vec_size),
      num_set_((device_table_capacity - 1) / set_size + 1),
      num_host_set_((host_table_capacity - 1) / set_size + 1),
      table_capacity_(num_set_ * set_size),
      default_vector_(vec_size, default_value),
      device_table_(device_table_capacity, set_size, max_batch_size_),
      host_table_(host_table_capacity * 1.3, set_size, max_batch_size_) {
  CUDA_CHECK(cudaMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_));
  CUDA_CHECK(cudaMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_));
  CUDA_CHECK(cudaMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(cudaMallocHost(&h_output_host_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_missing_count_, sizeof(int)));
  CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(int)));
  CUDA_CHECK(cudaStreamCreate(&query_stream_));

  for (int i = 0; i < num_buffers_; i++) {
    int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_);
    CUDA_CHECK(cudaMallocHost(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
    CUDA_CHECK(cudaMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
    CUDA_CHECK(cudaStreamCreate(&cpy_streams_[i]));
    CUDA_CHECK(cudaEventCreate(&cpy_events_[i]));
  }
  CUDA_CHECK(cudaMallocHost(&h_missing_keys_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(cudaEventCreate(&query_event_));
  h_vectors_.resize(host_table_.capacity * vec_size_);
}
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
>
void
UvmTable
<
key_type
,
index_type
,
vec_type
>::
add
(
const
key_type
*
h_keys
,
const
vec_type
*
h_vectors
,
const
size_t
num_keys
)
{
std
::
vector
<
key_type
>
h_missing_keys
;
size_t
num_batches
=
(
num_keys
-
1
)
/
max_batch_size_
+
1
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
max_batch_size_
:
num_keys
-
i
*
max_batch_size_
;
CUDA_CHECK
(
cudaMemcpy
(
d_keys_buffer_
,
h_keys
+
i
*
max_batch_size_
,
sizeof
(
*
d_keys_buffer_
)
*
this_batch_size
,
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaMemset
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
)));
device_table_
.
add
(
d_keys_buffer_
,
this_batch_size
,
d_missing_keys_
,
d_missing_count_
,
0
);
CUDA_CHECK
(
cudaDeviceSynchronize
());
int
num_missing_keys
;
CUDA_CHECK
(
cudaMemcpy
(
&
num_missing_keys
,
d_missing_count_
,
sizeof
(
num_missing_keys
),
cudaMemcpyDeviceToHost
));
size_t
prev_size
=
h_missing_keys
.
size
();
h_missing_keys
.
resize
(
prev_size
+
num_missing_keys
);
CUDA_CHECK
(
cudaMemcpy
(
h_missing_keys
.
data
()
+
prev_size
,
d_missing_keys_
,
sizeof
(
*
d_missing_keys_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
));
}
std
::
vector
<
key_type
>
h_final_missing_keys
;
num_batches
=
h_missing_keys
.
size
()
?
(
h_missing_keys
.
size
()
-
1
)
/
max_batch_size_
+
1
:
0
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
max_batch_size_
:
h_missing_keys
.
size
()
-
i
*
max_batch_size_
;
CUDA_CHECK
(
cudaMemcpy
(
d_keys_buffer_
,
h_missing_keys
.
data
()
+
i
*
max_batch_size_
,
sizeof
(
*
d_keys_buffer_
)
*
this_batch_size
,
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaMemset
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
)));
host_table_
.
add
(
d_keys_buffer_
,
this_batch_size
,
d_missing_keys_
,
d_missing_count_
,
0
);
CUDA_CHECK
(
cudaDeviceSynchronize
());
int
num_missing_keys
;
CUDA_CHECK
(
cudaMemcpy
(
&
num_missing_keys
,
d_missing_count_
,
sizeof
(
num_missing_keys
),
cudaMemcpyDeviceToHost
));
size_t
prev_size
=
h_final_missing_keys
.
size
();
h_final_missing_keys
.
resize
(
prev_size
+
num_missing_keys
);
CUDA_CHECK
(
cudaMemcpy
(
h_final_missing_keys
.
data
()
+
prev_size
,
d_missing_keys_
,
sizeof
(
*
d_missing_keys_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
));
}
std
::
vector
<
key_type
>
h_keys_buffer
(
max_batch_size_
);
std
::
vector
<
index_type
>
h_indices_buffer
(
max_batch_size_
);
std
::
vector
<
int
>
h_positions_buffer
(
max_batch_size_
);
num_batches
=
(
num_keys
-
1
)
/
max_batch_size_
+
1
;
size_t
num_hit_keys
=
0
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
max_batch_size_
:
num_keys
-
i
*
max_batch_size_
;
CUDA_CHECK
(
cudaMemcpy
(
d_keys_buffer_
,
h_keys
+
i
*
max_batch_size_
,
sizeof
(
*
d_keys_buffer_
)
*
this_batch_size
,
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaMemset
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
)));
device_table_
.
query
(
d_keys_buffer_
,
this_batch_size
,
d_output_indices_
,
d_missing_keys_
,
d_missing_positions_
,
d_missing_count_
,
0
);
CUDA_CHECK
(
cudaStreamSynchronize
(
0
));
CUDA_CHECK
(
cudaMemcpy
(
d_vectors_buffer_
,
h_vectors
+
i
*
max_batch_size_
*
vec_size_
,
sizeof
(
*
d_vectors_
)
*
this_batch_size
*
vec_size_
,
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaStreamSynchronize
(
0
));
if
(
num_hit_keys
<
device_table_
.
capacity
)
{
distribute_vectors_kernel
<<<
(
this_batch_size
-
1
)
/
block_size
+
1
,
block_size
,
0
,
0
>>>
(
d_output_indices_
,
this_batch_size
,
d_vectors_buffer_
,
vec_size_
,
d_vectors_
);
CUDA_CHECK
(
cudaStreamSynchronize
(
0
));
}
int
num_missing_keys
;
CUDA_CHECK
(
cudaMemcpy
(
&
num_missing_keys
,
d_missing_count_
,
sizeof
(
num_missing_keys
),
cudaMemcpyDeviceToHost
));
num_hit_keys
+=
this_batch_size
-
num_missing_keys
;
host_table_
.
query
(
d_missing_keys_
,
num_missing_keys
,
d_output_indices_
,
nullptr
,
nullptr
,
nullptr
,
0
);
CUDA_CHECK
(
cudaMemcpy
(
h_keys_buffer
.
data
(),
d_missing_keys_
,
sizeof
(
*
d_missing_keys_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
))
CUDA_CHECK
(
cudaMemcpy
(
h_indices_buffer
.
data
(),
d_output_indices_
,
sizeof
(
*
d_output_indices_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
))
CUDA_CHECK
(
cudaMemcpy
(
h_positions_buffer
.
data
(),
d_missing_positions_
,
sizeof
(
*
d_missing_positions_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
))
for
(
int
j
=
0
;
j
<
num_missing_keys
;
j
++
)
{
if
(
h_indices_buffer
[
j
]
!=
std
::
numeric_limits
<
index_type
>::
max
())
{
memcpy
(
h_vectors_
.
data
()
+
h_indices_buffer
[
j
]
*
vec_size_
,
h_vectors
+
(
i
*
max_batch_size_
+
h_positions_buffer
[
j
])
*
vec_size_
,
sizeof
(
*
h_vectors
)
*
vec_size_
);
}
else
{
size_t
prev_idx
=
h_vectors_
.
size
()
/
vec_size_
;
h_final_missing_items_
.
emplace
(
h_keys_buffer
[
j
],
prev_idx
);
h_vectors_
.
resize
(
h_vectors_
.
size
()
+
vec_size_
);
memcpy
(
h_vectors_
.
data
()
+
prev_idx
*
vec_size_
,
h_vectors
+
(
i
*
max_batch_size_
+
h_positions_buffer
[
j
])
*
vec_size_
,
sizeof
(
*
h_vectors
)
*
vec_size_
);
}
}
}
CUDA_CHECK
(
cudaMemset
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
)));
}
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
>
void
UvmTable
<
key_type
,
index_type
,
vec_type
>::
query
(
const
key_type
*
d_keys
,
const
int
num_keys
,
vec_type
*
d_vectors
,
cudaStream_t
stream
)
{
if
(
!
num_keys
)
return
;
CUDA_CHECK
(
cudaEventRecord
(
query_event_
,
stream
));
CUDA_CHECK
(
cudaStreamWaitEvent
(
query_stream_
,
query_event_
));
static_assert
(
num_buffers_
>=
2
);
device_table_
.
query
(
d_keys
,
num_keys
,
d_output_indices_
,
d_missing_keys_
,
d_missing_positions_
,
d_missing_count_
,
query_stream_
);
CUDA_CHECK
(
cudaEventRecord
(
query_event_
,
query_stream_
));
CUDA_CHECK
(
cudaStreamWaitEvent
(
cpy_streams_
[
0
],
query_event_
));
int
num_missing_keys
;
CUDA_CHECK
(
cudaMemcpyAsync
(
&
num_missing_keys
,
d_missing_count_
,
sizeof
(
*
d_missing_count_
),
cudaMemcpyDeviceToHost
,
cpy_streams_
[
0
]));
host_table_
.
query
(
d_missing_keys_
,
d_missing_count_
,
d_output_host_indices_
,
query_stream_
);
CUDA_CHECK
(
cudaStreamSynchronize
(
cpy_streams_
[
0
]));
CUDA_CHECK
(
cudaMemsetAsync
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
),
query_stream_
));
CUDA_CHECK
(
cudaMemcpyAsync
(
h_output_host_indices_
,
d_output_host_indices_
,
sizeof
(
index_type
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
,
query_stream_
));
CUDA_CHECK
(
cudaMemcpyAsync
(
h_missing_keys_
,
d_missing_keys_
,
sizeof
(
key_type
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
,
cpy_streams_
[
0
]));
read_vectors_kernel
<<<
(
num_keys
-
1
)
/
block_size
+
1
,
block_size
,
0
,
cpy_streams_
[
1
]
>>>
(
d_output_indices_
,
num_keys
,
d_vectors_
,
vec_size_
,
d_vectors
);
CUDA_CHECK
(
cudaStreamSynchronize
(
query_stream_
));
CUDA_CHECK
(
cudaStreamSynchronize
(
cpy_streams_
[
0
]));
int
num_keys_per_buffer
=
ceil
(
1.0
*
num_missing_keys
/
num_buffers_
);
for
(
int
buffer_num
=
0
;
buffer_num
<
num_buffers_
;
buffer_num
++
)
{
int
num_keys_this_buffer
=
buffer_num
!=
num_buffers_
-
1
?
num_keys_per_buffer
:
num_missing_keys
-
num_keys_per_buffer
*
buffer_num
;
if
(
!
num_keys_this_buffer
)
break
;
#pragma omp parallel for num_threads(8)
for
(
size_t
i
=
0
;
i
<
static_cast
<
size_t
>
(
num_keys_this_buffer
);
i
++
)
{
size_t
idx_key
=
buffer_num
*
num_keys_per_buffer
+
i
;
index_type
index
=
h_output_host_indices_
[
idx_key
];
if
(
index
==
std
::
numeric_limits
<
index_type
>::
max
())
{
key_type
key
=
h_missing_keys_
[
idx_key
];
auto
iterator
=
h_final_missing_items_
.
find
(
key
);
if
(
iterator
!=
h_final_missing_items_
.
end
())
{
index
=
iterator
->
second
;
}
}
if
(
index
!=
std
::
numeric_limits
<
index_type
>::
max
())
{
memcpy
(
h_cpy_buffers_
[
buffer_num
]
+
i
*
vec_size_
,
h_vectors_
.
data
()
+
index
*
vec_size_
,
sizeof
(
vec_type
)
*
vec_size_
);
}
else
{
memcpy
(
h_cpy_buffers_
[
buffer_num
]
+
i
*
vec_size_
,
default_vector_
.
data
(),
sizeof
(
vec_type
)
*
vec_size_
);
}
}
CUDA_CHECK
(
cudaMemcpyAsync
(
d_cpy_buffers_
[
buffer_num
],
h_cpy_buffers_
[
buffer_num
],
sizeof
(
vec_type
)
*
num_keys_this_buffer
*
vec_size_
,
cudaMemcpyHostToDevice
,
cpy_streams_
[
buffer_num
]));
distribute_vectors_kernel
<<<
(
num_keys_this_buffer
-
1
)
/
block_size
+
1
,
block_size
,
0
,
cpy_streams_
[
buffer_num
]
>>>
(
d_missing_positions_
+
buffer_num
*
num_keys_per_buffer
,
num_keys_this_buffer
,
d_cpy_buffers_
[
buffer_num
],
vec_size_
,
d_vectors
);
}
for
(
int
i
=
0
;
i
<
num_buffers_
;
i
++
)
{
CUDA_CHECK
(
cudaEventRecord
(
cpy_events_
[
i
],
cpy_streams_
[
i
]));
CUDA_CHECK
(
cudaStreamWaitEvent
(
stream
,
cpy_events_
[
i
]));
}
}
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
>
void
UvmTable
<
key_type
,
index_type
,
vec_type
>::
clear
(
cudaStream_t
stream
)
{
device_table_
.
clear
(
stream
);
host_table_
.
clear
(
stream
);
}
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
>
UvmTable
<
key_type
,
index_type
,
vec_type
>::~
UvmTable
()
{
CUDA_CHECK
(
cudaFree
(
d_keys_buffer_
));
CUDA_CHECK
(
cudaFree
(
d_vectors_buffer_
));
CUDA_CHECK
(
cudaFree
(
d_vectors_
));
CUDA_CHECK
(
cudaFree
(
d_output_indices_
));
CUDA_CHECK
(
cudaFree
(
d_output_host_indices_
));
CUDA_CHECK
(
cudaFreeHost
(
h_output_host_indices_
));
CUDA_CHECK
(
cudaFree
(
d_missing_keys_
));
CUDA_CHECK
(
cudaFree
(
d_missing_positions_
));
CUDA_CHECK
(
cudaFree
(
d_missing_count_
));
CUDA_CHECK
(
cudaFreeHost
(
h_missing_keys_
));
CUDA_CHECK
(
cudaStreamDestroy
(
query_stream_
));
CUDA_CHECK
(
cudaEventDestroy
(
query_event_
));
for
(
int
i
=
0
;
i
<
num_buffers_
;
i
++
)
{
CUDA_CHECK
(
cudaFreeHost
(
h_cpy_buffers_
[
i
]));
CUDA_CHECK
(
cudaFree
(
d_cpy_buffers_
[
i
]));
CUDA_CHECK
(
cudaStreamDestroy
(
cpy_streams_
[
i
]));
CUDA_CHECK
(
cudaEventDestroy
(
cpy_events_
[
i
]));
}
}
template
<
typename
key_type
,
typename
index_type
>
HashBlock
<
key_type
,
index_type
>::
HashBlock
(
size_t
expected_capacity
,
int
set_size
,
int
batch_size
)
:
max_set_size_
(
set_size
),
batch_size_
(
batch_size
)
{
if
(
expected_capacity
)
{
num_sets
=
(
expected_capacity
-
1
)
/
set_size
+
1
;
}
else
{
num_sets
=
10000
;
}
capacity
=
num_sets
*
set_size
;
CUDA_CHECK
(
cudaMalloc
(
&
keys
,
sizeof
(
*
keys
)
*
capacity
));
CUDA_CHECK
(
cudaMalloc
(
&
set_sizes_
,
sizeof
(
*
set_sizes_
)
*
num_sets
));
CUDA_CHECK
(
cudaMemset
(
set_sizes_
,
0
,
sizeof
(
*
set_sizes_
)
*
num_sets
));
}
template
<
typename
key_type
,
typename
index_type
>
HashBlock
<
key_type
,
index_type
>::~
HashBlock
()
{
CUDA_CHECK
(
cudaFree
(
keys
));
CUDA_CHECK
(
cudaFree
(
set_sizes_
));
}
template
<
typename
key_type
,
typename
index_type
>
void
HashBlock
<
key_type
,
index_type
>::
query
(
const
key_type
*
query_keys
,
const
size_t
num_keys
,
index_type
*
output_indices
,
key_type
*
missing_keys
,
int
*
missing_positions
,
int
*
num_missing_keys
,
cudaStream_t
stream
)
{
if
(
num_keys
==
0
)
{
return
;
}
size_t
num_batches
=
(
num_keys
-
1
)
/
batch_size_
+
1
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
batch_size_
:
num_keys
-
i
*
batch_size_
;
hash_query_kernel
<<<
(
this_batch_size
-
1
)
/
block_size
+
1
,
block_size
,
0
,
stream
>>>
(
query_keys
,
this_batch_size
,
keys
,
num_sets
,
max_set_size_
,
output_indices
,
missing_keys
,
missing_positions
,
num_missing_keys
);
}
}
template
<
typename
key_type
,
typename
index_type
>
void
HashBlock
<
key_type
,
index_type
>::
query
(
const
key_type
*
query_keys
,
int
*
num_keys
,
index_type
*
output_indices
,
cudaStream_t
stream
)
{
hash_query_kernel
<<<
128
,
64
,
0
,
stream
>>>
(
query_keys
,
num_keys
,
keys
,
num_sets
,
max_set_size_
,
output_indices
);
}
template
<
typename
key_type
,
typename
index_type
>
void
HashBlock
<
key_type
,
index_type
>::
add
(
const
key_type
*
new_keys
,
const
size_t
num_keys
,
key_type
*
missing_keys
,
int
*
num_missing_keys
,
cudaStream_t
stream
)
{
if
(
num_keys
==
0
)
{
return
;
}
size_t
num_batches
=
(
num_keys
-
1
)
/
batch_size_
+
1
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
batch_size_
:
num_keys
-
i
*
batch_size_
;
hash_add_kernel
<<<
(
this_batch_size
-
1
)
/
block_size
+
1
,
block_size
,
0
,
stream
>>>
(
new_keys
+
i
*
this_batch_size
,
this_batch_size
,
keys
,
num_sets
,
set_sizes_
,
max_set_size_
,
missing_keys
,
num_missing_keys
);
}
}
template
<
typename
key_type
,
typename
index_type
>
void
HashBlock
<
key_type
,
index_type
>::
clear
(
cudaStream_t
stream
)
{
CUDA_CHECK
(
cudaMemsetAsync
(
set_sizes_
,
0
,
sizeof
(
*
set_sizes_
)
*
num_sets
,
stream
));
}
template
class
HashBlock
<
int
,
size_t
>;
template
class
HashBlock
<
int64_t
,
size_t
>;
template
class
HashBlock
<
size_t
,
size_t
>;
template
class
HashBlock
<
unsigned
int
,
size_t
>;
template
class
HashBlock
<
long
long
,
size_t
>;
template
class
UvmTable
<
int
,
size_t
>;
template
class
UvmTable
<
int64_t
,
size_t
>;
template
class
UvmTable
<
size_t
,
size_t
>;
template
class
UvmTable
<
unsigned
int
,
size_t
>;
template
class
UvmTable
<
long
long
,
size_t
>;
// !!! This is a file automatically generated by hipify!!!
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <hip/hip_cooperative_groups.h>
#include <hip/hip_runtime_api.h>
#include <immintrin.h>
#include <atomic>
#include <iostream>
#include <limits>
#include <mutex>
#include <uvm_table.hpp>
namespace cg = cooperative_groups;
namespace {
constexpr int set_size = 4;
constexpr int block_size = 256;
template <typename key_type>
__host__ __device__ key_type hash(key_type key) {
return key;
}
template <typename key_type>
__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys,
const int num_sets, int* set_sizes, const int max_set_size,
key_type* missing_keys, int* num_missing_keys) {
__shared__ key_type s_missing_keys[block_size];
__shared__ int s_missing_count;
__shared__ size_t s_missing_idx;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
if (block.thread_rank() == 0) {
s_missing_count = 0;
}
block.sync();
size_t idx = grid.thread_rank();
if (idx < num_keys) {
auto key = new_keys[idx];
size_t idx_set = hash(key) % num_sets;
int prev_set_size = atomicAdd(&set_sizes[idx_set], 1);
if (prev_set_size < max_set_size) {
keys[idx_set * max_set_size + prev_set_size] = key;
} else {
int count = atomicAdd(&s_missing_count, 1);
s_missing_keys[count] = key;
}
}
block.sync();
if (block.thread_rank() == 0) {
s_missing_idx = atomicAdd(num_missing_keys, s_missing_count);
}
block.sync();
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_keys[s_missing_idx + i] = s_missing_keys[i];
}
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr,
const key_type* keys, const size_t num_sets,
const int max_set_size, index_type* output_indices) {
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
int num_keys = *num_keys_ptr;
if (num_keys == 0) return;
#if (DTK_VERSION < 11060)
size_t num_threads_per_grid = grid.size();
#else
size_t num_threads_per_grid = grid.num_threads();
#endif
size_t step = (num_keys - 1) / num_threads_per_grid + 1;
for (size_t i = 0; i < step; i++) {
size_t idx = i * num_threads_per_grid + grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max();
if (idx < num_keys) {
query_key = query_keys[idx];
}
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j);
auto current_query_key = tile.shfl(query_key, j);
if (current_query_key == std::numeric_limits<key_type>::max()) {
continue;
}
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
int existed = tile.ballot(current_query_key == candidate_key);
auto current_idx = tile.shfl(idx, 0) + j;
if (existed) {
int src_lane = __ffs(existed) - 1;
size_t found_idx = current_idx_set * set_size + src_lane;
output_indices[current_idx] = num_sets * src_lane + current_idx_set;
} else {
output_indices[current_idx] = std::numeric_limits<index_type>::max();
}
}
}
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys,
const key_type* keys, const size_t num_sets,
const int max_set_size, index_type* output_indices,
key_type* missing_keys, int* missing_positions,
int* missing_count) {
__shared__ key_type s_missing_keys[block_size];
__shared__ key_type s_missing_positions[block_size];
__shared__ int s_missing_count;
__shared__ int s_missing_idx;
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
if (block.thread_rank() == 0) {
s_missing_count = 0;
}
block.sync();
size_t idx = grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max();
if (idx < num_keys) {
query_key = query_keys[idx];
}
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j);
auto current_query_key = tile.shfl(query_key, j);
if (current_query_key == std::numeric_limits<key_type>::max()) {
continue;
}
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
int existed = tile.ballot(current_query_key == candidate_key);
if (existed) {
int src_lane = __ffs(existed) - 1;
size_t found_idx = current_idx_set * set_size + src_lane;
output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set;
} else {
auto current_idx = tile.shfl(idx, 0) + j;
output_indices[current_idx] = std::numeric_limits<index_type>::max();
if (tile.thread_rank() == 0) {
int s_count = atomicAdd(&s_missing_count, 1);
s_missing_keys[s_count] = current_query_key;
s_missing_positions[s_count] = current_idx;
}
}
}
if (missing_keys == nullptr) {
if (grid.thread_rank() == 0 && missing_count) {
*missing_count = 0;
}
return;
}
block.sync();
if (block.thread_rank() == 0) {
s_missing_idx = atomicAdd(missing_count, s_missing_count);
}
block.sync();
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_keys[s_missing_idx + i] = s_missing_keys[i];
missing_positions[s_missing_idx + i] = s_missing_positions[i];
}
}
template <int warp_size>
__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx,
const size_t emb_vec_size_in_float,
volatile float* d_dst, const float* d_src) {
// 16 bytes align
if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) {
#pragma unroll
for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) {
d_dst[i] = d_src[i];
}
} else {
#pragma unroll
for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) {
*(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4));
}
}
}
template <typename index_type, typename vec_type>
__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys,
const vec_type* vectors, const int vec_size,
vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (DTK_VERSION < 11060)
auto num_threads_per_grid = grid.size();
#else
auto num_threads_per_grid = grid.num_threads();
#endif
for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
int key_num = step * num_threads_per_grid + grid.thread_rank();
index_type idx = std::numeric_limits<index_type>::max();
if (key_num < num_keys) {
idx = query_indices[key_num];
}
#pragma unroll 4
for (size_t j = 0; j < warp_size; j++) {
index_type current_idx = tile.shfl(idx, j);
index_type idx_write = tile.shfl(key_num, 0) + j;
if (current_idx == std::numeric_limits<index_type>::max()) continue;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size, output_vectors + idx_write * vec_size,
vectors + current_idx * vec_size);
}
}
}
template <typename index_type, typename vec_type>
__global__ void distribute_vectors_kernel(const index_type* postions, const size_t num_keys,
const vec_type* vectors, const int vec_size,
vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (DTK_VERSION < 11060)
auto num_threads_per_grid = grid.size();
#else
auto num_threads_per_grid = grid.num_threads();
#endif
for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
size_t key_num = step * num_threads_per_grid + grid.thread_rank();
index_type idx = std::numeric_limits<index_type>::max();
if (key_num < num_keys) {
idx = postions[key_num];
}
#pragma unroll 4
for (size_t j = 0; j < warp_size; j++) {
size_t idx_write = tile.shfl(idx, j);
size_t idx_read = tile.shfl(key_num, 0) + j;
if (idx_write == std::numeric_limits<index_type>::max()) continue;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
output_vectors + (size_t)idx_write * vec_size,
vectors + (size_t)idx_read * vec_size);
}
}
}
} // namespace
namespace gpu_cache {
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity,
const size_t host_table_capacity,
const int max_batch_size, const int vec_size,
const vec_type default_value)
: max_batch_size_(::max(100000, max_batch_size)),
vec_size_(vec_size),
num_set_((device_table_capacity - 1) / set_size + 1),
num_host_set_((host_table_capacity - 1) / set_size + 1),
table_capacity_(num_set_ * set_size),
default_vector_(vec_size, default_value),
device_table_(device_table_capacity, set_size, max_batch_size_),
host_table_(host_table_capacity * 1.3, set_size, max_batch_size_) {
CUDA_CHECK(hipMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_));
CUDA_CHECK(hipMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_));
CUDA_CHECK(hipMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(hipHostMalloc(&h_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_missing_count_, sizeof(int)));
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(int)));
CUDA_CHECK(hipStreamCreate(&query_stream_));
for (int i = 0; i < num_buffers_; i++) {
int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_);
CUDA_CHECK(
hipHostMalloc(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(hipMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(hipStreamCreate(&cpy_streams_[i]));
CUDA_CHECK(hipEventCreate(&cpy_events_[i]));
}
CUDA_CHECK(hipHostMalloc(&h_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(hipEventCreate(&query_event_));
h_vectors_.resize(host_table_.capacity * vec_size_);
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::add(const key_type* h_keys,
const vec_type* h_vectors,
const size_t num_keys) {
std::vector<key_type> h_missing_keys;
size_t num_batches = (num_keys - 1) / max_batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
device_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
CUDA_CHECK(hipDeviceSynchronize());
int num_missing_keys;
CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
hipMemcpyDeviceToHost));
size_t prev_size = h_missing_keys.size();
h_missing_keys.resize(prev_size + num_missing_keys);
CUDA_CHECK(hipMemcpy(h_missing_keys.data() + prev_size, d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
}
std::vector<key_type> h_final_missing_keys;
num_batches = h_missing_keys.size() ? (h_missing_keys.size() - 1) / max_batch_size_ + 1 : 0;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : h_missing_keys.size() - i * max_batch_size_;
CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_missing_keys.data() + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
host_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
CUDA_CHECK(hipDeviceSynchronize());
int num_missing_keys;
CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
hipMemcpyDeviceToHost));
size_t prev_size = h_final_missing_keys.size();
h_final_missing_keys.resize(prev_size + num_missing_keys);
CUDA_CHECK(hipMemcpy(h_final_missing_keys.data() + prev_size, d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
}
std::vector<key_type> h_keys_buffer(max_batch_size_);
std::vector<index_type> h_indices_buffer(max_batch_size_);
std::vector<int> h_positions_buffer(max_batch_size_);
num_batches = (num_keys - 1) / max_batch_size_ + 1;
size_t num_hit_keys = 0;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
device_table_.query(d_keys_buffer_, this_batch_size, d_output_indices_, d_missing_keys_,
d_missing_positions_, d_missing_count_, 0);
CUDA_CHECK(hipStreamSynchronize(0));
CUDA_CHECK(hipMemcpy(d_vectors_buffer_, h_vectors + i * max_batch_size_ * vec_size_,
sizeof(*d_vectors_) * this_batch_size * vec_size_,
hipMemcpyHostToDevice));
CUDA_CHECK(hipStreamSynchronize(0));
if (num_hit_keys < device_table_.capacity) {
hipLaunchKernelGGL(( distribute_vectors_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, 0,
d_output_indices_, this_batch_size, d_vectors_buffer_, vec_size_, d_vectors_);
CUDA_CHECK(hipStreamSynchronize(0));
}
int num_missing_keys;
CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
hipMemcpyDeviceToHost));
num_hit_keys += this_batch_size - num_missing_keys;
host_table_.query(d_missing_keys_, num_missing_keys, d_output_indices_, nullptr, nullptr,
nullptr, 0);
CUDA_CHECK(hipMemcpy(h_keys_buffer.data(), d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost))
CUDA_CHECK(hipMemcpy(h_indices_buffer.data(), d_output_indices_,
sizeof(*d_output_indices_) * num_missing_keys, hipMemcpyDeviceToHost))
CUDA_CHECK(hipMemcpy(h_positions_buffer.data(), d_missing_positions_,
sizeof(*d_missing_positions_) * num_missing_keys, hipMemcpyDeviceToHost))
for (int j = 0; j < num_missing_keys; j++) {
if (h_indices_buffer[j] != std::numeric_limits<index_type>::max()) {
memcpy(h_vectors_.data() + h_indices_buffer[j] * vec_size_,
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
sizeof(*h_vectors) * vec_size_);
} else {
size_t prev_idx = h_vectors_.size() / vec_size_;
h_final_missing_items_.emplace(h_keys_buffer[j], prev_idx);
h_vectors_.resize(h_vectors_.size() + vec_size_);
memcpy(h_vectors_.data() + prev_idx * vec_size_,
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
sizeof(*h_vectors) * vec_size_);
}
}
}
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::query(const key_type* d_keys, const int num_keys,
vec_type* d_vectors, hipStream_t stream) {
if (!num_keys) return;
CUDA_CHECK(hipEventRecord(query_event_, stream));
CUDA_CHECK(hipStreamWaitEvent(query_stream_, query_event_));
static_assert(num_buffers_ >= 2);
device_table_.query(d_keys, num_keys, d_output_indices_, d_missing_keys_, d_missing_positions_,
d_missing_count_, query_stream_);
CUDA_CHECK(hipEventRecord(query_event_, query_stream_));
CUDA_CHECK(hipStreamWaitEvent(cpy_streams_[0], query_event_));
int num_missing_keys;
CUDA_CHECK(hipMemcpyAsync(&num_missing_keys, d_missing_count_, sizeof(*d_missing_count_),
hipMemcpyDeviceToHost, cpy_streams_[0]));
host_table_.query(d_missing_keys_, d_missing_count_, d_output_host_indices_, query_stream_);
CUDA_CHECK(hipStreamSynchronize(cpy_streams_[0]));
CUDA_CHECK(hipMemsetAsync(d_missing_count_, 0, sizeof(*d_missing_count_), query_stream_));
CUDA_CHECK(hipMemcpyAsync(h_output_host_indices_, d_output_host_indices_,
sizeof(index_type) * num_missing_keys, hipMemcpyDeviceToHost,
query_stream_));
CUDA_CHECK(hipMemcpyAsync(h_missing_keys_, d_missing_keys_, sizeof(key_type) * num_missing_keys,
hipMemcpyDeviceToHost, cpy_streams_[0]));
hipLaunchKernelGGL(( read_vectors_kernel), dim3((num_keys - 1) / block_size + 1), dim3(block_size), 0, cpy_streams_[1],
d_output_indices_, num_keys, d_vectors_, vec_size_, d_vectors);
CUDA_CHECK(hipStreamSynchronize(query_stream_));
CUDA_CHECK(hipStreamSynchronize(cpy_streams_[0]));
int num_keys_per_buffer = ceil(1.0 * num_missing_keys / num_buffers_);
for (int buffer_num = 0; buffer_num < num_buffers_; buffer_num++) {
int num_keys_this_buffer = buffer_num != num_buffers_ - 1
? num_keys_per_buffer
: num_missing_keys - num_keys_per_buffer * buffer_num;
if (!num_keys_this_buffer) break;
#pragma omp parallel for num_threads(8)
for (size_t i = 0; i < static_cast<size_t>(num_keys_this_buffer); i++) {
size_t idx_key = buffer_num * num_keys_per_buffer + i;
index_type index = h_output_host_indices_[idx_key];
if (index == std::numeric_limits<index_type>::max()) {
key_type key = h_missing_keys_[idx_key];
auto iterator = h_final_missing_items_.find(key);
if (iterator != h_final_missing_items_.end()) {
index = iterator->second;
}
}
if (index != std::numeric_limits<index_type>::max()) {
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, h_vectors_.data() + index * vec_size_,
sizeof(vec_type) * vec_size_);
} else {
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, default_vector_.data(),
sizeof(vec_type) * vec_size_);
}
}
CUDA_CHECK(hipMemcpyAsync(d_cpy_buffers_[buffer_num], h_cpy_buffers_[buffer_num],
sizeof(vec_type) * num_keys_this_buffer * vec_size_,
hipMemcpyHostToDevice, cpy_streams_[buffer_num]));
hipLaunchKernelGGL(( distribute_vectors_kernel), dim3((num_keys_this_buffer - 1) / block_size + 1), dim3(block_size), 0,
cpy_streams_[buffer_num],
d_missing_positions_ + buffer_num * num_keys_per_buffer, num_keys_this_buffer,
d_cpy_buffers_[buffer_num], vec_size_, d_vectors);
}
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(hipEventRecord(cpy_events_[i], cpy_streams_[i]));
CUDA_CHECK(hipStreamWaitEvent(stream, cpy_events_[i]));
}
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::clear(hipStream_t stream) {
device_table_.clear(stream);
host_table_.clear(stream);
}
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::~UvmTable() {
CUDA_CHECK(hipFree(d_keys_buffer_));
CUDA_CHECK(hipFree(d_vectors_buffer_));
CUDA_CHECK(hipFree(d_vectors_));
CUDA_CHECK(hipFree(d_output_indices_));
CUDA_CHECK(hipFree(d_output_host_indices_));
CUDA_CHECK(hipHostFree(h_output_host_indices_));
CUDA_CHECK(hipFree(d_missing_keys_));
CUDA_CHECK(hipFree(d_missing_positions_));
CUDA_CHECK(hipFree(d_missing_count_));
CUDA_CHECK(hipHostFree(h_missing_keys_));
CUDA_CHECK(hipStreamDestroy(query_stream_));
CUDA_CHECK(hipEventDestroy(query_event_));
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(hipHostFree(h_cpy_buffers_[i]));
CUDA_CHECK(hipFree(d_cpy_buffers_[i]));
CUDA_CHECK(hipStreamDestroy(cpy_streams_[i]));
CUDA_CHECK(hipEventDestroy(cpy_events_[i]));
}
}
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::HashBlock(size_t expected_capacity, int set_size, int batch_size)
: max_set_size_(set_size), batch_size_(batch_size) {
if (expected_capacity) {
num_sets = (expected_capacity - 1) / set_size + 1;
} else {
num_sets = 10000;
}
capacity = num_sets * set_size;
CUDA_CHECK(hipMalloc(&keys, sizeof(*keys) * capacity));
CUDA_CHECK(hipMalloc(&set_sizes_, sizeof(*set_sizes_) * num_sets));
CUDA_CHECK(hipMemset(set_sizes_, 0, sizeof(*set_sizes_) * num_sets));
}
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::~HashBlock() {
CUDA_CHECK(hipFree(keys));
CUDA_CHECK(hipFree(set_sizes_));
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, const size_t num_keys,
index_type* output_indices, key_type* missing_keys,
int* missing_positions, int* num_missing_keys,
hipStream_t stream) {
if (num_keys == 0) {
return;
}
size_t num_batches = (num_keys - 1) / batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
hipLaunchKernelGGL(( hash_query_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, stream,
query_keys, this_batch_size, keys, num_sets, max_set_size_, output_indices, missing_keys,
missing_positions, num_missing_keys);
}
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, int* num_keys,
index_type* output_indices, hipStream_t stream) {
hipLaunchKernelGGL(( hash_query_kernel), dim3(128), dim3(64), 0, stream, query_keys, num_keys, keys, num_sets, max_set_size_,
output_indices);
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::add(const key_type* new_keys, const size_t num_keys,
key_type* missing_keys, int* num_missing_keys,
hipStream_t stream) {
if (num_keys == 0) {
return;
}
size_t num_batches = (num_keys - 1) / batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
hipLaunchKernelGGL(( hash_add_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, stream,
new_keys + i * this_batch_size, this_batch_size, keys, num_sets, set_sizes_, max_set_size_,
missing_keys, num_missing_keys);
}
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::clear(hipStream_t stream) {
CUDA_CHECK(hipMemsetAsync(set_sizes_, 0, sizeof(*set_sizes_) * num_sets, stream));
}
template class HashBlock<int, size_t>;
template class HashBlock<int64_t, size_t>;
template class HashBlock<size_t, size_t>;
template class HashBlock<unsigned int, size_t>;
template class HashBlock<long long, size_t>;
template class UvmTable<int, size_t>;
template class UvmTable<int64_t, size_t>;
template class UvmTable<size_t, size_t>;
template class UvmTable<unsigned int, size_t>;
template class UvmTable<long long, size_t>;
} // namespace gpu_cache
\ No newline at end of file
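For reference, the CUDA-to-HIP rewrite applied throughout this file follows one mechanical pattern: runtime calls swap their prefix (cudaMalloc → hipMalloc, cudaMemcpyAsync → hipMemcpyAsync, cudaStream_t → hipStream_t) and triple-chevron kernel launches become hipLaunchKernelGGL. The snippet below is only an illustrative sketch of that mapping on a toy kernel; the kernel, buffer names, and sizes are hypothetical and are not part of this commit.
// Minimal sketch (not part of the commit): the cuda -> hip mapping used above, on a toy kernel.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>
__global__ void scale_kernel(float* data, int n, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;  // each thread scales one element
}
int main() {
  constexpr int n = 1024;
  constexpr int block_size = 256;
  std::vector<float> h_data(n, 1.0f);
  float* d_data = nullptr;
  // cudaMalloc / cudaMemcpy / cudaFree become hipMalloc / hipMemcpy / hipFree one-to-one.
  hipMalloc(&d_data, n * sizeof(float));
  hipMemcpy(d_data, h_data.data(), n * sizeof(float), hipMemcpyHostToDevice);
  // scale_kernel<<<(n - 1) / block_size + 1, block_size, 0, 0>>>(d_data, n, 2.0f) becomes:
  hipLaunchKernelGGL(scale_kernel, dim3((n - 1) / block_size + 1), dim3(block_size), 0, 0,
                     d_data, n, 2.0f);
  hipMemcpy(h_data.data(), d_data, n * sizeof(float), hipMemcpyDeviceToHost);
  hipFree(d_data);
  printf("h_data[0] = %f\n", h_data[0]);  // expect 2.0
  return 0;
}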
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt
View file @
1d28bf8b
...
...
@@ -15,14 +15,14 @@
cmake_minimum_required(VERSION 3.8)
file(GLOB gpu_cache_test_src
  cache_op_sol_test.cu
  ../../HugeCTR/src/hps/embedding_cache_gpu.cu
  cache_op_sol_test.hip
  ../../HugeCTR/src/hps/embedding_cache_gpu.hip
)
add_executable(cache_op_sol_test ${gpu_cache_test_src})
target_compile_features(cache_op_sol_test PUBLIC cxx_std_17)
target_link_libraries(cache_op_sol_test PUBLIC gpu_cache)
target_link_libraries(cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX)
set_target_properties(cache_op_sol_test PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(cache_op_sol_test PROPERTIES CUDA_ARCHITECTURES OFF)
set_target_properties(cache_op_sol_test PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(cache_op_sol_test PROPERTIES HIP_ARCHITECTURES OFF)
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.cu → third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.hip
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
...
...
@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_
template <typename T>
bool is_near(T a, T b) {
double diff = abs(a - b);
bool ret = diff <= std::min(a, b) * 1e-6;
bool ret = diff <= ::min(a, b) * 1e-6;
if (!ret) {
std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl;
}
...
...
@@ -224,7 +226,7 @@ int main(int argc, char** argv) {
const size_t cache_type = atoi(argv[7]);
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK(cudaSetDevice(0));
CUDA_CHECK(hipSetDevice(0));
// Host side buffers shared between threads
key_type* h_keys; // Buffer holding all keys in embedding table
...
...
@@ -302,7 +304,7 @@ int main(int argc, char** argv) {
int thread_id = omp_get_thread_num();
printf("Worker %d starts testing cache.\n", thread_id);
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK(cudaSetDevice(0));
CUDA_CHECK(hipSetDevice(0));
// Thread-private host side buffers
size_t* h_query_keys_index; // Buffer holding index for keys to be queried
...
...
@@ -324,32 +326,32 @@ int main(int argc, char** argv) {
// host-only buffers placed in normal host memory
h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t));
// host-device interactive buffers placed in pinned memory
CUDA_CHECK(cudaHostAlloc((void**)&h_query_keys, query_length * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_query_keys, query_length * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_vals_retrieved,
                         query_length * embedding_vec_size * sizeof(float),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_vals_retrieved,
                         query_length * embedding_vec_size * sizeof(float),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_keys, query_length * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_keys, query_length * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_vals,
                         query_length * embedding_vec_size * sizeof(float),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_vals,
                         query_length * embedding_vec_size * sizeof(float),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
                         hipHostMallocPortable));
// Allocate device side buffers
CUDA_CHECK(cudaMalloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_len, sizeof(size_t)));
// Thread-private CUDA stream, all threads just use the #0 device
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
hipStream_t stream;
CUDA_CHECK(hipStreamCreate(&stream));
// Timimg variables
double time_1;
...
...
@@ -382,33 +384,33 @@ int main(int argc, char** argv) {
std::cout << std::endl;
// Copy the keys to GPU memory
CUDA_CHECK(cudaMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
                          hipMemcpyHostToDevice, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_1 = W_time();
// Get pairs from hashtable
cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n",
thread_id, i, time_2);
// Copy the data back to host
CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
                           query_length * embedding_vec_size * sizeof(float),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
                          query_length * embedding_vec_size * sizeof(float),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i,
h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f));
...
...
@@ -433,13 +435,13 @@ int main(int argc, char** argv) {
thread_id, i, time_2);
// Copy the missing value to device
CUDA_CHECK(cudaMemcpyAsync(d_missing_vals, h_missing_vals,
                           query_length * embedding_vec_size * sizeof(float),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_missing_vals, h_missing_vals,
                          query_length * embedding_vec_size * sizeof(float),
                          hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_vals_retrieved, h_vals_retrieved,
                           query_length * embedding_vec_size * sizeof(float),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_vals_retrieved, h_vals_retrieved,
                          query_length * embedding_vec_size * sizeof(float),
                          hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_1 = W_time();
...
...
@@ -449,7 +451,7 @@ int main(int argc, char** argv) {
else
cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n",
...
...
@@ -466,20 +468,20 @@ int main(int argc, char** argv) {
printf("Worker %d : All Finished!\n", thread_id);
// Clean-up
cudaStreamDestroy(stream);
hipStreamDestroy(stream);
free(h_query_keys_index);
CUDA_CHECK(cudaFreeHost(h_query_keys));
CUDA_CHECK(cudaFreeHost(h_vals_retrieved));
CUDA_CHECK(cudaFreeHost(h_missing_keys));
CUDA_CHECK(cudaFreeHost(h_missing_vals));
CUDA_CHECK(cudaFreeHost(h_missing_index));
CUDA_CHECK(cudaFree(d_query_keys));
CUDA_CHECK(cudaFree(d_vals_retrieved));
CUDA_CHECK(cudaFree(d_missing_keys));
CUDA_CHECK(cudaFree(d_missing_vals));
CUDA_CHECK(cudaFree(d_missing_index));
CUDA_CHECK(cudaFree(d_missing_len));
CUDA_CHECK(hipHostFree(h_query_keys));
CUDA_CHECK(hipHostFree(h_vals_retrieved));
CUDA_CHECK(hipHostFree(h_missing_keys));
CUDA_CHECK(hipHostFree(h_missing_vals));
CUDA_CHECK(hipHostFree(h_missing_index));
CUDA_CHECK(hipFree(d_query_keys));
CUDA_CHECK(hipFree(d_vals_retrieved));
CUDA_CHECK(hipFree(d_missing_keys));
CUDA_CHECK(hipFree(d_missing_vals));
CUDA_CHECK(hipFree(d_missing_index));
CUDA_CHECK(hipFree(d_missing_len));
}
// 1st test Clean-up
...
...
@@ -547,57 +549,57 @@ int main(int argc, char** argv) {
key_type* d_missing_keys;
size_t* d_missing_len;
CUDA_CHECK(cudaHostAlloc((void**)&h_insert_keys,
                         SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_insert_keys,
                         SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_insert_vals,
                         SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_insert_vals,
                         SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_dump_keys,
                         SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_dump_keys,
                         SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc(
    (void**)&h_vals_retrieved,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
    cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc(
    (void**)&h_vals_retrieved,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
    hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_acc_keys,
                         SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_acc_keys,
                         SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaMalloc((void**)&d_keys,
                      SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_keys,
                     SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
                                           embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
                                          embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_insert_vals,
                      SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_insert_vals,
                     SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_dump_keys,
                      SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_dump_keys,
                     SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc(
    (void**)&d_vals_retrieved,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc(
    (void**)&d_vals_retrieved,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK(hipMalloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_index,
                      SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_index,
                     SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_keys,
                      SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_missing_keys,
                     SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_len, sizeof(size_t)));
// CUDA stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
hipStream_t stream;
CUDA_CHECK(hipStreamCreate(&stream));
// Copy all keys and values from host to device
CUDA_CHECK(cudaMemcpyAsync(
    d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
    cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(
    d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
    hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(
    d_vals, h_new_vals,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
    cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(
    d_vals, h_new_vals,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
    hipMemcpyHostToDevice, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Each time insert 1 slab per slabset into the cache and check result
for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) {
...
...
@@ -615,17 +617,17 @@ int main(int argc, char** argv) {
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type));
// Copy the <k,v> pairs from host to device
CUDA_CHECK(cudaMemcpyAsync(d_insert_keys, h_insert_keys,
                           SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_insert_keys, h_insert_keys,
                          SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
                          hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_insert_vals, h_insert_vals,
                           SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_insert_vals, h_insert_vals,
                          SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
                          hipMemcpyHostToDevice, stream));
// Insert the <k,v> pairs into the cache
cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_a = W_time();
...
...
@@ -633,7 +635,7 @@ int main(int argc, char** argv) {
cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream,
SLAB_SIZE);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b);
...
...
@@ -644,31 +646,31 @@ int main(int argc, char** argv) {
// Dump the keys from the cache
cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b);
// Copy the dump counter from device to host
CUDA_CHECK(cudaMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          hipMemcpyDeviceToHost, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Check the dump counter
assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1));
// Query all the dumped keys from the cache
cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
// Copy result from device to host
CUDA_CHECK(cudaMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
                           h_dump_counter * embedding_vec_size * sizeof(float),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
                          h_dump_counter * embedding_vec_size * sizeof(float),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
                          hipMemcpyDeviceToHost, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Check result
assert(h_missing_len == 0);
compare_key(h_dump_keys, h_acc_keys, h_dump_counter);
...
...
@@ -679,27 +681,27 @@ int main(int argc, char** argv) {
printf("Update and Dump API test all finished!\n");
// 2nd test clean-up
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(hipStreamDestroy(stream));
free(h_keys);
free(h_vals);
free(h_new_vals);
CUDA_CHECK(cudaFreeHost(h_insert_keys));
CUDA_CHECK(cudaFreeHost(h_insert_vals));
CUDA_CHECK(cudaFreeHost(h_dump_keys));
CUDA_CHECK(cudaFreeHost(h_vals_retrieved));
CUDA_CHECK(cudaFreeHost(h_acc_keys));
CUDA_CHECK(cudaFree(d_keys));
CUDA_CHECK(cudaFree(d_vals));
CUDA_CHECK(cudaFree(d_insert_keys));
CUDA_CHECK(cudaFree(d_insert_vals));
CUDA_CHECK(cudaFree(d_dump_keys));
CUDA_CHECK(cudaFree(d_vals_retrieved));
CUDA_CHECK(cudaFree(d_dump_counter));
CUDA_CHECK(cudaFree(d_missing_index));
CUDA_CHECK(cudaFree(d_missing_keys));
CUDA_CHECK(cudaFree(d_missing_len));
CUDA_CHECK(hipHostFree(h_insert_keys));
CUDA_CHECK(hipHostFree(h_insert_vals));
CUDA_CHECK(hipHostFree(h_dump_keys));
CUDA_CHECK(hipHostFree(h_vals_retrieved));
CUDA_CHECK(hipHostFree(h_acc_keys));
CUDA_CHECK(hipFree(d_keys));
CUDA_CHECK(hipFree(d_vals));
CUDA_CHECK(hipFree(d_insert_keys));
CUDA_CHECK(hipFree(d_insert_vals));
CUDA_CHECK(hipFree(d_dump_keys));
CUDA_CHECK(hipFree(d_vals_retrieved));
CUDA_CHECK(hipFree(d_dump_counter));
CUDA_CHECK(hipFree(d_missing_index));
CUDA_CHECK(hipFree(d_missing_keys));
CUDA_CHECK(hipFree(d_missing_len));
delete cache;
...
...
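One rename in the test file above is not a plain cuda-to-hip prefix swap: pinned host memory moves from cudaHostAlloc/cudaHostAllocPortable and cudaFreeHost to hipHostMalloc/hipHostMallocPortable and hipHostFree. The snippet below is only a minimal sketch of that pattern, not part of the commit; the buffer name and size are illustrative.
// Minimal sketch (not part of the commit): pinned host memory with the HIP runtime.
#include <hip/hip_runtime.h>
#include <cstdio>
int main() {
  constexpr size_t query_length = 1024;  // illustrative size only
  float* h_vals = nullptr;
  // cudaHostAlloc(..., cudaHostAllocPortable) -> hipHostMalloc(..., hipHostMallocPortable)
  if (hipHostMalloc((void**)&h_vals, query_length * sizeof(float), hipHostMallocPortable) !=
      hipSuccess) {
    printf("pinned allocation failed\n");
    return 1;
  }
  h_vals[0] = 1.0f;  // pinned memory is directly usable on the host
  // cudaFreeHost(ptr) -> hipHostFree(ptr)
  hipHostFree(h_vals);
  return 0;
}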