// !!! This is a file automatically generated by hipify!!!
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// The project header name below is an assumption: it must declare UvmTable, HashBlock, and the
// CUDA_CHECK macro. The standard and HIP headers cover what this translation unit uses.
#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <vector>

#include <uvm_table.hpp>

namespace cg = cooperative_groups;

namespace {

constexpr int set_size = 4;
constexpr int block_size = 256;

// Identity hash: keys are assumed to be well distributed already.
template <typename key_type>
__host__ __device__ key_type hash(key_type key) {
  return key;
}

// Inserts new_keys into a set-associative table. Keys that land in a full set are collected
// per block in shared memory and then appended to missing_keys.
template <typename key_type>
__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys,
                                const int num_sets, int* set_sizes, const int max_set_size,
                                key_type* missing_keys, int* num_missing_keys) {
  __shared__ key_type s_missing_keys[block_size];
  __shared__ int s_missing_count;
  __shared__ size_t s_missing_idx;

  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  if (block.thread_rank() == 0) {
    s_missing_count = 0;
  }
  block.sync();

  size_t idx = grid.thread_rank();
  if (idx < num_keys) {
    auto key = new_keys[idx];
    size_t idx_set = hash(key) % num_sets;
    int prev_set_size = atomicAdd(&set_sizes[idx_set], 1);
    if (prev_set_size < max_set_size) {
      keys[idx_set * max_set_size + prev_set_size] = key;
    } else {
      int count = atomicAdd(&s_missing_count, 1);
      s_missing_keys[count] = key;
    }
  }
  block.sync();

  if (block.thread_rank() == 0) {
    s_missing_idx = atomicAdd(num_missing_keys, s_missing_count);
  }
  block.sync();

  for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
    missing_keys[s_missing_idx + i] = s_missing_keys[i];
  }
}

// Query overload that reads the key count from device memory. Each tile of set_size threads
// probes one set cooperatively; a hit stores the encoded index num_sets * src_lane + idx_set,
// a miss stores the index_type sentinel.
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr,
                                  const key_type* keys, const size_t num_sets,
                                  const int max_set_size, index_type* output_indices) {
  constexpr int tile_size = set_size;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<tile_size>(block);

  int num_keys = *num_keys_ptr;
  if (num_keys == 0) return;
#if (DTK_VERSION < 11060)
  size_t num_threads_per_grid = grid.size();
#else
  size_t num_threads_per_grid = grid.num_threads();
#endif
  size_t step = (num_keys - 1) / num_threads_per_grid + 1;
  for (size_t i = 0; i < step; i++) {
    size_t idx = i * num_threads_per_grid + grid.thread_rank();
    key_type query_key = std::numeric_limits<key_type>::max();
    if (idx < num_keys) {
      query_key = query_keys[idx];
    }
    auto idx_set = hash(query_key) % num_sets;
    for (int j = 0; j < tile_size; j++) {
      auto current_idx_set = tile.shfl(idx_set, j);
      auto current_query_key = tile.shfl(query_key, j);
      if (current_query_key == std::numeric_limits<key_type>::max()) {
        continue;
      }
      auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
      int existed = tile.ballot(current_query_key == candidate_key);
      auto current_idx = tile.shfl(idx, 0) + j;
      if (existed) {
        int src_lane = __ffs(existed) - 1;
        output_indices[current_idx] = num_sets * src_lane + current_idx_set;
      } else {
        output_indices[current_idx] = std::numeric_limits<index_type>::max();
      }
    }
  }
}
// Query overload with a host-side key count. Behaves like the overload above but also records
// missing keys and their batch positions so the caller can resolve them from the host table.
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys,
                                  const key_type* keys, const size_t num_sets,
                                  const int max_set_size, index_type* output_indices,
                                  key_type* missing_keys, int* missing_positions,
                                  int* missing_count) {
  __shared__ key_type s_missing_keys[block_size];
  __shared__ key_type s_missing_positions[block_size];
  __shared__ int s_missing_count;
  __shared__ int s_missing_idx;

  constexpr int tile_size = set_size;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<tile_size>(block);

  if (block.thread_rank() == 0) {
    s_missing_count = 0;
  }
  block.sync();

  size_t idx = grid.thread_rank();
  key_type query_key = std::numeric_limits<key_type>::max();
  if (idx < num_keys) {
    query_key = query_keys[idx];
  }
  auto idx_set = hash(query_key) % num_sets;
  for (int j = 0; j < tile_size; j++) {
    auto current_idx_set = tile.shfl(idx_set, j);
    auto current_query_key = tile.shfl(query_key, j);
    if (current_query_key == std::numeric_limits<key_type>::max()) {
      continue;
    }
    auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
    int existed = tile.ballot(current_query_key == candidate_key);
    if (existed) {
      int src_lane = __ffs(existed) - 1;
      output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set;
    } else {
      auto current_idx = tile.shfl(idx, 0) + j;
      output_indices[current_idx] = std::numeric_limits<index_type>::max();
      if (tile.thread_rank() == 0) {
        int s_count = atomicAdd(&s_missing_count, 1);
        s_missing_keys[s_count] = current_query_key;
        s_missing_positions[s_count] = current_idx;
      }
    }
  }

  // Callers that do not care about misses pass missing_keys == nullptr.
  if (missing_keys == nullptr) {
    if (grid.thread_rank() == 0 && missing_count) {
      *missing_count = 0;
    }
    return;
  }
  block.sync();

  if (block.thread_rank() == 0) {
    s_missing_idx = atomicAdd(missing_count, s_missing_count);
  }
  block.sync();

  for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
    missing_keys[s_missing_idx + i] = s_missing_keys[i];
    missing_positions[s_missing_idx + i] = s_missing_positions[i];
  }
}

// Copies one embedding vector with a warp-sized tile, using 16-byte float4 loads/stores when
// both pointers and the vector size are suitably aligned.
template <int warp_size>
__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx,
                                               const size_t emb_vec_size_in_float,
                                               volatile float* d_dst, const float* d_src) {
  // 16 bytes align
  if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) {
#pragma unroll
    for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) {
      d_dst[i] = d_src[i];
    }
  } else {
#pragma unroll
    for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) {
      *(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4));
    }
  }
}

// Gathers vectors[query_indices[k]] into output_vectors[k]; entries whose index is the
// sentinel are skipped.
template <typename index_type, typename vec_type>
__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys,
                                    const vec_type* vectors, const int vec_size,
                                    vec_type* output_vectors) {
  constexpr int warp_size = 32;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<warp_size>(block);
#if (DTK_VERSION < 11060)
  auto num_threads_per_grid = grid.size();
#else
  auto num_threads_per_grid = grid.num_threads();
#endif
  for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
    int key_num = step * num_threads_per_grid + grid.thread_rank();
    index_type idx = std::numeric_limits<index_type>::max();
    if (key_num < num_keys) {
      idx = query_indices[key_num];
    }
#pragma unroll 4
    for (size_t j = 0; j < warp_size; j++) {
      index_type current_idx = tile.shfl(idx, j);
      index_type idx_write = tile.shfl(key_num, 0) + j;
      if (current_idx == std::numeric_limits<index_type>::max()) continue;
      warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
                                output_vectors + idx_write * vec_size,
                                vectors + current_idx * vec_size);
    }
  }
}
// Scatters vectors[k] into output_vectors[positions[k]]; entries whose destination is the
// sentinel are skipped.
template <typename index_type, typename vec_type>
__global__ void distribute_vectors_kernel(const index_type* positions, const size_t num_keys,
                                          const vec_type* vectors, const int vec_size,
                                          vec_type* output_vectors) {
  constexpr int warp_size = 32;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<warp_size>(block);
#if (DTK_VERSION < 11060)
  auto num_threads_per_grid = grid.size();
#else
  auto num_threads_per_grid = grid.num_threads();
#endif
  for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
    size_t key_num = step * num_threads_per_grid + grid.thread_rank();
    index_type idx = std::numeric_limits<index_type>::max();
    if (key_num < num_keys) {
      idx = positions[key_num];
    }
#pragma unroll 4
    for (size_t j = 0; j < warp_size; j++) {
      size_t idx_write = tile.shfl(idx, j);
      size_t idx_read = tile.shfl(key_num, 0) + j;
      if (idx_write == std::numeric_limits<index_type>::max()) continue;
      warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
                                output_vectors + (size_t)idx_write * vec_size,
                                vectors + (size_t)idx_read * vec_size);
    }
  }
}

}  // namespace
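// Illustrative sketch (not compiled into the library): how the hash_add_kernel /
// hash_query_kernel pair above fits together for a single batch. The function name, the
// key/index type choices, and the sizes here are local assumptions for demonstration only;
// the real entry points are the HashBlock and UvmTable methods defined below.
#if 0
static void hash_block_smoke_test() {
  using Key = unsigned long long;
  using Index = unsigned long long;
  constexpr int num_keys = 1024;
  constexpr int num_sets = 512;  // table capacity = num_sets * set_size keys

  Key *d_new_keys, *d_table_keys, *d_missing_keys;
  Index* d_indices;
  int *d_set_sizes, *d_num_missing;
  CUDA_CHECK(hipMalloc(&d_new_keys, sizeof(Key) * num_keys));
  CUDA_CHECK(hipMalloc(&d_table_keys, sizeof(Key) * num_sets * set_size));
  CUDA_CHECK(hipMalloc(&d_missing_keys, sizeof(Key) * num_keys));
  CUDA_CHECK(hipMalloc(&d_indices, sizeof(Index) * num_keys));
  CUDA_CHECK(hipMalloc(&d_set_sizes, sizeof(int) * num_sets));
  CUDA_CHECK(hipMalloc(&d_num_missing, sizeof(int)));
  CUDA_CHECK(hipMemset(d_set_sizes, 0, sizeof(int) * num_sets));
  CUDA_CHECK(hipMemset(d_num_missing, 0, sizeof(int)));
  // ... fill d_new_keys with the batch to insert ...

  // Insert the batch; keys whose set is already full are reported via d_missing_keys.
  hipLaunchKernelGGL((hash_add_kernel), dim3((num_keys - 1) / block_size + 1), dim3(block_size),
                     0, 0, d_new_keys, num_keys, d_table_keys, num_sets, d_set_sizes, set_size,
                     d_missing_keys, d_num_missing);

  // Look the same keys up: hits receive the encoded slot, misses the Index sentinel.
  hipLaunchKernelGGL((hash_query_kernel), dim3((num_keys - 1) / block_size + 1), dim3(block_size),
                     0, 0, d_new_keys, num_keys, d_table_keys, (size_t)num_sets, set_size,
                     d_indices, (Key*)nullptr, (int*)nullptr, (int*)nullptr);
  CUDA_CHECK(hipDeviceSynchronize());
}
#endif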
namespace gpu_cache {

// NOTE: the UvmTable/HashBlock template parameter lists in the definitions below are assumed to
// match the declarations in the header included above.
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity,
                                                   const size_t host_table_capacity,
                                                   const int max_batch_size, const int vec_size,
                                                   const vec_type default_value)
    : max_batch_size_(std::max(100000, max_batch_size)),
      vec_size_(vec_size),
      num_set_((device_table_capacity - 1) / set_size + 1),
      num_host_set_((host_table_capacity - 1) / set_size + 1),
      table_capacity_(num_set_ * set_size),
      default_vector_(vec_size, default_value),
      device_table_(device_table_capacity, set_size, max_batch_size_),
      host_table_(host_table_capacity * 1.3, set_size, max_batch_size_) {
  CUDA_CHECK(hipMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(hipMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_));
  CUDA_CHECK(hipMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_));
  CUDA_CHECK(hipMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(hipMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(hipHostMalloc(&h_output_host_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(hipMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(hipMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_));
  CUDA_CHECK(hipMalloc(&d_missing_count_, sizeof(int)));
  CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(int)));
  CUDA_CHECK(hipStreamCreate(&query_stream_));

  for (int i = 0; i < num_buffers_; i++) {
    int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_);
    CUDA_CHECK(
        hipHostMalloc(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
    CUDA_CHECK(
        hipMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
    CUDA_CHECK(hipStreamCreate(&cpy_streams_[i]));
    CUDA_CHECK(hipEventCreate(&cpy_events_[i]));
  }
  CUDA_CHECK(hipHostMalloc(&h_missing_keys_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(hipEventCreate(&query_event_));
  h_vectors_.resize(host_table_.capacity * vec_size_);
}

template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::add(const key_type* h_keys,
                                                   const vec_type* h_vectors,
                                                   const size_t num_keys) {
  // Pass 1: insert all keys into the device table; keys that do not fit are collected.
  std::vector<key_type> h_missing_keys;
  size_t num_batches = (num_keys - 1) / max_batch_size_ + 1;
  for (size_t i = 0; i < num_batches; i++) {
    size_t this_batch_size =
        i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
    CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
                         sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
    CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
    device_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
    CUDA_CHECK(hipDeviceSynchronize());

    int num_missing_keys;
    CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
                         hipMemcpyDeviceToHost));
    size_t prev_size = h_missing_keys.size();
    h_missing_keys.resize(prev_size + num_missing_keys);
    CUDA_CHECK(hipMemcpy(h_missing_keys.data() + prev_size, d_missing_keys_,
                         sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
  }

  // Pass 2: insert the device-table overflow into the host table; whatever still does not fit
  // is collected in h_final_missing_keys.
  std::vector<key_type> h_final_missing_keys;
  num_batches = h_missing_keys.size() ? (h_missing_keys.size() - 1) / max_batch_size_ + 1 : 0;
  for (size_t i = 0; i < num_batches; i++) {
    size_t this_batch_size =
        i != num_batches - 1 ? max_batch_size_ : h_missing_keys.size() - i * max_batch_size_;
    CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_missing_keys.data() + i * max_batch_size_,
                         sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
    CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
    host_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
    CUDA_CHECK(hipDeviceSynchronize());

    int num_missing_keys;
    CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
                         hipMemcpyDeviceToHost));
    size_t prev_size = h_final_missing_keys.size();
    h_final_missing_keys.resize(prev_size + num_missing_keys);
    CUDA_CHECK(hipMemcpy(h_final_missing_keys.data() + prev_size, d_missing_keys_,
                         sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
  }

  // Pass 3: route every vector to where its key ended up: the device table, the host table,
  // or the overflow map h_final_missing_items_.
  std::vector<key_type> h_keys_buffer(max_batch_size_);
  std::vector<index_type> h_indices_buffer(max_batch_size_);
  std::vector<int> h_positions_buffer(max_batch_size_);
  num_batches = (num_keys - 1) / max_batch_size_ + 1;
  size_t num_hit_keys = 0;
  for (size_t i = 0; i < num_batches; i++) {
    size_t this_batch_size =
        i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
    CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
                         sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
    CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
    device_table_.query(d_keys_buffer_, this_batch_size, d_output_indices_, d_missing_keys_,
                        d_missing_positions_, d_missing_count_, 0);
    CUDA_CHECK(hipStreamSynchronize(0));
    CUDA_CHECK(hipMemcpy(d_vectors_buffer_, h_vectors + i * max_batch_size_ * vec_size_,
                         sizeof(*d_vectors_) * this_batch_size * vec_size_,
                         hipMemcpyHostToDevice));
    CUDA_CHECK(hipStreamSynchronize(0));
    if (num_hit_keys < device_table_.capacity) {
      hipLaunchKernelGGL((distribute_vectors_kernel),
                         dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, 0,
                         d_output_indices_, this_batch_size, d_vectors_buffer_, vec_size_,
                         d_vectors_);
      CUDA_CHECK(hipStreamSynchronize(0));
    }
    int num_missing_keys;
    CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
                         hipMemcpyDeviceToHost));
    num_hit_keys += this_batch_size - num_missing_keys;
    host_table_.query(d_missing_keys_, num_missing_keys, d_output_indices_, nullptr, nullptr,
                      nullptr, 0);
    CUDA_CHECK(hipMemcpy(h_keys_buffer.data(), d_missing_keys_,
                         sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
    CUDA_CHECK(hipMemcpy(h_indices_buffer.data(), d_output_indices_,
                         sizeof(*d_output_indices_) * num_missing_keys, hipMemcpyDeviceToHost));
    CUDA_CHECK(hipMemcpy(h_positions_buffer.data(), d_missing_positions_,
                         sizeof(*d_missing_positions_) * num_missing_keys,
                         hipMemcpyDeviceToHost));
    for (int j = 0; j < num_missing_keys; j++) {
      if (h_indices_buffer[j] != std::numeric_limits<index_type>::max()) {
        memcpy(h_vectors_.data() + h_indices_buffer[j] * vec_size_,
               h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
               sizeof(*h_vectors) * vec_size_);
      } else {
        size_t prev_idx = h_vectors_.size() / vec_size_;
        h_final_missing_items_.emplace(h_keys_buffer[j], prev_idx);
        h_vectors_.resize(h_vectors_.size() + vec_size_);
        memcpy(h_vectors_.data() + prev_idx * vec_size_,
               h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
               sizeof(*h_vectors) * vec_size_);
      }
    }
  }
  CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
}
// Batched query: device-table hits are gathered directly on the GPU; misses are resolved from
// the host table (and the overflow map) through pinned staging buffers.
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::query(const key_type* d_keys, const int num_keys,
                                                     vec_type* d_vectors, hipStream_t stream) {
  if (!num_keys) return;
  CUDA_CHECK(hipEventRecord(query_event_, stream));
  CUDA_CHECK(hipStreamWaitEvent(query_stream_, query_event_));
  static_assert(num_buffers_ >= 2);

  device_table_.query(d_keys, num_keys, d_output_indices_, d_missing_keys_, d_missing_positions_,
                      d_missing_count_, query_stream_);
  CUDA_CHECK(hipEventRecord(query_event_, query_stream_));
  CUDA_CHECK(hipStreamWaitEvent(cpy_streams_[0], query_event_));
  int num_missing_keys;
  CUDA_CHECK(hipMemcpyAsync(&num_missing_keys, d_missing_count_, sizeof(*d_missing_count_),
                            hipMemcpyDeviceToHost, cpy_streams_[0]));
  host_table_.query(d_missing_keys_, d_missing_count_, d_output_host_indices_, query_stream_);
  CUDA_CHECK(hipStreamSynchronize(cpy_streams_[0]));
  CUDA_CHECK(hipMemsetAsync(d_missing_count_, 0, sizeof(*d_missing_count_), query_stream_));
  CUDA_CHECK(hipMemcpyAsync(h_output_host_indices_, d_output_host_indices_,
                            sizeof(index_type) * num_missing_keys, hipMemcpyDeviceToHost,
                            query_stream_));
  CUDA_CHECK(hipMemcpyAsync(h_missing_keys_, d_missing_keys_, sizeof(key_type) * num_missing_keys,
                            hipMemcpyDeviceToHost, cpy_streams_[0]));

  hipLaunchKernelGGL((read_vectors_kernel), dim3((num_keys - 1) / block_size + 1),
                     dim3(block_size), 0, cpy_streams_[1], d_output_indices_, num_keys, d_vectors_,
                     vec_size_, d_vectors);

  CUDA_CHECK(hipStreamSynchronize(query_stream_));
  CUDA_CHECK(hipStreamSynchronize(cpy_streams_[0]));

  int num_keys_per_buffer = ceil(1.0 * num_missing_keys / num_buffers_);
  for (int buffer_num = 0; buffer_num < num_buffers_; buffer_num++) {
    int num_keys_this_buffer = buffer_num != num_buffers_ - 1
                                   ? num_keys_per_buffer
                                   : num_missing_keys - num_keys_per_buffer * buffer_num;
    if (!num_keys_this_buffer) break;

    // Fill the pinned staging buffer on the CPU: host-table hits come from h_vectors_, keys in
    // the overflow map come from h_final_missing_items_, everything else gets the default vector.
#pragma omp parallel for num_threads(8)
    for (size_t i = 0; i < static_cast<size_t>(num_keys_this_buffer); i++) {
      size_t idx_key = buffer_num * num_keys_per_buffer + i;
      index_type index = h_output_host_indices_[idx_key];
      if (index == std::numeric_limits<index_type>::max()) {
        key_type key = h_missing_keys_[idx_key];
        auto iterator = h_final_missing_items_.find(key);
        if (iterator != h_final_missing_items_.end()) {
          index = iterator->second;
        }
      }
      if (index != std::numeric_limits<index_type>::max()) {
        memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, h_vectors_.data() + index * vec_size_,
               sizeof(vec_type) * vec_size_);
      } else {
        memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, default_vector_.data(),
               sizeof(vec_type) * vec_size_);
      }
    }
    CUDA_CHECK(hipMemcpyAsync(d_cpy_buffers_[buffer_num], h_cpy_buffers_[buffer_num],
                              sizeof(vec_type) * num_keys_this_buffer * vec_size_,
                              hipMemcpyHostToDevice, cpy_streams_[buffer_num]));
    hipLaunchKernelGGL((distribute_vectors_kernel),
                       dim3((num_keys_this_buffer - 1) / block_size + 1), dim3(block_size), 0,
                       cpy_streams_[buffer_num],
                       d_missing_positions_ + buffer_num * num_keys_per_buffer,
                       num_keys_this_buffer, d_cpy_buffers_[buffer_num], vec_size_, d_vectors);
  }
  for (int i = 0; i < num_buffers_; i++) {
    CUDA_CHECK(hipEventRecord(cpy_events_[i], cpy_streams_[i]));
    CUDA_CHECK(hipStreamWaitEvent(stream, cpy_events_[i]));
  }
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::clear(hipStream_t stream) {
  device_table_.clear(stream);
  host_table_.clear(stream);
}

template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::~UvmTable() {
  CUDA_CHECK(hipFree(d_keys_buffer_));
  CUDA_CHECK(hipFree(d_vectors_buffer_));
  CUDA_CHECK(hipFree(d_vectors_));
  CUDA_CHECK(hipFree(d_output_indices_));
  CUDA_CHECK(hipFree(d_output_host_indices_));
  CUDA_CHECK(hipHostFree(h_output_host_indices_));
  CUDA_CHECK(hipFree(d_missing_keys_));
  CUDA_CHECK(hipFree(d_missing_positions_));
  CUDA_CHECK(hipFree(d_missing_count_));
  CUDA_CHECK(hipHostFree(h_missing_keys_));
  CUDA_CHECK(hipStreamDestroy(query_stream_));
  CUDA_CHECK(hipEventDestroy(query_event_));
  for (int i = 0; i < num_buffers_; i++) {
    CUDA_CHECK(hipHostFree(h_cpy_buffers_[i]));
    CUDA_CHECK(hipFree(d_cpy_buffers_[i]));
    CUDA_CHECK(hipStreamDestroy(cpy_streams_[i]));
    CUDA_CHECK(hipEventDestroy(cpy_events_[i]));
  }
}

template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::HashBlock(size_t expected_capacity, int set_size, int batch_size)
    : max_set_size_(set_size), batch_size_(batch_size) {
  if (expected_capacity) {
    num_sets = (expected_capacity - 1) / set_size + 1;
  } else {
    num_sets = 10000;
  }
  capacity = num_sets * set_size;
  CUDA_CHECK(hipMalloc(&keys, sizeof(*keys) * capacity));
  CUDA_CHECK(hipMalloc(&set_sizes_, sizeof(*set_sizes_) * num_sets));
  CUDA_CHECK(hipMemset(set_sizes_, 0, sizeof(*set_sizes_) * num_sets));
}

template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::~HashBlock() {
  CUDA_CHECK(hipFree(keys));
  CUDA_CHECK(hipFree(set_sizes_));
}

// Batched query with a host-side key count.
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, const size_t num_keys,
                                            index_type* output_indices, key_type* missing_keys,
                                            int* missing_positions, int* num_missing_keys,
                                            hipStream_t stream) {
  if (num_keys == 0) {
    return;
  }
  size_t num_batches = (num_keys - 1) / batch_size_ + 1;
  for (size_t i = 0; i < num_batches; i++) {
    size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
    hipLaunchKernelGGL((hash_query_kernel), dim3((this_batch_size - 1) / block_size + 1),
                       dim3(block_size), 0, stream, query_keys, this_batch_size, keys, num_sets,
                       max_set_size_, output_indices, missing_keys, missing_positions,
                       num_missing_keys);
  }
}
// Query overload that takes the key count as a device-side counter, so the count never has to
// be copied back to the host.
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, int* num_keys,
                                            index_type* output_indices, hipStream_t stream) {
  hipLaunchKernelGGL((hash_query_kernel), dim3(128), dim3(64), 0, stream, query_keys, num_keys,
                     keys, num_sets, max_set_size_, output_indices);
}

template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::add(const key_type* new_keys, const size_t num_keys,
                                          key_type* missing_keys, int* num_missing_keys,
                                          hipStream_t stream) {
  if (num_keys == 0) {
    return;
  }
  size_t num_batches = (num_keys - 1) / batch_size_ + 1;
  for (size_t i = 0; i < num_batches; i++) {
    size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
    hipLaunchKernelGGL((hash_add_kernel), dim3((this_batch_size - 1) / block_size + 1),
                       dim3(block_size), 0, stream, new_keys + i * batch_size_, this_batch_size,
                       keys, num_sets, set_sizes_, max_set_size_, missing_keys, num_missing_keys);
  }
}

template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::clear(hipStream_t stream) {
  CUDA_CHECK(hipMemsetAsync(set_sizes_, 0, sizeof(*set_sizes_) * num_sets, stream));
}

template class HashBlock;
template class HashBlock;
template class HashBlock;
template class HashBlock;
template class HashBlock;

template class UvmTable;
template class UvmTable;
template class UvmTable;
template class UvmTable;
template class UvmTable;

}  // namespace gpu_cache
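// Illustrative usage sketch (not compiled into the library): how a caller might drive UvmTable.
// The concrete template arguments, capacities, and variable names below are assumptions for
// illustration only; the real instantiations are the ones listed above.
#if 0
void uvm_table_example(hipStream_t stream) {
  using Table = gpu_cache::UvmTable<unsigned long long, unsigned long long, float>;
  constexpr int vec_size = 64;
  // Roughly 1M vectors resident on the device, 10M more staged in host memory.
  Table table(1 << 20, 10 << 20, /*max_batch_size=*/65536, vec_size, /*default_value=*/0.f);

  std::vector<unsigned long long> h_keys = /* keys to insert */ {};
  std::vector<float> h_vecs(h_keys.size() * vec_size);
  table.add(h_keys.data(), h_vecs.data(), h_keys.size());  // bulk insert from host memory

  unsigned long long* d_query_keys = /* device buffer of keys */ nullptr;
  float* d_out = /* device buffer of num_query * vec_size floats */ nullptr;
  int num_query = 0;
  table.query(d_query_keys, num_query, d_out, stream);  // asynchronous lookup on `stream`
  CUDA_CHECK(hipStreamSynchronize(stream));
}
#endif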