Commit 1d28bf8b
Authored Sep 23, 2024 by sangwzh
Parent: f119ea7c

    update third_party/HugeCTR/gpu_cache codes to hip

Showing 13 changed files with 1049 additions and 1029 deletions (+1049, -1029)
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp      +6    -4
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp       +6    -4
third_party/HugeCTR/gpu_cache/include/nv_util.h              +13   -12
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp  +5    -3
third_party/HugeCTR/gpu_cache/include/static_table.hpp       +5    -3
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp          +176  -174
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt             +6    -7
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip           +76   -72
third_party/HugeCTR/gpu_cache/src/static_hash_table.hip      +23   -22
third_party/HugeCTR/gpu_cache/src/static_table.hip           +6    -4
third_party/HugeCTR/gpu_cache/src/uvm_table.hip              +607  -606
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt            +4    -4
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.hip     +116  -114
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2021, NVIDIA CORPORATION.
  *
 ...
@@ -31,22 +33,22 @@ class gpu_cache_api {
   // Query API, i.e. A single read from the cache
   virtual void Query(const key_type* d_keys, const size_t len, float* d_values,
                      uint64_t* d_missing_index, key_type* d_missing_keys, size_t* d_missing_len,
-                     cudaStream_t stream,
+                     hipStream_t stream,
                      const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;

   // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
   virtual void Replace(const key_type* d_keys, const size_t len, const float* d_values,
-                       cudaStream_t stream,
+                       hipStream_t stream,
                        const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;

   // Update API, i.e. update the embeddings which exist in the cache
   virtual void Update(const key_type* d_keys, const size_t len, const float* d_values,
-                      cudaStream_t stream,
+                      hipStream_t stream,
                       const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0;

   // Dump API, i.e. dump some slabsets' keys from the cache
   virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index,
-                    const size_t end_set_index, cudaStream_t stream) = 0;
+                    const size_t end_set_index, hipStream_t stream) = 0;
 };
 }  // namespace gpu_cache
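The change to this header is representative of the whole commit: every CUDA runtime type in a public signature is swapped one-for-one for its HIP counterpart, so callers now pass hipStream_t handles. A minimal sketch of that substitution pattern, using only standard HIP runtime calls (the function below is illustrative and not part of gpu_cache_api.hpp):

#include <hip/hip_runtime.h>

// Illustrative only: shows the cudaX -> hipX renames this commit applies to stream handling.
void stream_roundtrip(float* d_values, size_t len) {
  hipStream_t stream;                                              // was cudaStream_t
  (void)hipStreamCreate(&stream);                                  // was cudaStreamCreate
  (void)hipMemsetAsync(d_values, 0, sizeof(float) * len, stream);  // was cudaMemsetAsync
  (void)hipStreamSynchronize(stream);                              // was cudaStreamSynchronize
  (void)hipStreamDestroy(stream);                                  // was cudaStreamDestroy
}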
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -61,20 +63,20 @@ class gpu_cache : public gpu_cache_api<key_type> {
   // Query API, i.e. A single read from the cache
   void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index,
-             key_type* d_missing_keys, size_t* d_missing_len, cudaStream_t stream,
+             key_type* d_missing_keys, size_t* d_missing_len, hipStream_t stream,
              const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;

   // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
-  void Replace(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream,
+  void Replace(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream,
                const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;

   // Update API, i.e. update the embeddings which exist in the cache
-  void Update(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream,
+  void Update(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream,
               const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override;

   // Dump API, i.e. dump some slabsets' keys from the cache
   void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index,
-            const size_t end_set_index, cudaStream_t stream) override;
+            const size_t end_set_index, hipStream_t stream) override;

  public:
   using slabset = slab_set<set_associativity, key_type, warp_size>;
 ...
third_party/HugeCTR/gpu_cache/include/nv_util.h

+// !!! This is a file automatically generated by hipify!!!
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -15,7 +16,7 @@
  */
 #pragma once

-#include <cuda_runtime_api.h>
+#include <hip/hip_runtime_api.h>
 #include <stdexcept>
 #include <string>
 ...
@@ -30,17 +31,17 @@ class CudaException : public std::runtime_error {
   CudaException(const std::string& what) : runtime_error(what) {}
 };

-inline void cuda_check_(cudaError_t val, const char* file, int line) {
-  if (val != cudaSuccess) {
+inline void cuda_check_(hipError_t val, const char* file, int line) {
+  if (val != hipSuccess) {
     throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " +
-                        std::to_string(val) + ": " + cudaGetErrorString(val));
+                        std::to_string(val) + ": " + hipGetErrorString(val));
   }
 }

 class CudaDeviceRestorer {
  public:
-  CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); }
-  ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); }
+  CudaDeviceRestorer() { CUDA_CHECK(hipGetDevice(&dev_)); }
+  ~CudaDeviceRestorer() { CUDA_CHECK(hipSetDevice(dev_)); }
   void check_device(int device) const {
     if (device != dev_) {
       throw std::runtime_error(
 ...
@@ -54,14 +55,14 @@ class CudaDeviceRestorer {
 };

 inline int get_dev(const void* ptr) {
-  cudaPointerAttributes attr;
-  CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
+  hipPointerAttribute_t attr;
+  CUDA_CHECK(hipPointerGetAttributes(&attr, ptr));
   int dev = -1;
-#if CUDART_VERSION >= 10000
-  if (attr.type == cudaMemoryTypeDevice)
+#if DTKRT_VERSION >= 10000
+  if (attr.type == hipMemoryTypeDevice)
 #else
-  if (attr.memoryType == cudaMemoryTypeDevice)
+  if (attr.memoryType == hipMemoryTypeDevice)
 #endif
   {
     dev = attr.device;
 ...
@@ -72,7 +73,7 @@ inline int get_dev(const void* ptr) {
 inline void switch_to_dev(const void* ptr) {
   int dev = get_dev(ptr);
   if (dev >= 0) {
-    CUDA_CHECK(cudaSetDevice(dev));
+    CUDA_CHECK(hipSetDevice(dev));
   }
 }
 ...
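The CUDA_CHECK macro used throughout these hunks is defined elsewhere in nv_util.h and is not visible here. The sketch below shows one common way such a wrapper is defined around the hipified cuda_check_ shown above; treat it as an assumption for illustration, not the file's actual definition:

#include <hip/hip_runtime_api.h>
#include <stdexcept>
#include <string>

// Hypothetical wrapper: forwards the call site's file/line into a checker like the one above.
#define CUDA_CHECK(val) cuda_check_((val), __FILE__, __LINE__)

inline void cuda_check_(hipError_t val, const char* file, int line) {
  if (val != hipSuccess) {
    throw std::runtime_error(std::string(file) + ":" + std::to_string(line) + ": " +
                             hipGetErrorString(val));
  }
}

int main() {
  void* ptr = nullptr;
  CUDA_CHECK(hipMalloc(&ptr, 1024));  // throws with file:line context if the allocation fails
  CUDA_CHECK(hipFree(ptr));
  return 0;
}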
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -50,17 +52,17 @@ class StaticHashTable {
     return keys_bytes + indices_bytes + values_bytes;
   }

-  void clear(cudaStream_t stream = 0);
+  void clear(hipStream_t stream = 0);

   // Note:
   // 1. Please make sure the key to be inserted is not duplicated.
   // 2. Please make sure the key to be inserted does not exist in the table.
   // 3. Please make sure (size() + num_keys) <= capacity().
   void insert(const key_type* keys, const value_type* values, size_type num_keys,
-              cudaStream_t stream = 0);
+              hipStream_t stream = 0);

   void lookup(const key_type* keys, value_type* values, int num_keys, value_type default_value = 0,
-              cudaStream_t stream = 0);
+              hipStream_t stream = 0);

  private:
   key_type* table_keys_;
 ...
third_party/HugeCTR/gpu_cache/include/static_table.hpp

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -36,12 +38,12 @@ class static_table {
   ~static_table(){};

   // Query API, i.e. A single read from the cache
-  void Query(const key_type* d_keys, const size_t len, float* d_values, cudaStream_t stream);
+  void Query(const key_type* d_keys, const size_t len, float* d_values, hipStream_t stream);

   // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
-  void Init(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream);
+  void Init(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream);

-  void Clear(cudaStream_t stream);
+  void Clear(hipStream_t stream);

  private:
   StaticHashTable<key_type, float> static_hash_table_;
 ...
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp

(Whole-file rewrite. The old and new versions differ only in that the hipify banner and the "hip/hip_runtime.h" include are added and every cudaStream_t/cudaEvent_t becomes hipStream_t/hipEvent_t. Reconstructed new version:)

// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <nv_util.h>

#include <thread>
#include <unordered_map>
#include <vector>

namespace gpu_cache {

template <typename key_type, typename index_type>
class HashBlock {
 public:
  key_type* keys;
  size_t num_sets;
  size_t capacity;

  HashBlock(size_t expected_capacity, int set_size, int batch_size);
  ~HashBlock();
  void add(const key_type* new_keys, const size_t num_keys, key_type* missing_keys,
           int* num_missing_keys, hipStream_t stream);
  void query(const key_type* query_keys, const size_t num_keys, index_type* output_indices,
             key_type* missing_keys, int* missing_positions, int* num_missing_keys,
             hipStream_t stream);
  void query(const key_type* query_keys, int* num_keys, index_type* output_indices,
             hipStream_t stream);
  void clear(hipStream_t stream);

 private:
  int max_set_size_;
  int batch_size_;
  int* set_sizes_;
};

template <typename vec_type>
class H2HCopy {
 public:
  H2HCopy(int num_threads) : num_threads_(num_threads), working_(num_threads) {
    for (int i = 0; i < num_threads_; i++) {
      threads_.emplace_back(
          [&](int idx) {
            while (!terminate_) {
              if (working_[idx].load(std::memory_order_relaxed)) {
                working_[idx].store(false, std::memory_order_relaxed);
                if (num_keys_ == 0) continue;
                size_t num_keys_this_thread = (num_keys_ - 1) / num_threads_ + 1;
                size_t begin = idx * num_keys_this_thread;
                if (idx == num_threads_ - 1) {
                  num_keys_this_thread = num_keys_ - num_keys_this_thread * idx;
                }
                size_t end = begin + num_keys_this_thread;
                for (size_t i = begin; i < end; i++) {
                  size_t idx_vec = get_index_(i);
                  if (idx_vec == std::numeric_limits<size_t>::max()) {
                    continue;
                  }
                  memcpy(dst_data_ptr_ + i * vec_size_, src_data_ptr_ + idx_vec * vec_size_,
                         sizeof(vec_type) * vec_size_);
                }
                num_finished_workers_++;
              }
            }
            std::this_thread::sleep_for(std::chrono::microseconds(1));
          },
          i);
    }
  };

  void copy(vec_type* dst_data_ptr, vec_type* src_data_ptr, size_t num_keys, int vec_size,
            std::function<size_t(size_t)> get_index_func) {
    std::lock_guard<std::mutex> guard(submit_mutex_);
    dst_data_ptr_ = dst_data_ptr;
    src_data_ptr_ = src_data_ptr;
    get_index_ = get_index_func;
    num_keys_ = num_keys;
    vec_size_ = vec_size;
    num_finished_workers_.store(0, std::memory_order_acquire);
    for (auto& working : working_) {
      working.store(true, std::memory_order_relaxed);
    }
    while (num_finished_workers_ != num_threads_) {
      continue;
    }
  }

  ~H2HCopy() {
    terminate_ = true;
    for (auto& t : threads_) {
      t.join();
    }
  }

 private:
  vec_type* src_data_ptr_;
  vec_type* dst_data_ptr_;
  std::function<size_t(size_t)> get_index_;
  size_t num_keys_;
  int vec_size_;
  std::mutex submit_mutex_;
  const int num_threads_;
  std::vector<std::thread> threads_;
  std::vector<std::atomic<bool>> working_;
  volatile bool terminate_{false};
  std::atomic<int> num_finished_workers_{0};
};

template <typename key_type, typename index_type, typename vec_type = float>
class UvmTable {
 public:
  UvmTable(const size_t device_table_capacity, const size_t host_table_capacity,
           const int max_batch_size, const int vec_size,
           const vec_type default_value = (vec_type)0);
  ~UvmTable();
  void query(const key_type* d_keys, const int len, vec_type* d_vectors, hipStream_t stream = 0);
  void add(const key_type* h_keys, const vec_type* h_vectors, const size_t len);
  void clear(hipStream_t stream = 0);

 private:
  static constexpr int num_buffers_ = 2;

  key_type* d_keys_buffer_;
  vec_type* d_vectors_buffer_;
  vec_type* d_vectors_;
  index_type* d_output_indices_;
  index_type* d_output_host_indices_;
  index_type* h_output_host_indices_;

  key_type* d_missing_keys_;
  int* d_missing_positions_;
  int* d_missing_count_;

  std::vector<vec_type> h_vectors_;
  key_type* h_missing_keys_;

  hipStream_t query_stream_;
  hipEvent_t query_event_;

  vec_type* h_cpy_buffers_[num_buffers_];
  vec_type* d_cpy_buffers_[num_buffers_];
  hipStream_t cpy_streams_[num_buffers_];
  hipEvent_t cpy_events_[num_buffers_];

  std::unordered_map<key_type, index_type> h_final_missing_items_;

  int max_batch_size_;
  int vec_size_;
  size_t num_set_;
  size_t num_host_set_;
  size_t table_capacity_;
  std::vector<vec_type> default_vector_;

  HashBlock<key_type, index_type> device_table_;
  HashBlock<key_type, index_type> host_table_;
};

}  // namespace gpu_cache
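H2HCopy above implements a small host-side thread pool: copy() publishes a destination, a source, a row count and an index-mapping function, flips every worker's flag, and spin-waits until all workers report done, with each worker memcpy'ing its slice of rows. A standalone, single-threaded sketch of that gather step, independent of the class and for illustration only:

#include <cstring>
#include <functional>
#include <limits>

// Gathers num_keys rows of vec_size floats from src into dst, skipping rows whose mapped
// index is the "missing" sentinel. This mirrors the per-thread loop inside H2HCopy::copy.
void gather_rows(float* dst, const float* src, size_t num_keys, int vec_size,
                 const std::function<size_t(size_t)>& get_index) {
  for (size_t i = 0; i < num_keys; i++) {
    size_t idx_vec = get_index(i);
    if (idx_vec == std::numeric_limits<size_t>::max()) continue;  // missing row: leave dst as-is
    std::memcpy(dst + i * vec_size, src + idx_vec * vec_size, sizeof(float) * vec_size);
  }
}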
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt

@@ -15,15 +15,14 @@
 cmake_minimum_required(VERSION 3.8)

 file(GLOB gpu_cache_src
-  nv_gpu_cache.cu
-  static_table.cu
-  static_hash_table.cu
-  uvm_table.cu
+  nv_gpu_cache.hip
+  static_table.hip
+  static_hash_table.hip
+  uvm_table.hip
 )

 add_library(gpu_cache SHARED ${gpu_cache_src})
 target_compile_features(gpu_cache PUBLIC cxx_std_11)
-set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
+set_target_properties(gpu_cache PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON)
+# set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu → third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -14,7 +16,7 @@
  * limitations under the License.
  */

-#include <cooperative_groups.h>
+#include <hip/hip_cooperative_groups.h>
 #include <nv_gpu_cache.hpp>
 ...
@@ -29,9 +31,11 @@ __forceinline__ __device__ long long atomicAdd(long long* address, long long val
   return (long long)atomicAdd((unsigned long long*)address, (unsigned long long)val);
 }

+#ifndef __HIPCC__
 __forceinline__ __device__ unsigned long atomicAdd(unsigned long* address, unsigned long val) {
   return (unsigned long)atomicAdd((unsigned long long*)address, (unsigned long long)val);
 }
+#endif

 namespace gpu_cache {
 ...
@@ -1253,27 +1257,27 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
   }
   // Get the current CUDA dev
-  CUDA_CHECK(cudaGetDevice(&dev_));
+  CUDA_CHECK(hipGetDevice(&dev_));
   // Calculate # of slot
   num_slot_ = capacity_in_set_ * set_associativity * warp_size;
   // Allocate GPU memory for cache
-  CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
-  CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(atomic_ref_counter_type)));
+  CUDA_CHECK(hipMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&global_counter_, sizeof(atomic_ref_counter_type)));
   // Allocate GPU memory for set mutex
-  CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(mutex) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&set_mutex_, sizeof(mutex) * capacity_in_set_));
   // Initialize the cache, set all entry to unused <K,V>
-  init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL((init_cache), dim3(((num_slot_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
       keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_);
   // Wait for initialization to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipStreamSynchronize(0));
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
 ...
@@ -1301,27 +1305,27 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
   }
   // Get the current CUDA dev
-  CUDA_CHECK(cudaGetDevice(&dev_));
+  CUDA_CHECK(hipGetDevice(&dev_));
   // Calculate # of slot
   num_slot_ = capacity_in_set_ * set_associativity * warp_size;
   // Allocate GPU memory for cache
-  CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
-  CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
-  CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(ref_counter_type)));
+  CUDA_CHECK(hipMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_));
+  CUDA_CHECK(hipMalloc((void**)&global_counter_, sizeof(ref_counter_type)));
   // Allocate GPU memory for set mutex
-  CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(int) * capacity_in_set_));
+  CUDA_CHECK(hipMalloc((void**)&set_mutex_, sizeof(int) * capacity_in_set_));
   // Initialize the cache, set all entry to unused <K,V>
-  init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL((init_cache), dim3(((num_slot_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
       keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_);
   // Wait for initialization to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipStreamSynchronize(0));
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
@@ -1337,18 +1341,18 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
   dev_restorer.check_device(dev_);
   // Destruct CUDA std object
-  destruct_kernel<<<((capacity_in_set_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>(
+  hipLaunchKernelGGL((destruct_kernel), dim3(((capacity_in_set_ - 1) / BLOCK_SIZE_) + 1), dim3(BLOCK_SIZE_), 0, 0,
       global_counter_, set_mutex_, capacity_in_set_);
   // Wait for destruction to finish
-  CUDA_CHECK(cudaStreamSynchronize(0));
+  CUDA_CHECK(hipStreamSynchronize(0));
   // Free GPU memory for cache
-  CUDA_CHECK(cudaFree(keys_));
-  CUDA_CHECK(cudaFree(vals_));
-  CUDA_CHECK(cudaFree(slot_counter_));
-  CUDA_CHECK(cudaFree(global_counter_));
+  CUDA_CHECK(hipFree(keys_));
+  CUDA_CHECK(hipFree(vals_));
+  CUDA_CHECK(hipFree(slot_counter_));
+  CUDA_CHECK(hipFree(global_counter_));
   // Free GPU memory for set mutex
-  CUDA_CHECK(cudaFree(set_mutex_));
+  CUDA_CHECK(hipFree(set_mutex_));
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
 ...
@@ -1362,12 +1366,12 @@ gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, s
   dev_restorer.check_device(dev_);
   // Free GPU memory for cache
-  CUDA_CHECK(cudaFree(keys_));
-  CUDA_CHECK(cudaFree(vals_));
-  CUDA_CHECK(cudaFree(slot_counter_));
-  CUDA_CHECK(cudaFree(global_counter_));
+  CUDA_CHECK(hipFree(keys_));
+  CUDA_CHECK(hipFree(vals_));
+  CUDA_CHECK(hipFree(slot_counter_));
+  CUDA_CHECK(hipFree(global_counter_));
   // Free GPU memory for set mutex
-  CUDA_CHECK(cudaFree(set_mutex_));
+  CUDA_CHECK(hipFree(set_mutex_));
 }
 #endif
 ...
@@ -1377,7 +1381,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Query(const key_type* d_keys, const size_t len, float* d_values,
                                    uint64_t* d_missing_index, key_type* d_missing_keys,
-                                   size_t* d_missing_len, cudaStream_t stream,
+                                   size_t* d_missing_len, hipStream_t stream,
                                    const size_t task_per_warp_tile) {
   // Device Restorer
   nv::CudaDeviceRestorer dev_restorer;
 ...
@@ -1387,27 +1391,27 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Check if it is a valid query
   if (len == 0) {
     // Set the d_missing_len to 0 before return
-    CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
+    CUDA_CHECK(hipMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
     return;
   }
   // Update the global counter as user perform a new(most recent) read operation to the cache
   // Resolve distance overflow issue as well.
-  update_kernel_overflow_ignore<atomic_ref_counter_type>
-      <<<1, 1, 0, stream>>>(global_counter_, d_missing_len);
+  hipLaunchKernelGGL((update_kernel_overflow_ignore<atomic_ref_counter_type>), dim3(1), dim3(1),
+                     0, stream, global_counter_, d_missing_len);
   // Read from the cache
   // Touch and refresh the hitting slot
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  get_kernel<key_type, ref_counter_type, atomic_ref_counter_type, slabset, set_hasher, slab_hasher,
-             mutex, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL((get_kernel<key_type, ref_counter_type, atomic_ref_counter_type, slabset, set_hasher,
+                                 slab_hasher, mutex, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
       d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len,
       global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_,
       task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
 ...
@@ -1415,7 +1419,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Query(const key_type* d_keys, const size_t len, float* d_values,
                                    uint64_t* d_missing_index, key_type* d_missing_keys,
-                                   size_t* d_missing_len, cudaStream_t stream,
+                                   size_t* d_missing_len, hipStream_t stream,
                                    const size_t task_per_warp_tile) {
   // Device Restorer
   nv::CudaDeviceRestorer dev_restorer;
 ...
@@ -1425,27 +1429,27 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Check if it is a valid query
   if (len == 0) {
     // Set the d_missing_len to 0 before return
-    CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
+    CUDA_CHECK(hipMemsetAsync(d_missing_len, 0, sizeof(size_t), stream));
     return;
   }
   // Update the global counter as user perform a new(most recent) read operation to the cache
   // Resolve distance overflow issue as well.
-  update_kernel_overflow_ignore<ref_counter_type>
-      <<<1, 1, 0, stream>>>(global_counter_, d_missing_len);
+  hipLaunchKernelGGL((update_kernel_overflow_ignore<ref_counter_type>), dim3(1), dim3(1),
+                     0, stream, global_counter_, d_missing_len);
   // Read from the cache
   // Touch and refresh the hitting slot
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  get_kernel<key_type, ref_counter_type, slabset, set_hasher, slab_hasher, empty_key,
-             set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL((get_kernel<key_type, ref_counter_type, slabset, set_hasher, slab_hasher, empty_key,
+                                 set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
       d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len,
       global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_,
       task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
@@ -1454,7 +1458,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
           int warp_size, typename set_hasher, typename slab_hasher>
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Replace(const key_type* d_keys, const size_t len,
-                                     const float* d_values, cudaStream_t stream,
+                                     const float* d_values, hipStream_t stream,
                                      const size_t task_per_warp_tile) {
   // Check if it is a valid replacement
   if (len == 0) {
 ...
@@ -1470,21 +1474,21 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Then replace the <k,v> pairs into the cache
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  insert_replace_kernel<key_type, slabset, ref_counter_type, mutex, atomic_ref_counter_type,
-                        set_hasher, slab_hasher, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_values, embedding_vec_size_, len, keys_,
+  hipLaunchKernelGGL((insert_replace_kernel<key_type, slabset, ref_counter_type, mutex, atomic_ref_counter_type,
+                                            set_hasher, slab_hasher, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_values, embedding_vec_size_, len, keys_,
                      vals_, slot_counter_, set_mutex_, global_counter_,
                      capacity_in_set_, task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
           int warp_size, typename set_hasher, typename slab_hasher>
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Replace(const key_type* d_keys, const size_t len,
-                                     const float* d_values, cudaStream_t stream,
+                                     const float* d_values, hipStream_t stream,
                                      const size_t task_per_warp_tile) {
   // Check if it is a valid replacement
   if (len == 0) {
 ...
@@ -1500,13 +1504,13 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Then replace the <k,v> pairs into the cache
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  insert_replace_kernel<key_type, slabset, ref_counter_type, set_hasher, slab_hasher, empty_key,
-                        set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL((insert_replace_kernel<key_type, slabset, ref_counter_type, set_hasher, slab_hasher, empty_key,
+                                            set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
       d_keys, d_values, embedding_vec_size_, len, keys_, vals_, slot_counter_, set_mutex_,
       global_counter_, capacity_in_set_, task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
@@ -1515,7 +1519,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
           int warp_size, typename set_hasher, typename slab_hasher>
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Update(const key_type* d_keys, const size_t len, const float* d_values,
-                                    cudaStream_t stream, const size_t task_per_warp_tile) {
+                                    hipStream_t stream, const size_t task_per_warp_tile) {
   // Check if it is a valid update request
   if (len == 0) {
     return;
 ...
@@ -1529,20 +1533,20 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Update the value of input keys that are existed in the cache
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  update_kernel<key_type, slabset, set_hasher, slab_hasher, mutex, empty_key, set_associativity,
-                warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+  hipLaunchKernelGGL((update_kernel<key_type, slabset, set_hasher, slab_hasher, mutex, empty_key,
+                                    set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream,
       d_keys, len, d_values, embedding_vec_size_, capacity_in_set_, keys_, vals_, set_mutex_,
       task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
           int warp_size, typename set_hasher, typename slab_hasher>
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Update(const key_type* d_keys, const size_t len, const float* d_values,
-                                    cudaStream_t stream, const size_t task_per_warp_tile) {
+                                    hipStream_t stream, const size_t task_per_warp_tile) {
   // Check if it is a valid update request
   if (len == 0) {
     return;
 ...
@@ -1556,13 +1560,13 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   // Update the value of input keys that are existed in the cache
   const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile;
   const size_t grid_size = ((len - 1) / keys_per_block) + 1;
-  update_kernel<key_type, slabset, set_hasher, slab_hasher, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, len, d_values, embedding_vec_size_,
+  hipLaunchKernelGGL((update_kernel<key_type, slabset, set_hasher, slab_hasher, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, len, d_values, embedding_vec_size_,
                                               capacity_in_set_, keys_, vals_, set_mutex_,
                                               task_per_warp_tile);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
@@ -1572,7 +1576,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Dump(key_type* d_keys, size_t* d_dump_counter,
                                   const size_t start_set_index, const size_t end_set_index,
-                                  cudaStream_t stream) {
+                                  hipStream_t stream) {
   // Check if it is a valid dump request
   if (start_set_index >= capacity_in_set_) {
     printf("Error: Invalid value for start_set_index. Nothing dumped.\n");
 ...
@@ -1589,17 +1593,17 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   dev_restorer.check_device(dev_);
   // Set the global counter to 0 first
-  CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
+  CUDA_CHECK(hipMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
   // Dump keys from the cache
   const size_t grid_size =
       (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1;
-  dump_kernel<key_type, slabset, mutex, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_dump_counter, keys_, set_mutex_,
+  hipLaunchKernelGGL((dump_kernel<key_type, slabset, mutex, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_dump_counter, keys_, set_mutex_,
                                               start_set_index, end_set_index);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #else
 template <typename key_type, typename ref_counter_type, key_type empty_key, int set_associativity,
 ...
@@ -1607,7 +1611,7 @@ template <typename key_type, typename ref_counter_type, key_type empty_key, int
 void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_size, set_hasher,
                slab_hasher>::Dump(key_type* d_keys, size_t* d_dump_counter,
                                   const size_t start_set_index, const size_t end_set_index,
-                                  cudaStream_t stream) {
+                                  hipStream_t stream) {
   // Check if it is a valid dump request
   if (start_set_index >= capacity_in_set_) {
     printf("Error: Invalid value for start_set_index. Nothing dumped.\n");
 ...
@@ -1624,17 +1628,17 @@ void gpu_cache<key_type, ref_counter_type, empty_key, set_associativity, warp_si
   dev_restorer.check_device(dev_);
   // Set the global counter to 0 first
-  CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
+  CUDA_CHECK(hipMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream));
   // Dump keys from the cache
   const size_t grid_size =
       (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1;
-  dump_kernel<key_type, slabset, empty_key, set_associativity, warp_size>
-      <<<grid_size, BLOCK_SIZE_, 0, stream>>>(d_keys, d_dump_counter, keys_, set_mutex_,
+  hipLaunchKernelGGL((dump_kernel<key_type, slabset, empty_key, set_associativity, warp_size>),
+                     dim3(grid_size), dim3(BLOCK_SIZE_), 0, stream, d_keys, d_dump_counter, keys_, set_mutex_,
                                               start_set_index, end_set_index);
   // Check for GPU error before return
-  CUDA_CHECK(cudaGetLastError());
+  CUDA_CHECK(hipGetLastError());
 }
 #endif
 ...
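Every kernel launch in this file follows the same rewrite: the CUDA triple-chevron syntax becomes hipLaunchKernelGGL with explicit dim3 grid and block dimensions, a shared-memory byte count, and a stream argument. A minimal self-contained sketch of that conversion (the kernel below is illustrative, not one of the cache kernels):

#include <hip/hip_runtime.h>

__global__ void fill_kernel(int* data, int n, int value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

int main() {
  const int n = 1024;
  int* d_data = nullptr;
  hipMalloc(&d_data, sizeof(int) * n);
  // CUDA form: fill_kernel<<<(n - 1) / 256 + 1, 256, 0, 0>>>(d_data, n, 42);
  // Equivalent HIP form produced by hipify:
  hipLaunchKernelGGL(fill_kernel, dim3((n - 1) / 256 + 1), dim3(256), 0, 0, d_data, n, 42);
  hipStreamSynchronize(0);
  hipFree(d_data);
  return 0;
}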
third_party/HugeCTR/gpu_cache/src/static_hash_table.cu → third_party/HugeCTR/gpu_cache/src/static_hash_table.hip

+// !!! This is a file automatically generated by hipify!!!
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -14,8 +15,8 @@
  * limitations under the License.
  */

-#include <cooperative_groups.h>
-#include <cuda.h>
+#include <hip/hip_cooperative_groups.h>
+#include <hip/hip_runtime.h>
 #include <stdint.h>
 #include <stdio.h>
 ...
@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c
   // otherwise return invalid_slot.
   const size_type num_groups = capacity / group_size;

-#if (CUDA_VERSION < 11060)
+#if (DTK_VERSION < 11060)
   unsigned long long num_threads_per_group = cg.size();
 #else
   unsigned long long num_threads_per_group = cg.num_threads();
 ...
@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c
   const size_type num_groups = capacity / group_size;

-#if (CUDA_VERSION < 11060)
+#if (DTK_VERSION < 11060)
   unsigned long long num_threads_per_group = cg.size();
 #else
   unsigned long long num_threads_per_group = cg.num_threads();
 ...
@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash
   size_t align_m = 16;
   size_t num_keys = key_capacity_ + 1;
   size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m;
-  CUDA_CHECK(cudaMalloc(&table_keys_, sizeof(key_type) * num_keys));
-  CUDA_CHECK(cudaMalloc(&table_indices_, sizeof(size_type) * num_keys));
-  CUDA_CHECK(cudaMalloc(&table_values_, sizeof(value_type) * num_values));
+  CUDA_CHECK(hipMalloc(&table_keys_, sizeof(key_type) * num_keys));
+  CUDA_CHECK(hipMalloc(&table_indices_, sizeof(size_type) * num_keys));
+  CUDA_CHECK(hipMalloc(&table_values_, sizeof(value_type) * num_values));

   // Initialize table_keys_
-  CUDA_CHECK(cudaMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
-  CUDA_CHECK(cudaMemset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
+  CUDA_CHECK(hipMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
+  CUDA_CHECK(hipMemset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
 }

 template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
           typename hasher>
 void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert(
-    const key_type *keys, const value_type *values, size_type num_keys, cudaStream_t stream) {
+    const key_type *keys, const value_type *values, size_type num_keys, hipStream_t stream) {
   if (num_keys == 0) {
     return;
   }
 ...
@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
   // Insert keys
   constexpr int block = 256;
   int grid = (num_keys - 1) / block + 1;
-  InsertKeyKernel<tile_size, group_size>
-      <<<grid, block, 0, stream>>>(table_keys_, table_indices_, key_capacity_, keys, num_keys,
+  hipLaunchKernelGGL((InsertKeyKernel<tile_size, group_size>), dim3(grid), dim3(block), 0, stream,
+                     table_keys_, table_indices_, key_capacity_, keys, num_keys,
                      size_, hash_, empty_key, invalid_slot);
   // Copy values
-  CUDA_CHECK(cudaMemcpyAsync(table_values_ + size_ * value_dim_, values,
-                             sizeof(value_type) * num_keys * value_dim_, cudaMemcpyDeviceToDevice,
+  CUDA_CHECK(hipMemcpyAsync(table_values_ + size_ * value_dim_, values,
+                            sizeof(value_type) * num_keys * value_dim_, hipMemcpyDeviceToDevice,
                             stream));
   size_ += num_keys;
 }
 ...
@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
 template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
           typename hasher>
 void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear(
-    cudaStream_t stream) {
-  CUDA_CHECK(cudaMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
-  CUDA_CHECK(cudaMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
+    hipStream_t stream) {
+  CUDA_CHECK(hipMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
+  CUDA_CHECK(hipMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
   size_ = 0;
 }

 template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
           typename hasher>
 StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() {
-  CUDA_CHECK(cudaFree(table_keys_));
-  CUDA_CHECK(cudaFree(table_indices_));
-  CUDA_CHECK(cudaFree(table_values_));
+  CUDA_CHECK(hipFree(table_keys_));
+  CUDA_CHECK(hipFree(table_indices_));
+  CUDA_CHECK(hipFree(table_values_));
 }

 template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
           typename hasher>
 void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup(
     const key_type *keys, value_type *values, int num_keys, value_type default_value,
-    cudaStream_t stream) {
+    hipStream_t stream) {
   if (num_keys == 0) {
     return;
   }
 ...
@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku
   constexpr int block = 256;
   const int grid = (num_keys - 1) / block + 1;
   // Lookup keys
-  LookupKernel<tile_size, group_size><<<grid, block, 0, stream>>>(
+  hipLaunchKernelGGL((LookupKernel<tile_size, group_size>), dim3(grid), dim3(block), 0, stream,
       table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values,
       hash_, empty_key, default_value, invalid_slot);
 }
 ...
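Besides the key-insertion kernel, the only data movement in insert() is a device-to-device copy that appends the new value rows behind the rows already stored, issued asynchronously on the caller's stream. A small sketch of that pattern with hipMemcpyAsync (the function and buffer names here are illustrative, not from the file):

#include <hip/hip_runtime.h>

// Appends new_rows rows of value_dim floats behind the stored_rows already held in table_values.
void append_value_rows(float* table_values, size_t stored_rows, const float* new_values,
                       size_t new_rows, int value_dim, hipStream_t stream) {
  hipMemcpyAsync(table_values + stored_rows * value_dim, new_values,
                 sizeof(float) * new_rows * value_dim, hipMemcpyDeviceToDevice, stream);
}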
third_party/HugeCTR/gpu_cache/src/static_table.cu → third_party/HugeCTR/gpu_cache/src/static_table.hip

+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
 ...
@@ -14,7 +16,7 @@
  * limitations under the License.
  */

-#include <cooperative_groups.h>
+#include <hip/hip_cooperative_groups.h>
 #include <nv_util.h>
 #include <iostream>
 ...
@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed
 template <typename key_type>
 void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values,
-                                   cudaStream_t stream) {
+                                   hipStream_t stream) {
   static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream);
 }

 template <typename key_type>
 void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values,
-                                  cudaStream_t stream) {
+                                  hipStream_t stream) {
   static_hash_table_.insert(d_keys, d_values, len, stream);
 }

 template <typename key_type>
-void static_table<key_type>::Clear(cudaStream_t stream) {
+void static_table<key_type>::Clear(hipStream_t stream) {
   static_hash_table_.clear(stream);
 }
 ...
third_party/HugeCTR/gpu_cache/src/uvm_table.cu → third_party/HugeCTR/gpu_cache/src/uvm_table.hip

(Whole-file rewrite. The listing below reconstructs the original CUDA version shown as the removed side of the diff.)

/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cooperative_groups.h>
#include <cuda_runtime_api.h>
#include <immintrin.h>

#include <atomic>
#include <iostream>
#include <limits>
#include <mutex>
#include <uvm_table.hpp>

namespace cg = cooperative_groups;

namespace {
constexpr int set_size = 4;
constexpr int block_size = 256;

template <typename key_type>
__host__ __device__ key_type hash(key_type key) {
  return key;
}

template <typename key_type>
__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys,
                                const int num_sets, int* set_sizes, const int max_set_size,
                                key_type* missing_keys, int* num_missing_keys) {
  __shared__ key_type s_missing_keys[block_size];
  __shared__ int s_missing_count;
  __shared__ size_t s_missing_idx;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  if (block.thread_rank() == 0) {
    s_missing_count = 0;
  }
  block.sync();
  size_t idx = grid.thread_rank();
  if (idx < num_keys) {
    auto key = new_keys[idx];
    size_t idx_set = hash(key) % num_sets;
    int prev_set_size = atomicAdd(&set_sizes[idx_set], 1);
    if (prev_set_size < max_set_size) {
      keys[idx_set * max_set_size + prev_set_size] = key;
    } else {
      int count = atomicAdd(&s_missing_count, 1);
      s_missing_keys[count] = key;
    }
  }
  block.sync();
  if (block.thread_rank() == 0) {
    s_missing_idx = atomicAdd(num_missing_keys, s_missing_count);
  }
  block.sync();
  for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
    missing_keys[s_missing_idx + i] = s_missing_keys[i];
  }
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr,
                                  const key_type* keys, const size_t num_sets,
                                  const int max_set_size, index_type* output_indices) {
  constexpr int tile_size = set_size;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<tile_size>(block);
  int num_keys = *num_keys_ptr;
  if (num_keys == 0) return;

#if (CUDA_VERSION < 11060)
  size_t num_threads_per_grid = grid.size();
#else
  size_t num_threads_per_grid = grid.num_threads();
#endif
  size_t step = (num_keys - 1) / num_threads_per_grid + 1;
  for (size_t i = 0; i < step; i++) {
    size_t idx = i * num_threads_per_grid + grid.thread_rank();
    key_type query_key = std::numeric_limits<key_type>::max();
    if (idx < num_keys) {
      query_key = query_keys[idx];
    }
    auto idx_set = hash(query_key) % num_sets;
    for (int j = 0; j < tile_size; j++) {
      auto current_idx_set = tile.shfl(idx_set, j);
      auto current_query_key = tile.shfl(query_key, j);
      if (current_query_key == std::numeric_limits<key_type>::max()) {
        continue;
      }
      auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
      int existed = tile.ballot(current_query_key == candidate_key);
      auto current_idx = tile.shfl(idx, 0) + j;
      if (existed) {
        int src_lane = __ffs(existed) - 1;
        size_t found_idx = current_idx_set * set_size + src_lane;
        output_indices[current_idx] = num_sets * src_lane + current_idx_set;
      } else {
        output_indices[current_idx] = std::numeric_limits<index_type>::max();
      }
    }
  }
}

template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys,
                                  const key_type* keys, const size_t num_sets,
                                  const int max_set_size, index_type* output_indices,
                                  key_type* missing_keys, int* missing_positions,
                                  int* missing_count) {
  __shared__ key_type s_missing_keys[block_size];
  __shared__ key_type s_missing_positions[block_size];
  __shared__ int s_missing_count;
  __shared__ int s_missing_idx;
  constexpr int tile_size = set_size;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<tile_size>(block);
  if (block.thread_rank() == 0) {
    s_missing_count = 0;
  }
  block.sync();
  size_t idx = grid.thread_rank();
  key_type query_key = std::numeric_limits<key_type>::max();
  if (idx < num_keys) {
    query_key = query_keys[idx];
  }
  auto idx_set = hash(query_key) % num_sets;
  for (int j = 0; j < tile_size; j++) {
    auto current_idx_set = tile.shfl(idx_set, j);
    auto current_query_key = tile.shfl(query_key, j);
    if (current_query_key == std::numeric_limits<key_type>::max()) {
      continue;
    }
    auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
    int existed = tile.ballot(current_query_key == candidate_key);
    if (existed) {
      int src_lane = __ffs(existed) - 1;
      size_t found_idx = current_idx_set * set_size + src_lane;
      output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set;
    } else {
      auto current_idx = tile.shfl(idx, 0) + j;
      output_indices[current_idx] = std::numeric_limits<index_type>::max();
      if (tile.thread_rank() == 0) {
        int s_count = atomicAdd(&s_missing_count, 1);
        s_missing_keys[s_count] = current_query_key;
        s_missing_positions[s_count] = current_idx;
      }
    }
  }
  if (missing_keys == nullptr) {
    if (grid.thread_rank() == 0 && missing_count) {
      *missing_count = 0;
    }
    return;
  }
  block.sync();
  if (block.thread_rank() == 0) {
    s_missing_idx = atomicAdd(missing_count, s_missing_count);
  }
  block.sync();
  for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
    missing_keys[s_missing_idx + i] = s_missing_keys[i];
    missing_positions[s_missing_idx + i] = s_missing_positions[i];
  }
}
template <int warp_size>
__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx,
                                               const size_t emb_vec_size_in_float,
                                               volatile float* d_dst, const float* d_src) {
  // 16 bytes align
  if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) {
#pragma unroll
    for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) {
      d_dst[i] = d_src[i];
    }
  } else {
#pragma unroll
    for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) {
      *(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4));
    }
  }
}

template <typename index_type, typename vec_type>
__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys,
                                    const vec_type* vectors, const int vec_size,
                                    vec_type* output_vectors) {
  constexpr int warp_size = 32;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
  auto num_threads_per_grid = grid.size();
#else
  auto num_threads_per_grid = grid.num_threads();
#endif
  for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
    int key_num = step * num_threads_per_grid + grid.thread_rank();
    index_type idx = std::numeric_limits<index_type>::max();
    if (key_num < num_keys) {
      idx = query_indices[key_num];
    }
#pragma unroll 4
    for (size_t j = 0; j < warp_size; j++) {
      index_type current_idx = tile.shfl(idx, j);
      index_type idx_write = tile.shfl(key_num, 0) + j;
      if (current_idx == std::numeric_limits<index_type>::max()) continue;
      warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
                                output_vectors + idx_write * vec_size,
                                vectors + current_idx * vec_size);
    }
  }
}

template <typename index_type, typename vec_type>
__global__ void distribute_vectors_kernel(const index_type* postions, const size_t num_keys,
                                          const vec_type* vectors, const int vec_size,
                                          vec_type* output_vectors) {
  constexpr int warp_size = 32;
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
  auto num_threads_per_grid = grid.size();
#else
  auto num_threads_per_grid = grid.num_threads();
#endif
  for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
    size_t key_num = step * num_threads_per_grid + grid.thread_rank();
    index_type idx = std::numeric_limits<index_type>::max();
    if (key_num < num_keys) {
      idx = postions[key_num];
    }
#pragma unroll 4
    for (size_t j = 0; j < warp_size; j++) {
      size_t idx_write = tile.shfl(idx, j);
      size_t idx_read = tile.shfl(key_num, 0) + j;
      if (idx_write == std::numeric_limits<index_type>::max()) continue;
      warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
                                output_vectors + (size_t)idx_write * vec_size,
                                vectors + (size_t)idx_read * vec_size);
    }
  }
}

}  // namespace
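Before the gpu_cache implementation below, it is worth spelling out what hash_add_kernel above does: each key hashes to one of num_sets fixed-size sets, claims a slot in that set with an atomic counter, and is reported back as missing if the set is already full. A standalone host-side mirror of that placement rule, for illustration only and not part of the file:

#include <cstdint>
#include <vector>

struct SetAssocTable {
  std::vector<int64_t> keys;   // num_sets * max_set_size slots
  std::vector<int> set_sizes;  // current fill of each set
  size_t num_sets;
  int max_set_size;

  SetAssocTable(size_t num_sets, int max_set_size)
      : keys(num_sets * max_set_size), set_sizes(num_sets, 0),
        num_sets(num_sets), max_set_size(max_set_size) {}

  // Returns true if the key found a slot, false if its set was full (the key is "missing").
  bool add(int64_t key) {
    size_t idx_set = static_cast<size_t>(key) % num_sets;  // hash(key) is the identity here
    int prev = set_sizes[idx_set]++;
    if (prev < max_set_size) {
      keys[idx_set * max_set_size + prev] = key;
      return true;
    }
    return false;
  }
};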
namespace gpu_cache {

template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity,
                                                   const size_t host_table_capacity,
                                                   const int max_batch_size, const int vec_size,
                                                   const vec_type default_value)
    : max_batch_size_(std::max(100000, max_batch_size)),
      vec_size_(vec_size),
      num_set_((device_table_capacity - 1) / set_size + 1),
      num_host_set_((host_table_capacity - 1) / set_size + 1),
      table_capacity_(num_set_ * set_size),
      default_vector_(vec_size, default_value),
      device_table_(device_table_capacity, set_size, max_batch_size_),
      host_table_(host_table_capacity * 1.3, set_size, max_batch_size_) {
  CUDA_CHECK(cudaMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_));
  CUDA_CHECK(cudaMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_));
  CUDA_CHECK(cudaMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(cudaMallocHost(&h_output_host_indices_, sizeof(index_type) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_));
  CUDA_CHECK(cudaMalloc(&d_missing_count_, sizeof(int)));
  CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(int)));
  CUDA_CHECK(cudaStreamCreate(&query_stream_));

  for (int i = 0; i < num_buffers_; i++) {
    int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_);
    CUDA_CHECK(cudaMallocHost(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
    CUDA_CHECK(cudaMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
    CUDA_CHECK(cudaStreamCreate(&cpy_streams_[i]));
    CUDA_CHECK(cudaEventCreate(&cpy_events_[i]));
  }
  CUDA_CHECK(cudaMallocHost(&h_missing_keys_, sizeof(key_type) * max_batch_size_));
  CUDA_CHECK(cudaEventCreate(&query_event_));
  h_vectors_.resize(host_table_.capacity * vec_size_);
}
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
>
void
UvmTable
<
key_type
,
index_type
,
vec_type
>::
add
(
const
key_type
*
h_keys
,
const
vec_type
*
h_vectors
,
const
size_t
num_keys
)
{
std
::
vector
<
key_type
>
h_missing_keys
;
size_t
num_batches
=
(
num_keys
-
1
)
/
max_batch_size_
+
1
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
max_batch_size_
:
num_keys
-
i
*
max_batch_size_
;
CUDA_CHECK
(
cudaMemcpy
(
d_keys_buffer_
,
h_keys
+
i
*
max_batch_size_
,
sizeof
(
*
d_keys_buffer_
)
*
this_batch_size
,
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaMemset
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
)));
device_table_
.
add
(
d_keys_buffer_
,
this_batch_size
,
d_missing_keys_
,
d_missing_count_
,
0
);
CUDA_CHECK
(
cudaDeviceSynchronize
());
int
num_missing_keys
;
CUDA_CHECK
(
cudaMemcpy
(
&
num_missing_keys
,
d_missing_count_
,
sizeof
(
num_missing_keys
),
cudaMemcpyDeviceToHost
));
size_t
prev_size
=
h_missing_keys
.
size
();
h_missing_keys
.
resize
(
prev_size
+
num_missing_keys
);
CUDA_CHECK
(
cudaMemcpy
(
h_missing_keys
.
data
()
+
prev_size
,
d_missing_keys_
,
sizeof
(
*
d_missing_keys_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
));
}
std
::
vector
<
key_type
>
h_final_missing_keys
;
num_batches
=
h_missing_keys
.
size
()
?
(
h_missing_keys
.
size
()
-
1
)
/
max_batch_size_
+
1
:
0
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
max_batch_size_
:
h_missing_keys
.
size
()
-
i
*
max_batch_size_
;
CUDA_CHECK
(
cudaMemcpy
(
d_keys_buffer_
,
h_missing_keys
.
data
()
+
i
*
max_batch_size_
,
sizeof
(
*
d_keys_buffer_
)
*
this_batch_size
,
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaMemset
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
)));
host_table_
.
add
(
d_keys_buffer_
,
this_batch_size
,
d_missing_keys_
,
d_missing_count_
,
0
);
CUDA_CHECK
(
cudaDeviceSynchronize
());
int
num_missing_keys
;
CUDA_CHECK
(
cudaMemcpy
(
&
num_missing_keys
,
d_missing_count_
,
sizeof
(
num_missing_keys
),
cudaMemcpyDeviceToHost
));
size_t
prev_size
=
h_final_missing_keys
.
size
();
h_final_missing_keys
.
resize
(
prev_size
+
num_missing_keys
);
CUDA_CHECK
(
cudaMemcpy
(
h_final_missing_keys
.
data
()
+
prev_size
,
d_missing_keys_
,
sizeof
(
*
d_missing_keys_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
));
}
std
::
vector
<
key_type
>
h_keys_buffer
(
max_batch_size_
);
std
::
vector
<
index_type
>
h_indices_buffer
(
max_batch_size_
);
std
::
vector
<
int
>
h_positions_buffer
(
max_batch_size_
);
num_batches
=
(
num_keys
-
1
)
/
max_batch_size_
+
1
;
size_t
num_hit_keys
=
0
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
max_batch_size_
:
num_keys
-
i
*
max_batch_size_
;
CUDA_CHECK
(
cudaMemcpy
(
d_keys_buffer_
,
h_keys
+
i
*
max_batch_size_
,
sizeof
(
*
d_keys_buffer_
)
*
this_batch_size
,
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaMemset
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
)));
device_table_
.
query
(
d_keys_buffer_
,
this_batch_size
,
d_output_indices_
,
d_missing_keys_
,
d_missing_positions_
,
d_missing_count_
,
0
);
CUDA_CHECK
(
cudaStreamSynchronize
(
0
));
CUDA_CHECK
(
cudaMemcpy
(
d_vectors_buffer_
,
h_vectors
+
i
*
max_batch_size_
*
vec_size_
,
sizeof
(
*
d_vectors_
)
*
this_batch_size
*
vec_size_
,
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaStreamSynchronize
(
0
));
if
(
num_hit_keys
<
device_table_
.
capacity
)
{
distribute_vectors_kernel
<<<
(
this_batch_size
-
1
)
/
block_size
+
1
,
block_size
,
0
,
0
>>>
(
d_output_indices_
,
this_batch_size
,
d_vectors_buffer_
,
vec_size_
,
d_vectors_
);
CUDA_CHECK
(
cudaStreamSynchronize
(
0
));
}
int
num_missing_keys
;
CUDA_CHECK
(
cudaMemcpy
(
&
num_missing_keys
,
d_missing_count_
,
sizeof
(
num_missing_keys
),
cudaMemcpyDeviceToHost
));
num_hit_keys
+=
this_batch_size
-
num_missing_keys
;
host_table_
.
query
(
d_missing_keys_
,
num_missing_keys
,
d_output_indices_
,
nullptr
,
nullptr
,
nullptr
,
0
);
CUDA_CHECK
(
cudaMemcpy
(
h_keys_buffer
.
data
(),
d_missing_keys_
,
sizeof
(
*
d_missing_keys_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
))
CUDA_CHECK
(
cudaMemcpy
(
h_indices_buffer
.
data
(),
d_output_indices_
,
sizeof
(
*
d_output_indices_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
))
CUDA_CHECK
(
cudaMemcpy
(
h_positions_buffer
.
data
(),
d_missing_positions_
,
sizeof
(
*
d_missing_positions_
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
))
for
(
int
j
=
0
;
j
<
num_missing_keys
;
j
++
)
{
if
(
h_indices_buffer
[
j
]
!=
std
::
numeric_limits
<
index_type
>::
max
())
{
memcpy
(
h_vectors_
.
data
()
+
h_indices_buffer
[
j
]
*
vec_size_
,
h_vectors
+
(
i
*
max_batch_size_
+
h_positions_buffer
[
j
])
*
vec_size_
,
sizeof
(
*
h_vectors
)
*
vec_size_
);
}
else
{
size_t
prev_idx
=
h_vectors_
.
size
()
/
vec_size_
;
h_final_missing_items_
.
emplace
(
h_keys_buffer
[
j
],
prev_idx
);
h_vectors_
.
resize
(
h_vectors_
.
size
()
+
vec_size_
);
memcpy
(
h_vectors_
.
data
()
+
prev_idx
*
vec_size_
,
h_vectors
+
(
i
*
max_batch_size_
+
h_positions_buffer
[
j
])
*
vec_size_
,
sizeof
(
*
h_vectors
)
*
vec_size_
);
}
}
}
CUDA_CHECK
(
cudaMemset
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
)));
}
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
>
void
UvmTable
<
key_type
,
index_type
,
vec_type
>::
query
(
const
key_type
*
d_keys
,
const
int
num_keys
,
vec_type
*
d_vectors
,
cudaStream_t
stream
)
{
if
(
!
num_keys
)
return
;
CUDA_CHECK
(
cudaEventRecord
(
query_event_
,
stream
));
CUDA_CHECK
(
cudaStreamWaitEvent
(
query_stream_
,
query_event_
));
static_assert
(
num_buffers_
>=
2
);
device_table_
.
query
(
d_keys
,
num_keys
,
d_output_indices_
,
d_missing_keys_
,
d_missing_positions_
,
d_missing_count_
,
query_stream_
);
CUDA_CHECK
(
cudaEventRecord
(
query_event_
,
query_stream_
));
CUDA_CHECK
(
cudaStreamWaitEvent
(
cpy_streams_
[
0
],
query_event_
));
int
num_missing_keys
;
CUDA_CHECK
(
cudaMemcpyAsync
(
&
num_missing_keys
,
d_missing_count_
,
sizeof
(
*
d_missing_count_
),
cudaMemcpyDeviceToHost
,
cpy_streams_
[
0
]));
host_table_
.
query
(
d_missing_keys_
,
d_missing_count_
,
d_output_host_indices_
,
query_stream_
);
CUDA_CHECK
(
cudaStreamSynchronize
(
cpy_streams_
[
0
]));
CUDA_CHECK
(
cudaMemsetAsync
(
d_missing_count_
,
0
,
sizeof
(
*
d_missing_count_
),
query_stream_
));
CUDA_CHECK
(
cudaMemcpyAsync
(
h_output_host_indices_
,
d_output_host_indices_
,
sizeof
(
index_type
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
,
query_stream_
));
CUDA_CHECK
(
cudaMemcpyAsync
(
h_missing_keys_
,
d_missing_keys_
,
sizeof
(
key_type
)
*
num_missing_keys
,
cudaMemcpyDeviceToHost
,
cpy_streams_
[
0
]));
read_vectors_kernel
<<<
(
num_keys
-
1
)
/
block_size
+
1
,
block_size
,
0
,
cpy_streams_
[
1
]
>>>
(
d_output_indices_
,
num_keys
,
d_vectors_
,
vec_size_
,
d_vectors
);
CUDA_CHECK
(
cudaStreamSynchronize
(
query_stream_
));
CUDA_CHECK
(
cudaStreamSynchronize
(
cpy_streams_
[
0
]));
int
num_keys_per_buffer
=
ceil
(
1.0
*
num_missing_keys
/
num_buffers_
);
for
(
int
buffer_num
=
0
;
buffer_num
<
num_buffers_
;
buffer_num
++
)
{
int
num_keys_this_buffer
=
buffer_num
!=
num_buffers_
-
1
?
num_keys_per_buffer
:
num_missing_keys
-
num_keys_per_buffer
*
buffer_num
;
if
(
!
num_keys_this_buffer
)
break
;
#pragma omp parallel for num_threads(8)
for
(
size_t
i
=
0
;
i
<
static_cast
<
size_t
>
(
num_keys_this_buffer
);
i
++
)
{
size_t
idx_key
=
buffer_num
*
num_keys_per_buffer
+
i
;
index_type
index
=
h_output_host_indices_
[
idx_key
];
if
(
index
==
std
::
numeric_limits
<
index_type
>::
max
())
{
key_type
key
=
h_missing_keys_
[
idx_key
];
auto
iterator
=
h_final_missing_items_
.
find
(
key
);
if
(
iterator
!=
h_final_missing_items_
.
end
())
{
index
=
iterator
->
second
;
}
}
if
(
index
!=
std
::
numeric_limits
<
index_type
>::
max
())
{
memcpy
(
h_cpy_buffers_
[
buffer_num
]
+
i
*
vec_size_
,
h_vectors_
.
data
()
+
index
*
vec_size_
,
sizeof
(
vec_type
)
*
vec_size_
);
}
else
{
memcpy
(
h_cpy_buffers_
[
buffer_num
]
+
i
*
vec_size_
,
default_vector_
.
data
(),
sizeof
(
vec_type
)
*
vec_size_
);
}
}
CUDA_CHECK
(
cudaMemcpyAsync
(
d_cpy_buffers_
[
buffer_num
],
h_cpy_buffers_
[
buffer_num
],
sizeof
(
vec_type
)
*
num_keys_this_buffer
*
vec_size_
,
cudaMemcpyHostToDevice
,
cpy_streams_
[
buffer_num
]));
distribute_vectors_kernel
<<<
(
num_keys_this_buffer
-
1
)
/
block_size
+
1
,
block_size
,
0
,
cpy_streams_
[
buffer_num
]
>>>
(
d_missing_positions_
+
buffer_num
*
num_keys_per_buffer
,
num_keys_this_buffer
,
d_cpy_buffers_
[
buffer_num
],
vec_size_
,
d_vectors
);
}
for
(
int
i
=
0
;
i
<
num_buffers_
;
i
++
)
{
CUDA_CHECK
(
cudaEventRecord
(
cpy_events_
[
i
],
cpy_streams_
[
i
]));
CUDA_CHECK
(
cudaStreamWaitEvent
(
stream
,
cpy_events_
[
i
]));
}
}
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
>
void
UvmTable
<
key_type
,
index_type
,
vec_type
>::
clear
(
cudaStream_t
stream
)
{
device_table_
.
clear
(
stream
);
host_table_
.
clear
(
stream
);
}
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
>
UvmTable
<
key_type
,
index_type
,
vec_type
>::~
UvmTable
()
{
CUDA_CHECK
(
cudaFree
(
d_keys_buffer_
));
CUDA_CHECK
(
cudaFree
(
d_vectors_buffer_
));
CUDA_CHECK
(
cudaFree
(
d_vectors_
));
CUDA_CHECK
(
cudaFree
(
d_output_indices_
));
CUDA_CHECK
(
cudaFree
(
d_output_host_indices_
));
CUDA_CHECK
(
cudaFreeHost
(
h_output_host_indices_
));
CUDA_CHECK
(
cudaFree
(
d_missing_keys_
));
CUDA_CHECK
(
cudaFree
(
d_missing_positions_
));
CUDA_CHECK
(
cudaFree
(
d_missing_count_
));
CUDA_CHECK
(
cudaFreeHost
(
h_missing_keys_
));
CUDA_CHECK
(
cudaStreamDestroy
(
query_stream_
));
CUDA_CHECK
(
cudaEventDestroy
(
query_event_
));
for
(
int
i
=
0
;
i
<
num_buffers_
;
i
++
)
{
CUDA_CHECK
(
cudaFreeHost
(
h_cpy_buffers_
[
i
]));
CUDA_CHECK
(
cudaFree
(
d_cpy_buffers_
[
i
]));
CUDA_CHECK
(
cudaStreamDestroy
(
cpy_streams_
[
i
]));
CUDA_CHECK
(
cudaEventDestroy
(
cpy_events_
[
i
]));
}
}
template
<
typename
key_type
,
typename
index_type
>
HashBlock
<
key_type
,
index_type
>::
HashBlock
(
size_t
expected_capacity
,
int
set_size
,
int
batch_size
)
:
max_set_size_
(
set_size
),
batch_size_
(
batch_size
)
{
if
(
expected_capacity
)
{
num_sets
=
(
expected_capacity
-
1
)
/
set_size
+
1
;
}
else
{
num_sets
=
10000
;
}
capacity
=
num_sets
*
set_size
;
CUDA_CHECK
(
cudaMalloc
(
&
keys
,
sizeof
(
*
keys
)
*
capacity
));
CUDA_CHECK
(
cudaMalloc
(
&
set_sizes_
,
sizeof
(
*
set_sizes_
)
*
num_sets
));
CUDA_CHECK
(
cudaMemset
(
set_sizes_
,
0
,
sizeof
(
*
set_sizes_
)
*
num_sets
));
}
template
<
typename
key_type
,
typename
index_type
>
HashBlock
<
key_type
,
index_type
>::~
HashBlock
()
{
CUDA_CHECK
(
cudaFree
(
keys
));
CUDA_CHECK
(
cudaFree
(
set_sizes_
));
}
template
<
typename
key_type
,
typename
index_type
>
void
HashBlock
<
key_type
,
index_type
>::
query
(
const
key_type
*
query_keys
,
const
size_t
num_keys
,
index_type
*
output_indices
,
key_type
*
missing_keys
,
int
*
missing_positions
,
int
*
num_missing_keys
,
cudaStream_t
stream
)
{
if
(
num_keys
==
0
)
{
return
;
}
size_t
num_batches
=
(
num_keys
-
1
)
/
batch_size_
+
1
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
batch_size_
:
num_keys
-
i
*
batch_size_
;
hash_query_kernel
<<<
(
this_batch_size
-
1
)
/
block_size
+
1
,
block_size
,
0
,
stream
>>>
(
query_keys
,
this_batch_size
,
keys
,
num_sets
,
max_set_size_
,
output_indices
,
missing_keys
,
missing_positions
,
num_missing_keys
);
}
}
template
<
typename
key_type
,
typename
index_type
>
void
HashBlock
<
key_type
,
index_type
>::
query
(
const
key_type
*
query_keys
,
int
*
num_keys
,
index_type
*
output_indices
,
cudaStream_t
stream
)
{
hash_query_kernel
<<<
128
,
64
,
0
,
stream
>>>
(
query_keys
,
num_keys
,
keys
,
num_sets
,
max_set_size_
,
output_indices
);
}
template
<
typename
key_type
,
typename
index_type
>
void
HashBlock
<
key_type
,
index_type
>::
add
(
const
key_type
*
new_keys
,
const
size_t
num_keys
,
key_type
*
missing_keys
,
int
*
num_missing_keys
,
cudaStream_t
stream
)
{
if
(
num_keys
==
0
)
{
return
;
}
size_t
num_batches
=
(
num_keys
-
1
)
/
batch_size_
+
1
;
for
(
size_t
i
=
0
;
i
<
num_batches
;
i
++
)
{
size_t
this_batch_size
=
i
!=
num_batches
-
1
?
batch_size_
:
num_keys
-
i
*
batch_size_
;
hash_add_kernel
<<<
(
this_batch_size
-
1
)
/
block_size
+
1
,
block_size
,
0
,
stream
>>>
(
new_keys
+
i
*
this_batch_size
,
this_batch_size
,
keys
,
num_sets
,
set_sizes_
,
max_set_size_
,
missing_keys
,
num_missing_keys
);
}
}
template
<
typename
key_type
,
typename
index_type
>
void
HashBlock
<
key_type
,
index_type
>::
clear
(
cudaStream_t
stream
)
{
CUDA_CHECK
(
cudaMemsetAsync
(
set_sizes_
,
0
,
sizeof
(
*
set_sizes_
)
*
num_sets
,
stream
));
}
template
class
HashBlock
<
int
,
size_t
>;
template
class
HashBlock
<
int64_t
,
size_t
>;
template
class
HashBlock
<
size_t
,
size_t
>;
template
class
HashBlock
<
unsigned
int
,
size_t
>;
template
class
HashBlock
<
long
long
,
size_t
>;
template
class
UvmTable
<
int
,
size_t
>;
template
class
UvmTable
<
int64_t
,
size_t
>;
template
class
UvmTable
<
size_t
,
size_t
>;
template
class
UvmTable
<
unsigned
int
,
size_t
>;
template
class
UvmTable
<
long
long
,
size_t
>;
// !!! This is a file automatically generated by hipify!!!
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <hip/hip_cooperative_groups.h>
#include <hip/hip_runtime_api.h>
#include <immintrin.h>
#include <atomic>
#include <iostream>
#include <limits>
#include <mutex>
#include <uvm_table.hpp>
namespace cg = cooperative_groups;
namespace {
constexpr int set_size = 4;
constexpr int block_size = 256;
template <typename key_type>
__host__ __device__ key_type hash(key_type key) {
return key;
}
template <typename key_type>
__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys,
const int num_sets, int* set_sizes, const int max_set_size,
key_type* missing_keys, int* num_missing_keys) {
__shared__ key_type s_missing_keys[block_size];
__shared__ int s_missing_count;
__shared__ size_t s_missing_idx;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
if (block.thread_rank() == 0) {
s_missing_count = 0;
}
block.sync();
size_t idx = grid.thread_rank();
if (idx < num_keys) {
auto key = new_keys[idx];
size_t idx_set = hash(key) % num_sets;
int prev_set_size = atomicAdd(&set_sizes[idx_set], 1);
if (prev_set_size < max_set_size) {
keys[idx_set * max_set_size + prev_set_size] = key;
} else {
int count = atomicAdd(&s_missing_count, 1);
s_missing_keys[count] = key;
}
}
block.sync();
if (block.thread_rank() == 0) {
s_missing_idx = atomicAdd(num_missing_keys, s_missing_count);
}
block.sync();
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_keys[s_missing_idx + i] = s_missing_keys[i];
}
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr,
const key_type* keys, const size_t num_sets,
const int max_set_size, index_type* output_indices) {
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
int num_keys = *num_keys_ptr;
if (num_keys == 0) return;
#if (DTK_VERSION < 11060)
size_t num_threads_per_grid = grid.size();
#else
size_t num_threads_per_grid = grid.num_threads();
#endif
size_t step = (num_keys - 1) / num_threads_per_grid + 1;
for (size_t i = 0; i < step; i++) {
size_t idx = i * num_threads_per_grid + grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max();
if (idx < num_keys) {
query_key = query_keys[idx];
}
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j);
auto current_query_key = tile.shfl(query_key, j);
if (current_query_key == std::numeric_limits<key_type>::max()) {
continue;
}
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
int existed = tile.ballot(current_query_key == candidate_key);
auto current_idx = tile.shfl(idx, 0) + j;
if (existed) {
int src_lane = __ffs(existed) - 1;
size_t found_idx = current_idx_set * set_size + src_lane;
output_indices[current_idx] = num_sets * src_lane + current_idx_set;
} else {
output_indices[current_idx] = std::numeric_limits<index_type>::max();
}
}
}
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys,
const key_type* keys, const size_t num_sets,
const int max_set_size, index_type* output_indices,
key_type* missing_keys, int* missing_positions,
int* missing_count) {
__shared__ key_type s_missing_keys[block_size];
__shared__ key_type s_missing_positions[block_size];
__shared__ int s_missing_count;
__shared__ int s_missing_idx;
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
if (block.thread_rank() == 0) {
s_missing_count = 0;
}
block.sync();
size_t idx = grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max();
if (idx < num_keys) {
query_key = query_keys[idx];
}
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j);
auto current_query_key = tile.shfl(query_key, j);
if (current_query_key == std::numeric_limits<key_type>::max()) {
continue;
}
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
int existed = tile.ballot(current_query_key == candidate_key);
if (existed) {
int src_lane = __ffs(existed) - 1;
size_t found_idx = current_idx_set * set_size + src_lane;
output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set;
} else {
auto current_idx = tile.shfl(idx, 0) + j;
output_indices[current_idx] = std::numeric_limits<index_type>::max();
if (tile.thread_rank() == 0) {
int s_count = atomicAdd(&s_missing_count, 1);
s_missing_keys[s_count] = current_query_key;
s_missing_positions[s_count] = current_idx;
}
}
}
if (missing_keys == nullptr) {
if (grid.thread_rank() == 0 && missing_count) {
*missing_count = 0;
}
return;
}
block.sync();
if (block.thread_rank() == 0) {
s_missing_idx = atomicAdd(missing_count, s_missing_count);
}
block.sync();
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_keys[s_missing_idx + i] = s_missing_keys[i];
missing_positions[s_missing_idx + i] = s_missing_positions[i];
}
}
template <int warp_size>
__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx,
const size_t emb_vec_size_in_float,
volatile float* d_dst, const float* d_src) {
// 16 bytes align
if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) {
#pragma unroll
for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) {
d_dst[i] = d_src[i];
}
} else {
#pragma unroll
for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) {
*(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4));
}
}
}
template <typename index_type, typename vec_type>
__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys,
const vec_type* vectors, const int vec_size,
vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (DTK_VERSION < 11060)
auto num_threads_per_grid = grid.size();
#else
auto num_threads_per_grid = grid.num_threads();
#endif
for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
int key_num = step * num_threads_per_grid + grid.thread_rank();
index_type idx = std::numeric_limits<index_type>::max();
if (key_num < num_keys) {
idx = query_indices[key_num];
}
#pragma unroll 4
for (size_t j = 0; j < warp_size; j++) {
index_type current_idx = tile.shfl(idx, j);
index_type idx_write = tile.shfl(key_num, 0) + j;
if (current_idx == std::numeric_limits<index_type>::max()) continue;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size, output_vectors + idx_write * vec_size,
vectors + current_idx * vec_size);
}
}
}
template <typename index_type, typename vec_type>
__global__ void distribute_vectors_kernel(const index_type* postions, const size_t num_keys,
const vec_type* vectors, const int vec_size,
vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (DTK_VERSION < 11060)
auto num_threads_per_grid = grid.size();
#else
auto num_threads_per_grid = grid.num_threads();
#endif
for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
size_t key_num = step * num_threads_per_grid + grid.thread_rank();
index_type idx = std::numeric_limits<index_type>::max();
if (key_num < num_keys) {
idx = postions[key_num];
}
#pragma unroll 4
for (size_t j = 0; j < warp_size; j++) {
size_t idx_write = tile.shfl(idx, j);
size_t idx_read = tile.shfl(key_num, 0) + j;
if (idx_write == std::numeric_limits<index_type>::max()) continue;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
output_vectors + (size_t)idx_write * vec_size,
vectors + (size_t)idx_read * vec_size);
}
}
}
} // namespace
namespace gpu_cache {
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity,
const size_t host_table_capacity,
const int max_batch_size, const int vec_size,
const vec_type default_value)
: max_batch_size_(::max(100000, max_batch_size)),
vec_size_(vec_size),
num_set_((device_table_capacity - 1) / set_size + 1),
num_host_set_((host_table_capacity - 1) / set_size + 1),
table_capacity_(num_set_ * set_size),
default_vector_(vec_size, default_value),
device_table_(device_table_capacity, set_size, max_batch_size_),
host_table_(host_table_capacity * 1.3, set_size, max_batch_size_) {
CUDA_CHECK(hipMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_));
CUDA_CHECK(hipMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_));
CUDA_CHECK(hipMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(hipHostMalloc(&h_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_));
CUDA_CHECK(hipMalloc(&d_missing_count_, sizeof(int)));
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(int)));
CUDA_CHECK(hipStreamCreate(&query_stream_));
for (int i = 0; i < num_buffers_; i++) {
int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_);
CUDA_CHECK(
hipHostMalloc(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(hipMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(hipStreamCreate(&cpy_streams_[i]));
CUDA_CHECK(hipEventCreate(&cpy_events_[i]));
}
CUDA_CHECK(hipHostMalloc(&h_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(hipEventCreate(&query_event_));
h_vectors_.resize(host_table_.capacity * vec_size_);
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::add(const key_type* h_keys,
const vec_type* h_vectors,
const size_t num_keys) {
std::vector<key_type> h_missing_keys;
size_t num_batches = (num_keys - 1) / max_batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
device_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
CUDA_CHECK(hipDeviceSynchronize());
int num_missing_keys;
CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
hipMemcpyDeviceToHost));
size_t prev_size = h_missing_keys.size();
h_missing_keys.resize(prev_size + num_missing_keys);
CUDA_CHECK(hipMemcpy(h_missing_keys.data() + prev_size, d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
}
std::vector<key_type> h_final_missing_keys;
num_batches = h_missing_keys.size() ? (h_missing_keys.size() - 1) / max_batch_size_ + 1 : 0;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : h_missing_keys.size() - i * max_batch_size_;
CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_missing_keys.data() + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
host_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
CUDA_CHECK(hipDeviceSynchronize());
int num_missing_keys;
CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
hipMemcpyDeviceToHost));
size_t prev_size = h_final_missing_keys.size();
h_final_missing_keys.resize(prev_size + num_missing_keys);
CUDA_CHECK(hipMemcpy(h_final_missing_keys.data() + prev_size, d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost));
}
std::vector<key_type> h_keys_buffer(max_batch_size_);
std::vector<index_type> h_indices_buffer(max_batch_size_);
std::vector<int> h_positions_buffer(max_batch_size_);
num_batches = (num_keys - 1) / max_batch_size_ + 1;
size_t num_hit_keys = 0;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
CUDA_CHECK(hipMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, hipMemcpyHostToDevice));
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
device_table_.query(d_keys_buffer_, this_batch_size, d_output_indices_, d_missing_keys_,
d_missing_positions_, d_missing_count_, 0);
CUDA_CHECK(hipStreamSynchronize(0));
CUDA_CHECK(hipMemcpy(d_vectors_buffer_, h_vectors + i * max_batch_size_ * vec_size_,
sizeof(*d_vectors_) * this_batch_size * vec_size_,
hipMemcpyHostToDevice));
CUDA_CHECK(hipStreamSynchronize(0));
if (num_hit_keys < device_table_.capacity) {
hipLaunchKernelGGL(( distribute_vectors_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, 0,
d_output_indices_, this_batch_size, d_vectors_buffer_, vec_size_, d_vectors_);
CUDA_CHECK(hipStreamSynchronize(0));
}
int num_missing_keys;
CUDA_CHECK(hipMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
hipMemcpyDeviceToHost));
num_hit_keys += this_batch_size - num_missing_keys;
host_table_.query(d_missing_keys_, num_missing_keys, d_output_indices_, nullptr, nullptr,
nullptr, 0);
CUDA_CHECK(hipMemcpy(h_keys_buffer.data(), d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, hipMemcpyDeviceToHost))
CUDA_CHECK(hipMemcpy(h_indices_buffer.data(), d_output_indices_,
sizeof(*d_output_indices_) * num_missing_keys, hipMemcpyDeviceToHost))
CUDA_CHECK(hipMemcpy(h_positions_buffer.data(), d_missing_positions_,
sizeof(*d_missing_positions_) * num_missing_keys, hipMemcpyDeviceToHost))
for (int j = 0; j < num_missing_keys; j++) {
if (h_indices_buffer[j] != std::numeric_limits<index_type>::max()) {
memcpy(h_vectors_.data() + h_indices_buffer[j] * vec_size_,
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
sizeof(*h_vectors) * vec_size_);
} else {
size_t prev_idx = h_vectors_.size() / vec_size_;
h_final_missing_items_.emplace(h_keys_buffer[j], prev_idx);
h_vectors_.resize(h_vectors_.size() + vec_size_);
memcpy(h_vectors_.data() + prev_idx * vec_size_,
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
sizeof(*h_vectors) * vec_size_);
}
}
}
CUDA_CHECK(hipMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::query(const key_type* d_keys, const int num_keys,
vec_type* d_vectors, hipStream_t stream) {
if (!num_keys) return;
CUDA_CHECK(hipEventRecord(query_event_, stream));
CUDA_CHECK(hipStreamWaitEvent(query_stream_, query_event_));
static_assert(num_buffers_ >= 2);
device_table_.query(d_keys, num_keys, d_output_indices_, d_missing_keys_, d_missing_positions_,
d_missing_count_, query_stream_);
CUDA_CHECK(hipEventRecord(query_event_, query_stream_));
CUDA_CHECK(hipStreamWaitEvent(cpy_streams_[0], query_event_));
int num_missing_keys;
CUDA_CHECK(hipMemcpyAsync(&num_missing_keys, d_missing_count_, sizeof(*d_missing_count_),
hipMemcpyDeviceToHost, cpy_streams_[0]));
host_table_.query(d_missing_keys_, d_missing_count_, d_output_host_indices_, query_stream_);
CUDA_CHECK(hipStreamSynchronize(cpy_streams_[0]));
CUDA_CHECK(hipMemsetAsync(d_missing_count_, 0, sizeof(*d_missing_count_), query_stream_));
CUDA_CHECK(hipMemcpyAsync(h_output_host_indices_, d_output_host_indices_,
sizeof(index_type) * num_missing_keys, hipMemcpyDeviceToHost,
query_stream_));
CUDA_CHECK(hipMemcpyAsync(h_missing_keys_, d_missing_keys_, sizeof(key_type) * num_missing_keys,
hipMemcpyDeviceToHost, cpy_streams_[0]));
hipLaunchKernelGGL(( read_vectors_kernel), dim3((num_keys - 1) / block_size + 1), dim3(block_size), 0, cpy_streams_[1],
d_output_indices_, num_keys, d_vectors_, vec_size_, d_vectors);
CUDA_CHECK(hipStreamSynchronize(query_stream_));
CUDA_CHECK(hipStreamSynchronize(cpy_streams_[0]));
int num_keys_per_buffer = ceil(1.0 * num_missing_keys / num_buffers_);
for (int buffer_num = 0; buffer_num < num_buffers_; buffer_num++) {
int num_keys_this_buffer = buffer_num != num_buffers_ - 1
? num_keys_per_buffer
: num_missing_keys - num_keys_per_buffer * buffer_num;
if (!num_keys_this_buffer) break;
#pragma omp parallel for num_threads(8)
for (size_t i = 0; i < static_cast<size_t>(num_keys_this_buffer); i++) {
size_t idx_key = buffer_num * num_keys_per_buffer + i;
index_type index = h_output_host_indices_[idx_key];
if (index == std::numeric_limits<index_type>::max()) {
key_type key = h_missing_keys_[idx_key];
auto iterator = h_final_missing_items_.find(key);
if (iterator != h_final_missing_items_.end()) {
index = iterator->second;
}
}
if (index != std::numeric_limits<index_type>::max()) {
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, h_vectors_.data() + index * vec_size_,
sizeof(vec_type) * vec_size_);
} else {
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, default_vector_.data(),
sizeof(vec_type) * vec_size_);
}
}
CUDA_CHECK(hipMemcpyAsync(d_cpy_buffers_[buffer_num], h_cpy_buffers_[buffer_num],
sizeof(vec_type) * num_keys_this_buffer * vec_size_,
hipMemcpyHostToDevice, cpy_streams_[buffer_num]));
hipLaunchKernelGGL(( distribute_vectors_kernel), dim3((num_keys_this_buffer - 1) / block_size + 1), dim3(block_size), 0,
cpy_streams_[buffer_num],
d_missing_positions_ + buffer_num * num_keys_per_buffer, num_keys_this_buffer,
d_cpy_buffers_[buffer_num], vec_size_, d_vectors);
}
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(hipEventRecord(cpy_events_[i], cpy_streams_[i]));
CUDA_CHECK(hipStreamWaitEvent(stream, cpy_events_[i]));
}
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::clear(hipStream_t stream) {
device_table_.clear(stream);
host_table_.clear(stream);
}
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::~UvmTable() {
CUDA_CHECK(hipFree(d_keys_buffer_));
CUDA_CHECK(hipFree(d_vectors_buffer_));
CUDA_CHECK(hipFree(d_vectors_));
CUDA_CHECK(hipFree(d_output_indices_));
CUDA_CHECK(hipFree(d_output_host_indices_));
CUDA_CHECK(hipHostFree(h_output_host_indices_));
CUDA_CHECK(hipFree(d_missing_keys_));
CUDA_CHECK(hipFree(d_missing_positions_));
CUDA_CHECK(hipFree(d_missing_count_));
CUDA_CHECK(hipHostFree(h_missing_keys_));
CUDA_CHECK(hipStreamDestroy(query_stream_));
CUDA_CHECK(hipEventDestroy(query_event_));
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(hipHostFree(h_cpy_buffers_[i]));
CUDA_CHECK(hipFree(d_cpy_buffers_[i]));
CUDA_CHECK(hipStreamDestroy(cpy_streams_[i]));
CUDA_CHECK(hipEventDestroy(cpy_events_[i]));
}
}
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::HashBlock(size_t expected_capacity, int set_size, int batch_size)
: max_set_size_(set_size), batch_size_(batch_size) {
if (expected_capacity) {
num_sets = (expected_capacity - 1) / set_size + 1;
} else {
num_sets = 10000;
}
capacity = num_sets * set_size;
CUDA_CHECK(hipMalloc(&keys, sizeof(*keys) * capacity));
CUDA_CHECK(hipMalloc(&set_sizes_, sizeof(*set_sizes_) * num_sets));
CUDA_CHECK(hipMemset(set_sizes_, 0, sizeof(*set_sizes_) * num_sets));
}
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::~HashBlock() {
CUDA_CHECK(hipFree(keys));
CUDA_CHECK(hipFree(set_sizes_));
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, const size_t num_keys,
index_type* output_indices, key_type* missing_keys,
int* missing_positions, int* num_missing_keys,
hipStream_t stream) {
if (num_keys == 0) {
return;
}
size_t num_batches = (num_keys - 1) / batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
hipLaunchKernelGGL(( hash_query_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, stream,
query_keys, this_batch_size, keys, num_sets, max_set_size_, output_indices, missing_keys,
missing_positions, num_missing_keys);
}
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, int* num_keys,
index_type* output_indices, hipStream_t stream) {
hipLaunchKernelGGL(( hash_query_kernel), dim3(128), dim3(64), 0, stream, query_keys, num_keys, keys, num_sets, max_set_size_,
output_indices);
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::add(const key_type* new_keys, const size_t num_keys,
key_type* missing_keys, int* num_missing_keys,
hipStream_t stream) {
if (num_keys == 0) {
return;
}
size_t num_batches = (num_keys - 1) / batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
hipLaunchKernelGGL(( hash_add_kernel), dim3((this_batch_size - 1) / block_size + 1), dim3(block_size), 0, stream,
new_keys + i * this_batch_size, this_batch_size, keys, num_sets, set_sizes_, max_set_size_,
missing_keys, num_missing_keys);
}
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::clear(hipStream_t stream) {
CUDA_CHECK(hipMemsetAsync(set_sizes_, 0, sizeof(*set_sizes_) * num_sets, stream));
}
template class HashBlock<int, size_t>;
template class HashBlock<int64_t, size_t>;
template class HashBlock<size_t, size_t>;
template class HashBlock<unsigned int, size_t>;
template class HashBlock<long long, size_t>;
template class UvmTable<int, size_t>;
template class UvmTable<int64_t, size_t>;
template class UvmTable<size_t, size_t>;
template class UvmTable<unsigned int, size_t>;
template class UvmTable<long long, size_t>;
} // namespace gpu_cache
\ No newline at end of file
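For reference, the CUDA-to-HIP rewrite applied throughout this file follows one mechanical pattern: runtime calls swap their prefix (cudaMalloc → hipMalloc, cudaMemcpyAsync → hipMemcpyAsync, cudaStream_t → hipStream_t) and triple-chevron kernel launches become hipLaunchKernelGGL. The snippet below is only an illustrative sketch of that mapping on a toy kernel; the kernel, buffer names, and sizes are hypothetical and are not part of this commit.
// Minimal sketch (not part of the commit): the cuda -> hip mapping used above, on a toy kernel.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>
__global__ void scale_kernel(float* data, int n, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;  // each thread scales one element
}
int main() {
  constexpr int n = 1024;
  constexpr int block_size = 256;
  std::vector<float> h_data(n, 1.0f);
  float* d_data = nullptr;
  // cudaMalloc / cudaMemcpy / cudaFree become hipMalloc / hipMemcpy / hipFree one-to-one.
  hipMalloc(&d_data, n * sizeof(float));
  hipMemcpy(d_data, h_data.data(), n * sizeof(float), hipMemcpyHostToDevice);
  // scale_kernel<<<(n - 1) / block_size + 1, block_size, 0, 0>>>(d_data, n, 2.0f) becomes:
  hipLaunchKernelGGL(scale_kernel, dim3((n - 1) / block_size + 1), dim3(block_size), 0, 0,
                     d_data, n, 2.0f);
  hipMemcpy(h_data.data(), d_data, n * sizeof(float), hipMemcpyDeviceToHost);
  hipFree(d_data);
  printf("h_data[0] = %f\n", h_data[0]);  // expect 2.0
  return 0;
}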
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt
View file @
1d28bf8b
...
...
@@ -15,14 +15,14 @@
cmake_minimum_required(VERSION 3.8)
file(GLOB gpu_cache_test_src
  cache_op_sol_test.cu
  ../../HugeCTR/src/hps/embedding_cache_gpu.cu
  cache_op_sol_test.hip
  ../../HugeCTR/src/hps/embedding_cache_gpu.hip
)
add_executable(cache_op_sol_test ${gpu_cache_test_src})
target_compile_features(cache_op_sol_test PUBLIC cxx_std_17)
target_link_libraries(cache_op_sol_test PUBLIC gpu_cache)
target_link_libraries(cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX)
set_target_properties(cache_op_sol_test PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(cache_op_sol_test PROPERTIES CUDA_ARCHITECTURES OFF)
set_target_properties(cache_op_sol_test PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(cache_op_sol_test PROPERTIES HIP_ARCHITECTURES OFF)
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.cu → third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.hip
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
...
...
@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_
template <typename T>
bool is_near(T a, T b) {
double diff = abs(a - b);
bool ret = diff <= std::min(a, b) * 1e-6;
bool ret = diff <= ::min(a, b) * 1e-6;
if (!ret) {
std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl;
}
...
...
@@ -224,7 +226,7 @@ int main(int argc, char** argv) {
const size_t cache_type = atoi(argv[7]);
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK(cudaSetDevice(0));
CUDA_CHECK(hipSetDevice(0));
// Host side buffers shared between threads
key_type* h_keys; // Buffer holding all keys in embedding table
...
...
@@ -302,7 +304,7 @@ int main(int argc, char** argv) {
int thread_id = omp_get_thread_num();
printf("Worker %d starts testing cache.\n", thread_id);
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK(cudaSetDevice(0));
CUDA_CHECK(hipSetDevice(0));
// Thread-private host side buffers
size_t* h_query_keys_index; // Buffer holding index for keys to be queried
...
...
@@ -324,32 +326,32 @@ int main(int argc, char** argv) {
// host-only buffers placed in normal host memory
h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t));
// host-device interactive buffers placed in pinned memory
CUDA_CHECK(cudaHostAlloc((void**)&h_query_keys, query_length * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_query_keys, query_length * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_vals_retrieved,
                         query_length * embedding_vec_size * sizeof(float),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_vals_retrieved,
                         query_length * embedding_vec_size * sizeof(float),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_keys, query_length * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_keys, query_length * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_vals,
                         query_length * embedding_vec_size * sizeof(float),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_vals,
                         query_length * embedding_vec_size * sizeof(float),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
                         hipHostMallocPortable));
// Allocate device side buffers
CUDA_CHECK(cudaMalloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_len, sizeof(size_t)));
// Thread-private CUDA stream, all threads just use the #0 device
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
hipStream_t stream;
CUDA_CHECK(hipStreamCreate(&stream));
// Timimg variables
double time_1;
...
...
@@ -382,33 +384,33 @@ int main(int argc, char** argv) {
std::cout << std::endl;
// Copy the keys to GPU memory
CUDA_CHECK(cudaMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
                          hipMemcpyHostToDevice, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_1 = W_time();
// Get pairs from hashtable
cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n",
thread_id, i, time_2);
// Copy the data back to host
CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
                           query_length * embedding_vec_size * sizeof(float),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
                          query_length * embedding_vec_size * sizeof(float),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i,
h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f));
...
...
@@ -433,13 +435,13 @@ int main(int argc, char** argv) {
thread_id, i, time_2);
// Copy the missing value to device
CUDA_CHECK(cudaMemcpyAsync(d_missing_vals, h_missing_vals,
                           query_length * embedding_vec_size * sizeof(float),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_missing_vals, h_missing_vals,
                          query_length * embedding_vec_size * sizeof(float),
                          hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_vals_retrieved, h_vals_retrieved,
                           query_length * embedding_vec_size * sizeof(float),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_vals_retrieved, h_vals_retrieved,
                          query_length * embedding_vec_size * sizeof(float),
                          hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_1 = W_time();
...
...
@@ -449,7 +451,7 @@ int main(int argc, char** argv) {
else
cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n",
...
...
@@ -466,20 +468,20 @@ int main(int argc, char** argv) {
printf("Worker %d : All Finished!\n", thread_id);
// Clean-up
cudaStreamDestroy(stream);
hipStreamDestroy(stream);
free(h_query_keys_index);
CUDA_CHECK(cudaFreeHost(h_query_keys));
CUDA_CHECK(cudaFreeHost(h_vals_retrieved));
CUDA_CHECK(cudaFreeHost(h_missing_keys));
CUDA_CHECK(cudaFreeHost(h_missing_vals));
CUDA_CHECK(cudaFreeHost(h_missing_index));
CUDA_CHECK(cudaFree(d_query_keys));
CUDA_CHECK(cudaFree(d_vals_retrieved));
CUDA_CHECK(cudaFree(d_missing_keys));
CUDA_CHECK(cudaFree(d_missing_vals));
CUDA_CHECK(cudaFree(d_missing_index));
CUDA_CHECK(cudaFree(d_missing_len));
CUDA_CHECK(hipHostFree(h_query_keys));
CUDA_CHECK(hipHostFree(h_vals_retrieved));
CUDA_CHECK(hipHostFree(h_missing_keys));
CUDA_CHECK(hipHostFree(h_missing_vals));
CUDA_CHECK(hipHostFree(h_missing_index));
CUDA_CHECK(hipFree(d_query_keys));
CUDA_CHECK(hipFree(d_vals_retrieved));
CUDA_CHECK(hipFree(d_missing_keys));
CUDA_CHECK(hipFree(d_missing_vals));
CUDA_CHECK(hipFree(d_missing_index));
CUDA_CHECK(hipFree(d_missing_len));
}
// 1st test Clean-up
...
...
@@ -547,57 +549,57 @@ int main(int argc, char** argv) {
key_type* d_missing_keys;
size_t* d_missing_len;
CUDA_CHECK(cudaHostAlloc((void**)&h_insert_keys,
                         SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_insert_keys,
                         SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_insert_vals,
                         SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_insert_vals,
                         SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_dump_keys,
                         SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_dump_keys,
                         SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc(
    (void**)&h_vals_retrieved,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
    cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc(
    (void**)&h_vals_retrieved,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
    hipHostMallocPortable));
CUDA_CHECK(cudaHostAlloc((void**)&h_acc_keys,
                         SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
                         cudaHostAllocPortable));
CUDA_CHECK(hipHostMalloc((void**)&h_acc_keys,
                         SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
                         hipHostMallocPortable));
CUDA_CHECK(cudaMalloc((void**)&d_keys,
                      SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_keys,
                     SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
                                           embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
                                          embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_insert_vals,
                      SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc((void**)&d_insert_vals,
                     SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_dump_keys,
                      SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_dump_keys,
                     SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc(
    (void**)&d_vals_retrieved,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(hipMalloc(
    (void**)&d_vals_retrieved,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK(hipMalloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_index,
                      SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_index,
                     SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_keys,
                      SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(hipMalloc((void**)&d_missing_keys,
                     SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t)));
CUDA_CHECK(hipMalloc((void**)&d_missing_len, sizeof(size_t)));
// CUDA stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
hipStream_t stream;
CUDA_CHECK(hipStreamCreate(&stream));
// Copy all keys and values from host to device
CUDA_CHECK(cudaMemcpyAsync(
    d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
    cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(
    d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
    hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(
    d_vals, h_new_vals,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
    cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(
    d_vals, h_new_vals,
    SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
    hipMemcpyHostToDevice, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Each time insert 1 slab per slabset into the cache and check result
for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) {
...
...
@@ -615,17 +617,17 @@ int main(int argc, char** argv) {
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type));
// Copy the <k,v> pairs from host to device
CUDA_CHECK(cudaMemcpyAsync(d_insert_keys, h_insert_keys,
                           SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_insert_keys, h_insert_keys,
                          SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
                          hipMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_insert_vals, h_insert_vals,
                           SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(hipMemcpyAsync(d_insert_vals, h_insert_vals,
                          SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
                          hipMemcpyHostToDevice, stream));
// Insert the <k,v> pairs into the cache
cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Record time
time_a = W_time();
...
...
@@ -633,7 +635,7 @@ int main(int argc, char** argv) {
cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream,
SLAB_SIZE);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b);
...
...
@@ -644,31 +646,31 @@ int main(int argc, char** argv) {
// Dump the keys from the cache
cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Elapsed wall time
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b);
// Copy the dump counter from device to host
CUDA_CHECK(cudaMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          hipMemcpyDeviceToHost, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Check the dump counter
assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1));
// Query all the dumped keys from the cache
cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
// Copy result from device to host
CUDA_CHECK(cudaMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
                           h_dump_counter * embedding_vec_size * sizeof(float),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(h_vals_retrieved, d_vals_retrieved,
                          h_dump_counter * embedding_vec_size * sizeof(float),
                          hipMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(hipMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
                          hipMemcpyDeviceToHost, stream));
// Wait for stream to complete
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(hipStreamSynchronize(stream));
// Check result
assert(h_missing_len == 0);
compare_key(h_dump_keys, h_acc_keys, h_dump_counter);
...
...
@@ -679,27 +681,27 @@ int main(int argc, char** argv) {
printf("Update and Dump API test all finished!\n");
// 2nd test clean-up
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(hipStreamDestroy(stream));
free(h_keys);
free(h_vals);
free(h_new_vals);
CUDA_CHECK(cudaFreeHost(h_insert_keys));
CUDA_CHECK(cudaFreeHost(h_insert_vals));
CUDA_CHECK(cudaFreeHost(h_dump_keys));
CUDA_CHECK(cudaFreeHost(h_vals_retrieved));
CUDA_CHECK(cudaFreeHost(h_acc_keys));
CUDA_CHECK(cudaFree(d_keys));
CUDA_CHECK(cudaFree(d_vals));
CUDA_CHECK(cudaFree(d_insert_keys));
CUDA_CHECK(cudaFree(d_insert_vals));
CUDA_CHECK(cudaFree(d_dump_keys));
CUDA_CHECK(cudaFree(d_vals_retrieved));
CUDA_CHECK(cudaFree(d_dump_counter));
CUDA_CHECK(cudaFree(d_missing_index));
CUDA_CHECK(cudaFree(d_missing_keys));
CUDA_CHECK(cudaFree(d_missing_len));
CUDA_CHECK(hipHostFree(h_insert_keys));
CUDA_CHECK(hipHostFree(h_insert_vals));
CUDA_CHECK(hipHostFree(h_dump_keys));
CUDA_CHECK(hipHostFree(h_vals_retrieved));
CUDA_CHECK(hipHostFree(h_acc_keys));
CUDA_CHECK(hipFree(d_keys));
CUDA_CHECK(hipFree(d_vals));
CUDA_CHECK(hipFree(d_insert_keys));
CUDA_CHECK(hipFree(d_insert_vals));
CUDA_CHECK(hipFree(d_dump_keys));
CUDA_CHECK(hipFree(d_vals_retrieved));
CUDA_CHECK(hipFree(d_dump_counter));
CUDA_CHECK(hipFree(d_missing_index));
CUDA_CHECK(hipFree(d_missing_keys));
CUDA_CHECK(hipFree(d_missing_len));
delete cache;
...
...
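One rename in the test file above is not a plain cuda-to-hip prefix swap: pinned host memory moves from cudaHostAlloc/cudaHostAllocPortable and cudaFreeHost to hipHostMalloc/hipHostMallocPortable and hipHostFree. The snippet below is only a minimal sketch of that pattern, not part of the commit; the buffer name and size are illustrative.
// Minimal sketch (not part of the commit): pinned host memory with the HIP runtime.
#include <hip/hip_runtime.h>
#include <cstdio>
int main() {
  constexpr size_t query_length = 1024;  // illustrative size only
  float* h_vals = nullptr;
  // cudaHostAlloc(..., cudaHostAllocPortable) -> hipHostMalloc(..., hipHostMallocPortable)
  if (hipHostMalloc((void**)&h_vals, query_length * sizeof(float), hipHostMallocPortable) !=
      hipSuccess) {
    printf("pinned allocation failed\n");
    return 1;
  }
  h_vals[0] = 1.0f;  // pinned memory is directly usable on the host
  // cudaFreeHost(ptr) -> hipHostFree(ptr)
  hipHostFree(h_vals);
  return 0;
}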