Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
1d28bf8b
Commit
1d28bf8b
authored
Sep 23, 2024
by
sangwzh
Browse files
update third_party/HugeCTR/gpu_cache codes to hip
parent
f119ea7c
Changes
13
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
1049 additions
and
1029 deletions
+1049
-1029
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp
+6
-4
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
+6
-4
third_party/HugeCTR/gpu_cache/include/nv_util.h
third_party/HugeCTR/gpu_cache/include/nv_util.h
+13
-12
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp
+5
-3
third_party/HugeCTR/gpu_cache/include/static_table.hpp
third_party/HugeCTR/gpu_cache/include/static_table.hpp
+5
-3
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp
+176
-174
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt
+6
-7
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip
+76
-72
third_party/HugeCTR/gpu_cache/src/static_hash_table.hip
third_party/HugeCTR/gpu_cache/src/static_hash_table.hip
+23
-22
third_party/HugeCTR/gpu_cache/src/static_table.hip
third_party/HugeCTR/gpu_cache/src/static_table.hip
+6
-4
third_party/HugeCTR/gpu_cache/src/uvm_table.hip
third_party/HugeCTR/gpu_cache/src/uvm_table.hip
+607
-606
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt
+4
-4
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.hip
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.hip
+116
-114
No files found.
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
*
...
@@ -31,22 +33,22 @@ class gpu_cache_api {
...
@@ -31,22 +33,22 @@ class gpu_cache_api {
// Query API, i.e. A single read from the cache
// Query API, i.e. A single read from the cache
virtual
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
virtual
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
uint64_t
*
d_missing_index
,
key_type
*
d_missing_keys
,
size_t
*
d_missing_len
,
uint64_t
*
d_missing_index
,
key_type
*
d_missing_keys
,
size_t
*
d_missing_len
,
cuda
Stream_t
stream
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
virtual
void
Replace
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
virtual
void
Replace
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
// Update API, i.e. update the embeddings which exist in the cache
// Update API, i.e. update the embeddings which exist in the cache
virtual
void
Update
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
virtual
void
Update
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
// Dump API, i.e. dump some slabsets' keys from the cache
// Dump API, i.e. dump some slabsets' keys from the cache
virtual
void
Dump
(
key_type
*
d_keys
,
size_t
*
d_dump_counter
,
const
size_t
start_set_index
,
virtual
void
Dump
(
key_type
*
d_keys
,
size_t
*
d_dump_counter
,
const
size_t
start_set_index
,
const
size_t
end_set_index
,
cuda
Stream_t
stream
)
=
0
;
const
size_t
end_set_index
,
hip
Stream_t
stream
)
=
0
;
};
};
}
// namespace gpu_cache
}
// namespace gpu_cache
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -61,20 +63,20 @@ class gpu_cache : public gpu_cache_api<key_type> {
...
@@ -61,20 +63,20 @@ class gpu_cache : public gpu_cache_api<key_type> {
// Query API, i.e. A single read from the cache
// Query API, i.e. A single read from the cache
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
uint64_t
*
d_missing_index
,
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
uint64_t
*
d_missing_index
,
key_type
*
d_missing_keys
,
size_t
*
d_missing_len
,
cuda
Stream_t
stream
,
key_type
*
d_missing_keys
,
size_t
*
d_missing_len
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
void
Replace
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
,
void
Replace
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
// Update API, i.e. update the embeddings which exist in the cache
// Update API, i.e. update the embeddings which exist in the cache
void
Update
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
,
void
Update
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
// Dump API, i.e. dump some slabsets' keys from the cache
// Dump API, i.e. dump some slabsets' keys from the cache
void
Dump
(
key_type
*
d_keys
,
size_t
*
d_dump_counter
,
const
size_t
start_set_index
,
void
Dump
(
key_type
*
d_keys
,
size_t
*
d_dump_counter
,
const
size_t
start_set_index
,
const
size_t
end_set_index
,
cuda
Stream_t
stream
)
override
;
const
size_t
end_set_index
,
hip
Stream_t
stream
)
override
;
public:
public:
using
slabset
=
slab_set
<
set_associativity
,
key_type
,
warp_size
>
;
using
slabset
=
slab_set
<
set_associativity
,
key_type
,
warp_size
>
;
...
...
third_party/HugeCTR/gpu_cache/include/nv_util.h
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -15,7 +16,7 @@
...
@@ -15,7 +16,7 @@
*/
*/
#pragma once
#pragma once
#include <
cuda
_runtime_api.h>
#include <
hip/hip
_runtime_api.h>
#include <stdexcept>
#include <stdexcept>
#include <string>
#include <string>
...
@@ -30,17 +31,17 @@ class CudaException : public std::runtime_error {
...
@@ -30,17 +31,17 @@ class CudaException : public std::runtime_error {
CudaException
(
const
std
::
string
&
what
)
:
runtime_error
(
what
)
{}
CudaException
(
const
std
::
string
&
what
)
:
runtime_error
(
what
)
{}
};
};
inline
void
cuda_check_
(
cuda
Error_t
val
,
const
char
*
file
,
int
line
)
{
inline
void
cuda_check_
(
hip
Error_t
val
,
const
char
*
file
,
int
line
)
{
if
(
val
!=
cuda
Success
)
{
if
(
val
!=
hip
Success
)
{
throw
CudaException
(
std
::
string
(
file
)
+
":"
+
std
::
to_string
(
line
)
+
": CUDA error "
+
throw
CudaException
(
std
::
string
(
file
)
+
":"
+
std
::
to_string
(
line
)
+
": CUDA error "
+
std
::
to_string
(
val
)
+
": "
+
cuda
GetErrorString
(
val
));
std
::
to_string
(
val
)
+
": "
+
hip
GetErrorString
(
val
));
}
}
}
}
class
CudaDeviceRestorer
{
class
CudaDeviceRestorer
{
public:
public:
CudaDeviceRestorer
()
{
CUDA_CHECK
(
cuda
GetDevice
(
&
dev_
));
}
CudaDeviceRestorer
()
{
CUDA_CHECK
(
hip
GetDevice
(
&
dev_
));
}
~
CudaDeviceRestorer
()
{
CUDA_CHECK
(
cuda
SetDevice
(
dev_
));
}
~
CudaDeviceRestorer
()
{
CUDA_CHECK
(
hip
SetDevice
(
dev_
));
}
void
check_device
(
int
device
)
const
{
void
check_device
(
int
device
)
const
{
if
(
device
!=
dev_
)
{
if
(
device
!=
dev_
)
{
throw
std
::
runtime_error
(
throw
std
::
runtime_error
(
...
@@ -54,14 +55,14 @@ class CudaDeviceRestorer {
...
@@ -54,14 +55,14 @@ class CudaDeviceRestorer {
};
};
inline
int
get_dev
(
const
void
*
ptr
)
{
inline
int
get_dev
(
const
void
*
ptr
)
{
cuda
PointerAttribute
s
attr
;
hip
PointerAttribute
_t
attr
;
CUDA_CHECK
(
cuda
PointerGetAttributes
(
&
attr
,
ptr
));
CUDA_CHECK
(
hip
PointerGetAttributes
(
&
attr
,
ptr
));
int
dev
=
-
1
;
int
dev
=
-
1
;
#if
CUDA
RT_VERSION >= 10000
#if
DTK
RT_VERSION >= 10000
if
(
attr
.
type
==
cuda
MemoryTypeDevice
)
if
(
attr
.
type
==
hip
MemoryTypeDevice
)
#else
#else
if
(
attr
.
memoryType
==
cuda
MemoryTypeDevice
)
if
(
attr
.
memoryType
==
hip
MemoryTypeDevice
)
#endif
#endif
{
{
dev
=
attr
.
device
;
dev
=
attr
.
device
;
...
@@ -72,7 +73,7 @@ inline int get_dev(const void* ptr) {
...
@@ -72,7 +73,7 @@ inline int get_dev(const void* ptr) {
inline
void
switch_to_dev
(
const
void
*
ptr
)
{
inline
void
switch_to_dev
(
const
void
*
ptr
)
{
int
dev
=
get_dev
(
ptr
);
int
dev
=
get_dev
(
ptr
);
if
(
dev
>=
0
)
{
if
(
dev
>=
0
)
{
CUDA_CHECK
(
cuda
SetDevice
(
dev
));
CUDA_CHECK
(
hip
SetDevice
(
dev
));
}
}
}
}
...
...
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -50,17 +52,17 @@ class StaticHashTable {
...
@@ -50,17 +52,17 @@ class StaticHashTable {
return
keys_bytes
+
indices_bytes
+
values_bytes
;
return
keys_bytes
+
indices_bytes
+
values_bytes
;
}
}
void
clear
(
cuda
Stream_t
stream
=
0
);
void
clear
(
hip
Stream_t
stream
=
0
);
// Note:
// Note:
// 1. Please make sure the key to be inserted is not duplicated.
// 1. Please make sure the key to be inserted is not duplicated.
// 2. Please make sure the key to be inserted does not exist in the table.
// 2. Please make sure the key to be inserted does not exist in the table.
// 3. Please make sure (size() + num_keys) <= capacity().
// 3. Please make sure (size() + num_keys) <= capacity().
void
insert
(
const
key_type
*
keys
,
const
value_type
*
values
,
size_type
num_keys
,
void
insert
(
const
key_type
*
keys
,
const
value_type
*
values
,
size_type
num_keys
,
cuda
Stream_t
stream
=
0
);
hip
Stream_t
stream
=
0
);
void
lookup
(
const
key_type
*
keys
,
value_type
*
values
,
int
num_keys
,
value_type
default_value
=
0
,
void
lookup
(
const
key_type
*
keys
,
value_type
*
values
,
int
num_keys
,
value_type
default_value
=
0
,
cuda
Stream_t
stream
=
0
);
hip
Stream_t
stream
=
0
);
private:
private:
key_type
*
table_keys_
;
key_type
*
table_keys_
;
...
...
third_party/HugeCTR/gpu_cache/include/static_table.hpp
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -36,12 +38,12 @@ class static_table {
...
@@ -36,12 +38,12 @@ class static_table {
~
static_table
(){};
~
static_table
(){};
// Query API, i.e. A single read from the cache
// Query API, i.e. A single read from the cache
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
cuda
Stream_t
stream
);
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
hip
Stream_t
stream
);
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
void
Init
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
);
void
Init
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
hip
Stream_t
stream
);
void
Clear
(
cuda
Stream_t
stream
);
void
Clear
(
hip
Stream_t
stream
);
private:
private:
StaticHashTable
<
key_type
,
float
>
static_hash_table_
;
StaticHashTable
<
key_type
,
float
>
static_hash_table_
;
...
...
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp
View file @
1d28bf8b
/*
// !!! This is a file automatically generated by hipify!!!
* Copyright (c) 2023, NVIDIA CORPORATION.
#include "hip/hip_runtime.h"
*
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* Copyright (c) 2023, NVIDIA CORPORATION.
* you may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at
* Licensed under the Apache License, Version 2.0 (the "License");
*
* you may not use this file except in compliance with the License.
* http://www.apache.org/licenses/LICENSE-2.0
* You may obtain a copy of the License at
*
*
* Unless required by applicable law or agreed to in writing, software
* http://www.apache.org/licenses/LICENSE-2.0
* distributed under the License is distributed on an "AS IS" BASIS,
*
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* Unless required by applicable law or agreed to in writing, software
* See the License for the specific language governing permissions and
* distributed under the License is distributed on an "AS IS" BASIS,
* limitations under the License.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
* See the License for the specific language governing permissions and
#pragma once
* limitations under the License.
#include <nv_util.h>
*/
#pragma once
#include <thread>
#include <nv_util.h>
#include <unordered_map>
#include <vector>
#include <thread>
#include <unordered_map>
namespace
gpu_cache
{
#include <vector>
template
<
typename
key_type
,
typename
index_type
>
namespace
gpu_cache
{
class
HashBlock
{
public:
template
<
typename
key_type
,
typename
index_type
>
key_type
*
keys
;
class
HashBlock
{
size_t
num_sets
;
public:
size_t
capacity
;
key_type
*
keys
;
size_t
num_sets
;
HashBlock
(
size_t
expected_capacity
,
int
set_size
,
int
batch_size
);
size_t
capacity
;
~
HashBlock
();
void
add
(
const
key_type
*
new_keys
,
const
size_t
num_keys
,
key_type
*
missing_keys
,
HashBlock
(
size_t
expected_capacity
,
int
set_size
,
int
batch_size
);
int
*
num_missing_keys
,
cudaStream_t
stream
);
~
HashBlock
();
void
query
(
const
key_type
*
query_keys
,
const
size_t
num_keys
,
index_type
*
output_indices
,
void
add
(
const
key_type
*
new_keys
,
const
size_t
num_keys
,
key_type
*
missing_keys
,
key_type
*
missing_keys
,
int
*
missing_positions
,
int
*
num_missing_keys
,
int
*
num_missing_keys
,
hipStream_t
stream
);
cudaStream_t
stream
);
void
query
(
const
key_type
*
query_keys
,
const
size_t
num_keys
,
index_type
*
output_indices
,
void
query
(
const
key_type
*
query_keys
,
int
*
num_keys
,
index_type
*
output_indices
,
key_type
*
missing_keys
,
int
*
missing_positions
,
int
*
num_missing_keys
,
cudaStream_t
stream
);
hipStream_t
stream
);
void
clear
(
cudaStream_t
stream
);
void
query
(
const
key_type
*
query_keys
,
int
*
num_keys
,
index_type
*
output_indices
,
hipStream_t
stream
);
private:
void
clear
(
hipStream_t
stream
);
int
max_set_size_
;
int
batch_size_
;
private:
int
*
set_sizes_
;
int
max_set_size_
;
};
int
batch_size_
;
int
*
set_sizes_
;
template
<
typename
vec_type
>
};
class
H2HCopy
{
public:
template
<
typename
vec_type
>
H2HCopy
(
int
num_threads
)
:
num_threads_
(
num_threads
),
working_
(
num_threads
)
{
class
H2HCopy
{
for
(
int
i
=
0
;
i
<
num_threads_
;
i
++
)
{
public:
threads_
.
emplace_back
(
H2HCopy
(
int
num_threads
)
:
num_threads_
(
num_threads
),
working_
(
num_threads
)
{
[
&
](
int
idx
)
{
for
(
int
i
=
0
;
i
<
num_threads_
;
i
++
)
{
while
(
!
terminate_
)
{
threads_
.
emplace_back
(
if
(
working_
[
idx
].
load
(
std
::
memory_order_relaxed
))
{
[
&
](
int
idx
)
{
working_
[
idx
].
store
(
false
,
std
::
memory_order_relaxed
);
while
(
!
terminate_
)
{
if
(
num_keys_
==
0
)
continue
;
if
(
working_
[
idx
].
load
(
std
::
memory_order_relaxed
))
{
size_t
num_keys_this_thread
=
(
num_keys_
-
1
)
/
num_threads_
+
1
;
working_
[
idx
].
store
(
false
,
std
::
memory_order_relaxed
);
size_t
begin
=
idx
*
num_keys_this_thread
;
if
(
num_keys_
==
0
)
continue
;
if
(
idx
==
num_threads_
-
1
)
{
size_t
num_keys_this_thread
=
(
num_keys_
-
1
)
/
num_threads_
+
1
;
num_keys_this_thread
=
num_keys_
-
num_keys_this_thread
*
idx
;
size_t
begin
=
idx
*
num_keys_this_thread
;
}
if
(
idx
==
num_threads_
-
1
)
{
size_t
end
=
begin
+
num_keys_this_thread
;
num_keys_this_thread
=
num_keys_
-
num_keys_this_thread
*
idx
;
}
for
(
size_t
i
=
begin
;
i
<
end
;
i
++
)
{
size_t
end
=
begin
+
num_keys_this_thread
;
size_t
idx_vec
=
get_index_
(
i
);
if
(
idx_vec
==
std
::
numeric_limits
<
size_t
>::
max
())
{
for
(
size_t
i
=
begin
;
i
<
end
;
i
++
)
{
continue
;
size_t
idx_vec
=
get_index_
(
i
);
}
if
(
idx_vec
==
std
::
numeric_limits
<
size_t
>::
max
())
{
memcpy
(
dst_data_ptr_
+
i
*
vec_size_
,
src_data_ptr_
+
idx_vec
*
vec_size_
,
continue
;
sizeof
(
vec_type
)
*
vec_size_
);
}
}
memcpy
(
dst_data_ptr_
+
i
*
vec_size_
,
src_data_ptr_
+
idx_vec
*
vec_size_
,
num_finished_workers_
++
;
sizeof
(
vec_type
)
*
vec_size_
);
}
}
}
num_finished_workers_
++
;
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
microseconds
(
1
));
}
},
}
i
);
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
microseconds
(
1
));
}
},
};
i
);
}
void
copy
(
vec_type
*
dst_data_ptr
,
vec_type
*
src_data_ptr
,
size_t
num_keys
,
int
vec_size
,
};
std
::
function
<
size_t
(
size_t
)
>
get_index_func
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
submit_mutex_
);
void
copy
(
vec_type
*
dst_data_ptr
,
vec_type
*
src_data_ptr
,
size_t
num_keys
,
int
vec_size
,
dst_data_ptr_
=
dst_data_ptr
;
std
::
function
<
size_t
(
size_t
)
>
get_index_func
)
{
src_data_ptr_
=
src_data_ptr
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
submit_mutex_
);
get_index_
=
get_index_func
;
dst_data_ptr_
=
dst_data_ptr
;
num_keys_
=
num_keys
;
src_data_ptr_
=
src_data_ptr
;
vec_size_
=
vec_size
;
get_index_
=
get_index_func
;
num_finished_workers_
.
store
(
0
,
std
::
memory_order_acquire
);
num_keys_
=
num_keys
;
vec_size_
=
vec_size
;
for
(
auto
&
working
:
working_
)
{
num_finished_workers_
.
store
(
0
,
std
::
memory_order_acquire
);
working
.
store
(
true
,
std
::
memory_order_relaxed
);
}
for
(
auto
&
working
:
working_
)
{
working
.
store
(
true
,
std
::
memory_order_relaxed
);
while
(
num_finished_workers_
!=
num_threads_
)
{
}
continue
;
}
while
(
num_finished_workers_
!=
num_threads_
)
{
}
continue
;
}
~
H2HCopy
()
{
}
terminate_
=
true
;
for
(
auto
&
t
:
threads_
)
{
~
H2HCopy
()
{
t
.
join
();
terminate_
=
true
;
}
for
(
auto
&
t
:
threads_
)
{
}
t
.
join
();
}
private:
}
vec_type
*
src_data_ptr_
;
vec_type
*
dst_data_ptr_
;
private:
vec_type
*
src_data_ptr_
;
std
::
function
<
size_t
(
size_t
)
>
get_index_
;
vec_type
*
dst_data_ptr_
;
size_t
num_keys_
;
std
::
function
<
size_t
(
size_t
)
>
get_index_
;
int
vec_size_
;
size_t
num_keys_
;
std
::
mutex
submit_mutex_
;
int
vec_size_
;
const
int
num_threads_
;
std
::
vector
<
std
::
thread
>
threads_
;
std
::
mutex
submit_mutex_
;
std
::
vector
<
std
::
atomic
<
bool
>>
working_
;
const
int
num_threads_
;
volatile
bool
terminate_
{
false
};
std
::
vector
<
std
::
thread
>
threads_
;
std
::
atomic
<
int
>
num_finished_workers_
{
0
};
std
::
vector
<
std
::
atomic
<
bool
>>
working_
;
};
volatile
bool
terminate_
{
false
};
std
::
atomic
<
int
>
num_finished_workers_
{
0
};
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
=
float
>
};
class
UvmTable
{
public:
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
=
float
>
UvmTable
(
const
size_t
device_table_capacity
,
const
size_t
host_table_capacity
,
class
UvmTable
{
const
int
max_batch_size
,
const
int
vec_size
,
public:
const
vec_type
default_value
=
(
vec_type
)
0
);
UvmTable
(
const
size_t
device_table_capacity
,
const
size_t
host_table_capacity
,
~
UvmTable
();
const
int
max_batch_size
,
const
int
vec_size
,
void
query
(
const
key_type
*
d_keys
,
const
int
len
,
vec_type
*
d_vectors
,
cudaStream_t
stream
=
0
);
const
vec_type
default_value
=
(
vec_type
)
0
);
void
add
(
const
key_type
*
h_keys
,
const
vec_type
*
h_vectors
,
const
size_t
len
);
~
UvmTable
();
void
clear
(
cudaStream_t
stream
=
0
);
void
query
(
const
key_type
*
d_keys
,
const
int
len
,
vec_type
*
d_vectors
,
hipStream_t
stream
=
0
);
void
add
(
const
key_type
*
h_keys
,
const
vec_type
*
h_vectors
,
const
size_t
len
);
private:
void
clear
(
hipStream_t
stream
=
0
);
static
constexpr
int
num_buffers_
=
2
;
key_type
*
d_keys_buffer_
;
private:
vec_type
*
d_vectors_buffer_
;
static
constexpr
int
num_buffers_
=
2
;
vec_type
*
d_vectors_
;
key_type
*
d_keys_buffer_
;
vec_type
*
d_vectors_buffer_
;
index_type
*
d_output_indices_
;
vec_type
*
d_vectors_
;
index_type
*
d_output_host_indices_
;
index_type
*
h_output_host_indices_
;
index_type
*
d_output_indices_
;
index_type
*
d_output_host_indices_
;
key_type
*
d_missing_keys_
;
index_type
*
h_output_host_indices_
;
int
*
d_missing_positions_
;
int
*
d_missing_count_
;
key_type
*
d_missing_keys_
;
int
*
d_missing_positions_
;
std
::
vector
<
vec_type
>
h_vectors_
;
int
*
d_missing_count_
;
key_type
*
h_missing_keys_
;
std
::
vector
<
vec_type
>
h_vectors_
;
cudaStream_t
query_stream_
;
key_type
*
h_missing_keys_
;
cudaEvent_t
query_event_
;
hipStream_t
query_stream_
;
vec_type
*
h_cpy_buffers_
[
num_buffers_
];
hipEvent_t
query_event_
;
vec_type
*
d_cpy_buffers_
[
num_buffers_
];
cudaStream_t
cpy_streams_
[
num_buffers_
];
vec_type
*
h_cpy_buffers_
[
num_buffers_
];
cudaEvent_t
cpy_events_
[
num_buffers_
];
vec_type
*
d_cpy_buffers_
[
num_buffers_
];
hipStream_t
cpy_streams_
[
num_buffers_
];
std
::
unordered_map
<
key_type
,
index_type
>
h_final_missing_items_
;
hipEvent_t
cpy_events_
[
num_buffers_
];
int
max_batch_size_
;
std
::
unordered_map
<
key_type
,
index_type
>
h_final_missing_items_
;
int
vec_size_
;
size_t
num_set_
;
int
max_batch_size_
;
size_t
num_host_set_
;
int
vec_size_
;
size_t
table_capacity_
;
size_t
num_set_
;
std
::
vector
<
vec_type
>
default_vector_
;
size_t
num_host_set_
;
size_t
table_capacity_
;
HashBlock
<
key_type
,
index_type
>
device_table_
;
std
::
vector
<
vec_type
>
default_vector_
;
HashBlock
<
key_type
,
index_type
>
host_table_
;
};
HashBlock
<
key_type
,
index_type
>
device_table_
;
HashBlock
<
key_type
,
index_type
>
host_table_
;
};
}
// namespace gpu_cache
}
// namespace gpu_cache
\ No newline at end of file
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt
View file @
1d28bf8b
...
@@ -15,15 +15,14 @@
...
@@ -15,15 +15,14 @@
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
file
(
GLOB gpu_cache_src
file
(
GLOB gpu_cache_src
nv_gpu_cache.
cu
nv_gpu_cache.
hip
static_table.
cu
static_table.
hip
static_hash_table.
cu
static_hash_table.
hip
uvm_table.
cu
uvm_table.
hip
)
)
add_library
(
gpu_cache SHARED
${
gpu_cache_src
}
)
add_library
(
gpu_cache SHARED
${
gpu_cache_src
}
)
target_compile_features
(
gpu_cache PUBLIC cxx_std_11
)
target_compile_features
(
gpu_cache PUBLIC cxx_std_11
)
set_target_properties
(
gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
set_target_properties
(
gpu_cache PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON
)
set_target_properties
(
gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
# set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
set_target_properties
(
gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF
)
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.
cu
→
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.
hip
View file @
1d28bf8b
This diff is collapsed.
Click to expand it.
third_party/HugeCTR/gpu_cache/src/static_hash_table.
cu
→
third_party/HugeCTR/gpu_cache/src/static_hash_table.
hip
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -14,8 +15,8 @@
...
@@ -14,8 +15,8 @@
* limitations under the License.
* limitations under the License.
*/
*/
#include <cooperative_groups.h>
#include <
hip/hip_
cooperative_groups.h>
#include <
cuda
.h>
#include <
hip/hip_runtime
.h>
#include <stdint.h>
#include <stdint.h>
#include <stdio.h>
#include <stdio.h>
...
@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c
...
@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c
// otherwise return invalid_slot.
// otherwise return invalid_slot.
const size_type num_groups = capacity / group_size;
const size_type num_groups = capacity / group_size;
#if (
CUDA
_VERSION < 11060)
#if (
DTK
_VERSION < 11060)
unsigned long long num_threads_per_group = cg.size();
unsigned long long num_threads_per_group = cg.size();
#else
#else
unsigned long long num_threads_per_group = cg.num_threads();
unsigned long long num_threads_per_group = cg.num_threads();
...
@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c
...
@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c
const size_type num_groups = capacity / group_size;
const size_type num_groups = capacity / group_size;
#if (
CUDA
_VERSION < 11060)
#if (
DTK
_VERSION < 11060)
unsigned long long num_threads_per_group = cg.size();
unsigned long long num_threads_per_group = cg.size();
#else
#else
unsigned long long num_threads_per_group = cg.num_threads();
unsigned long long num_threads_per_group = cg.num_threads();
...
@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash
...
@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash
size_t align_m = 16;
size_t align_m = 16;
size_t num_keys = key_capacity_ + 1;
size_t num_keys = key_capacity_ + 1;
size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m;
size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m;
CUDA_CHECK
(
cuda
Malloc
(
&
table_keys_
,
sizeof
(
key_type
)
*
num_keys
));
CUDA_CHECK(
hip
Malloc(&table_keys_, sizeof(key_type) * num_keys));
CUDA_CHECK
(
cuda
Malloc
(
&
table_indices_
,
sizeof
(
size_type
)
*
num_keys
));
CUDA_CHECK(
hip
Malloc(&table_indices_, sizeof(size_type) * num_keys));
CUDA_CHECK
(
cuda
Malloc
(
&
table_values_
,
sizeof
(
value_type
)
*
num_values
));
CUDA_CHECK(
hip
Malloc(&table_values_, sizeof(value_type) * num_values));
// Initialize table_keys_
// Initialize table_keys_
CUDA_CHECK
(
cuda
Memset
(
table_keys_
,
0xff
,
sizeof
(
key_type
)
*
key_capacity_
));
CUDA_CHECK(
hip
Memset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
CUDA_CHECK
(
cuda
Memset
(
table_keys_
+
key_capacity_
,
0
,
sizeof
(
key_type
)));
CUDA_CHECK(
hip
Memset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
}
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert(
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert(
const
key_type
*
keys
,
const
value_type
*
values
,
size_type
num_keys
,
cuda
Stream_t
stream
)
{
const key_type *keys, const value_type *values, size_type num_keys,
hip
Stream_t stream) {
if (num_keys == 0) {
if (num_keys == 0) {
return;
return;
}
}
...
@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
...
@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
// Insert keys
// Insert keys
constexpr int block = 256;
constexpr int block = 256;
int grid = (num_keys - 1) / block + 1;
int grid = (num_keys - 1) / block + 1;
InsertKeyKernel
<
tile_size
,
group_size
>
hipLaunchKernelGGL((
InsertKeyKernel<tile_size, group_size>
)
<<<
grid
,
block
,
0
,
stream
>>>
(
table_keys_
,
table_indices_
,
key_capacity_
,
keys
,
num_keys
,
, dim3(
grid
)
,
dim3(
block
)
, 0, stream
,
table_keys_, table_indices_, key_capacity_, keys, num_keys,
size_, hash_, empty_key, invalid_slot);
size_, hash_, empty_key, invalid_slot);
// Copy values
// Copy values
CUDA_CHECK
(
cuda
MemcpyAsync
(
table_values_
+
size_
*
value_dim_
,
values
,
CUDA_CHECK(
hip
MemcpyAsync(table_values_ + size_ * value_dim_, values,
sizeof
(
value_type
)
*
num_keys
*
value_dim_
,
cuda
MemcpyDeviceToDevice
,
sizeof(value_type) * num_keys * value_dim_,
hip
MemcpyDeviceToDevice,
stream));
stream));
size_ += num_keys;
size_ += num_keys;
}
}
...
@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
...
@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear(
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear(
cuda
Stream_t
stream
)
{
hip
Stream_t stream) {
CUDA_CHECK
(
cuda
MemsetAsync
(
table_keys_
,
0xff
,
sizeof
(
key_type
)
*
key_capacity_
,
stream
));
CUDA_CHECK(
hip
MemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
CUDA_CHECK
(
cuda
MemsetAsync
(
table_keys_
+
key_capacity_
,
0
,
sizeof
(
key_type
),
stream
));
CUDA_CHECK(
hip
MemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
size_ = 0;
size_ = 0;
}
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
typename hasher>
StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() {
StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() {
CUDA_CHECK
(
cuda
Free
(
table_keys_
));
CUDA_CHECK(
hip
Free(table_keys_));
CUDA_CHECK
(
cuda
Free
(
table_indices_
));
CUDA_CHECK(
hip
Free(table_indices_));
CUDA_CHECK
(
cuda
Free
(
table_values_
));
CUDA_CHECK(
hip
Free(table_values_));
}
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup(
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup(
const key_type *keys, value_type *values, int num_keys, value_type default_value,
const key_type *keys, value_type *values, int num_keys, value_type default_value,
cuda
Stream_t
stream
)
{
hip
Stream_t stream) {
if (num_keys == 0) {
if (num_keys == 0) {
return;
return;
}
}
...
@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku
...
@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku
constexpr int block = 256;
constexpr int block = 256;
const int grid = (num_keys - 1) / block + 1;
const int grid = (num_keys - 1) / block + 1;
// Lookup keys
// Lookup keys
LookupKernel
<
tile_size
,
group_size
>
<<<
grid
,
block
,
0
,
stream
>>>
(
hipLaunchKernelGGL((
LookupKernel<tile_size, group_size>
), dim3(
grid
)
,
dim3(
block
)
, 0, stream
,
table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values,
table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values,
hash_, empty_key, default_value, invalid_slot);
hash_, empty_key, default_value, invalid_slot);
}
}
...
...
third_party/HugeCTR/gpu_cache/src/static_table.
cu
→
third_party/HugeCTR/gpu_cache/src/static_table.
hip
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -14,7 +16,7 @@
...
@@ -14,7 +16,7 @@
* limitations under the License.
* limitations under the License.
*/
*/
#include <cooperative_groups.h>
#include <
hip/hip_
cooperative_groups.h>
#include <nv_util.h>
#include <nv_util.h>
#include <iostream>
#include <iostream>
...
@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed
...
@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed
template <typename key_type>
template <typename key_type>
void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values,
void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values,
cuda
Stream_t
stream
)
{
hip
Stream_t stream) {
static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream);
static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream);
}
}
template <typename key_type>
template <typename key_type>
void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values,
void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values,
cuda
Stream_t
stream
)
{
hip
Stream_t stream) {
static_hash_table_.insert(d_keys, d_values, len, stream);
static_hash_table_.insert(d_keys, d_values, len, stream);
}
}
template <typename key_type>
template <typename key_type>
void
static_table
<
key_type
>::
Clear
(
cuda
Stream_t
stream
)
{
void static_table<key_type>::Clear(
hip
Stream_t stream) {
static_hash_table_.clear(stream);
static_hash_table_.clear(stream);
}
}
...
...
third_party/HugeCTR/gpu_cache/src/uvm_table.
cu
→
third_party/HugeCTR/gpu_cache/src/uvm_table.
hip
View file @
1d28bf8b
This diff is collapsed.
Click to expand it.
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt
View file @
1d28bf8b
...
@@ -15,14 +15,14 @@
...
@@ -15,14 +15,14 @@
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
file
(
GLOB gpu_cache_test_src
file
(
GLOB gpu_cache_test_src
cache_op_sol_test.
cu
cache_op_sol_test.
hip
../../HugeCTR/src/hps/embedding_cache_gpu.
cu
../../HugeCTR/src/hps/embedding_cache_gpu.
hip
)
)
add_executable
(
cache_op_sol_test
${
gpu_cache_test_src
}
)
add_executable
(
cache_op_sol_test
${
gpu_cache_test_src
}
)
target_compile_features
(
cache_op_sol_test PUBLIC cxx_std_17
)
target_compile_features
(
cache_op_sol_test PUBLIC cxx_std_17
)
target_link_libraries
(
cache_op_sol_test PUBLIC gpu_cache
)
target_link_libraries
(
cache_op_sol_test PUBLIC gpu_cache
)
target_link_libraries
(
cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX
)
target_link_libraries
(
cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX
)
set_target_properties
(
cache_op_sol_test PROPERTIES
CUDA
_RESOLVE_DEVICE_SYMBOLS ON
)
set_target_properties
(
cache_op_sol_test PROPERTIES
HIP
_RESOLVE_DEVICE_SYMBOLS
ON
)
set_target_properties
(
cache_op_sol_test PROPERTIES
CUDA
_ARCHITECTURES OFF
)
set_target_properties
(
cache_op_sol_test PROPERTIES
HIP
_ARCHITECTURES OFF
)
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.
cu
→
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.
hip
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_
...
@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_
template <typename T>
template <typename T>
bool is_near(T a, T b) {
bool is_near(T a, T b) {
double diff = abs(a - b);
double diff = abs(a - b);
bool
ret
=
diff
<=
std
::
min
(
a
,
b
)
*
1e-6
;
bool ret = diff <= ::min(a, b) * 1e-6;
if (!ret) {
if (!ret) {
std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl;
std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl;
}
}
...
@@ -224,7 +226,7 @@ int main(int argc, char** argv) {
...
@@ -224,7 +226,7 @@ int main(int argc, char** argv) {
const size_t cache_type = atoi(argv[7]);
const size_t cache_type = atoi(argv[7]);
// Since cache is designed for single-gpu, all threads just use GPU 0
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK
(
cuda
SetDevice
(
0
));
CUDA_CHECK(
hip
SetDevice(0));
// Host side buffers shared between threads
// Host side buffers shared between threads
key_type* h_keys; // Buffer holding all keys in embedding table
key_type* h_keys; // Buffer holding all keys in embedding table
...
@@ -302,7 +304,7 @@ int main(int argc, char** argv) {
...
@@ -302,7 +304,7 @@ int main(int argc, char** argv) {
int thread_id = omp_get_thread_num();
int thread_id = omp_get_thread_num();
printf("Worker %d starts testing cache.\n", thread_id);
printf("Worker %d starts testing cache.\n", thread_id);
// Since cache is designed for single-gpu, all threads just use GPU 0
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK
(
cuda
SetDevice
(
0
));
CUDA_CHECK(
hip
SetDevice(0));
// Thread-private host side buffers
// Thread-private host side buffers
size_t* h_query_keys_index; // Buffer holding index for keys to be queried
size_t* h_query_keys_index; // Buffer holding index for keys to be queried
...
@@ -324,32 +326,32 @@ int main(int argc, char** argv) {
...
@@ -324,32 +326,32 @@ int main(int argc, char** argv) {
// host-only buffers placed in normal host memory
// host-only buffers placed in normal host memory
h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t));
h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t));
// host-device interactive buffers placed in pinned memory
// host-device interactive buffers placed in pinned memory
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_query_keys
,
query_length
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_query_keys, query_length * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_vals_retrieved
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_missing_keys
,
query_length
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_missing_keys, query_length * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_missing_vals
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_missing_vals,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_missing_index
,
query_length
*
sizeof
(
uint64_t
),
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
// Allocate device side buffers
// Allocate device side buffers
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_query_keys
,
query_length
*
sizeof
(
key_type
)));
CUDA_CHECK(
hip
Malloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK(
CUDA_CHECK(
cuda
Malloc
((
void
**
)
&
d_vals_retrieved
,
query_length
*
embedding_vec_size
*
sizeof
(
float
)));
hip
Malloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_keys
,
query_length
*
sizeof
(
key_type
)));
CUDA_CHECK(
hip
Malloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
CUDA_CHECK(
CUDA_CHECK(
cuda
Malloc
((
void
**
)
&
d_missing_vals
,
query_length
*
embedding_vec_size
*
sizeof
(
float
)));
hip
Malloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_index
,
query_length
*
sizeof
(
uint64_t
)));
CUDA_CHECK(
hip
Malloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_len
,
sizeof
(
size_t
)));
CUDA_CHECK(
hip
Malloc((void**)&d_missing_len, sizeof(size_t)));
// Thread-private CUDA stream, all threads just use the #0 device
// Thread-private CUDA stream, all threads just use the #0 device
cuda
Stream_t
stream
;
hip
Stream_t stream;
CUDA_CHECK
(
cuda
StreamCreate
(
&
stream
));
CUDA_CHECK(
hip
StreamCreate(&stream));
// Timing variables
// Timing variables
double time_1;
double time_1;
...
@@ -382,33 +384,33 @@ int main(int argc, char** argv) {
...
@@ -382,33 +384,33 @@ int main(int argc, char** argv) {
std::cout << std::endl;
std::cout << std::endl;
// Copy the keys to GPU memory
// Copy the keys to GPU memory
CUDA_CHECK
(
cuda
MemcpyAsync
(
d_query_keys
,
h_query_keys
,
query_length
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
MemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Record time
// Record time
time_1 = W_time();
time_1 = W_time();
// Get pairs from hashtable
// Get pairs from hashtable
cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys,
cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
d_missing_len, stream);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Elapsed wall time
// Elapsed wall time
time_2 = W_time() - time_1;
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n",
printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n",
thread_id, i, time_2);
thread_id, i, time_2);
// Copy the data back to host
// Copy the data back to host
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_vals_retrieved
,
d_vals_retrieved
,
CUDA_CHECK(
hip
MemcpyAsync(h_vals_retrieved, d_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_missing_index
,
d_missing_index
,
query_length
*
sizeof
(
uint64_t
),
CUDA_CHECK(
hip
MemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_missing_keys
,
d_missing_keys
,
query_length
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
MemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
&
h_missing_len
,
d_missing_len
,
sizeof
(
size_t
),
CUDA_CHECK(
hip
MemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i,
printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i,
h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f));
h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f));
...
@@ -433,13 +435,13 @@ int main(int argc, char** argv) {
...
@@ -433,13 +435,13 @@ int main(int argc, char** argv) {
thread_id, i, time_2);
thread_id, i, time_2);
// Copy the missing value to device
// Copy the missing value to device
CUDA_CHECK
(
cuda
MemcpyAsync
(
d_missing_vals
,
h_missing_vals
,
CUDA_CHECK(
hip
MemcpyAsync(d_missing_vals, h_missing_vals,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
d_vals_retrieved
,
h_vals_retrieved
,
CUDA_CHECK(
hip
MemcpyAsync(d_vals_retrieved, h_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Record time
// Record time
time_1 = W_time();
time_1 = W_time();
...
@@ -449,7 +451,7 @@ int main(int argc, char** argv) {
...
@@ -449,7 +451,7 @@ int main(int argc, char** argv) {
else
else
cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream);
cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Elapsed wall time
// Elapsed wall time
time_2 = W_time() - time_1;
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n",
printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n",
...
@@ -466,20 +468,20 @@ int main(int argc, char** argv) {
...
@@ -466,20 +468,20 @@ int main(int argc, char** argv) {
printf("Worker %d : All Finished!\n", thread_id);
printf("Worker %d : All Finished!\n", thread_id);
// Clean-up
// Clean-up
cuda
StreamDestroy
(
stream
);
hip
StreamDestroy(stream);
free(h_query_keys_index);
free(h_query_keys_index);
CUDA_CHECK
(
cudaFreeHost
(
h_query_keys
));
CUDA_CHECK(
hipHostFree
(h_query_keys));
CUDA_CHECK
(
cudaFreeHost
(
h_vals_retrieved
));
CUDA_CHECK(
hipHostFree
(h_vals_retrieved));
CUDA_CHECK
(
cudaFreeHost
(
h_missing_keys
));
CUDA_CHECK(
hipHostFree
(h_missing_keys));
CUDA_CHECK
(
cudaFreeHost
(
h_missing_vals
));
CUDA_CHECK(
hipHostFree
(h_missing_vals));
CUDA_CHECK
(
cudaFreeHost
(
h_missing_index
));
CUDA_CHECK(
hipHostFree
(h_missing_index));
CUDA_CHECK
(
cuda
Free
(
d_query_keys
));
CUDA_CHECK(
hip
Free(d_query_keys));
CUDA_CHECK
(
cuda
Free
(
d_vals_retrieved
));
CUDA_CHECK(
hip
Free(d_vals_retrieved));
CUDA_CHECK
(
cuda
Free
(
d_missing_keys
));
CUDA_CHECK(
hip
Free(d_missing_keys));
CUDA_CHECK
(
cuda
Free
(
d_missing_vals
));
CUDA_CHECK(
hip
Free(d_missing_vals));
CUDA_CHECK
(
cuda
Free
(
d_missing_index
));
CUDA_CHECK(
hip
Free(d_missing_index));
CUDA_CHECK
(
cuda
Free
(
d_missing_len
));
CUDA_CHECK(
hip
Free(d_missing_len));
}
}
// 1st test Clean-up
// 1st test Clean-up
...
@@ -547,57 +549,57 @@ int main(int argc, char** argv) {
...
@@ -547,57 +549,57 @@ int main(int argc, char** argv) {
key_type* d_missing_keys;
key_type* d_missing_keys;
size_t* d_missing_len;
size_t* d_missing_len;
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_insert_keys
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_insert_keys,
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_insert_vals
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_dump_keys
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_dump_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
(
CUDA_CHECK(
hip
Host
Ma
lloc(
(void**)&h_vals_retrieved,
(void**)&h_vals_retrieved,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_acc_keys
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_acc_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_keys
,
CUDA_CHECK(
hip
Malloc((void**)&d_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_vals
,
SLAB_SIZE
*
SET_ASSOCIATIVITY
*
cache_capacity_in_set
*
CUDA_CHECK(
hip
Malloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
embedding_vec_size * sizeof(float)));
embedding_vec_size * sizeof(float)));
CUDA_CHECK(
CUDA_CHECK(
cuda
Malloc
((
void
**
)
&
d_insert_keys
,
SLAB_SIZE
*
cache_capacity_in_set
*
sizeof
(
key_type
)));
hip
Malloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_insert_vals
,
CUDA_CHECK(
hip
Malloc((void**)&d_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_dump_keys
,
CUDA_CHECK(
hip
Malloc((void**)&d_dump_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK
(
cuda
Malloc
(
CUDA_CHECK(
hip
Malloc(
(void**)&d_vals_retrieved,
(void**)&d_vals_retrieved,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_dump_counter
,
sizeof
(
size_t
)));
CUDA_CHECK(
hip
Malloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_index
,
CUDA_CHECK(
hip
Malloc((void**)&d_missing_index,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_keys
,
CUDA_CHECK(
hip
Malloc((void**)&d_missing_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_len
,
sizeof
(
size_t
)));
CUDA_CHECK(
hip
Malloc((void**)&d_missing_len, sizeof(size_t)));
// CUDA stream
// CUDA stream
cuda
Stream_t
stream
;
hip
Stream_t stream;
CUDA_CHECK
(
cuda
StreamCreate
(
&
stream
));
CUDA_CHECK(
hip
StreamCreate(&stream));
// Copy all keys and values from host to device
// Copy all keys and values from host to device
CUDA_CHECK
(
cuda
MemcpyAsync
(
CUDA_CHECK(
hip
MemcpyAsync(
d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
CUDA_CHECK(
hip
MemcpyAsync(
d_vals, h_new_vals,
d_vals, h_new_vals,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Each time insert 1 slab per slabset into the cache and check result
// Each time insert 1 slab per slabset into the cache and check result
for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) {
for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) {
...
@@ -615,17 +617,17 @@ int main(int argc, char** argv) {
...
@@ -615,17 +617,17 @@ int main(int argc, char** argv) {
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type));
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type));
// Copy the <k,v> pairs from host to device
// Copy the <k,v> pairs from host to device
CUDA_CHECK
(
cuda
MemcpyAsync
(
d_insert_keys
,
h_insert_keys
,
CUDA_CHECK(
hip
MemcpyAsync(d_insert_keys, h_insert_keys,
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
CUDA_CHECK(
CUDA_CHECK(
cuda
MemcpyAsync
(
d_insert_vals
,
h_insert_vals
,
hip
MemcpyAsync(d_insert_vals, h_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
// Insert the <k,v> pairs into the cache
// Insert the <k,v> pairs into the cache
cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream);
cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Record time
// Record time
time_a = W_time();
time_a = W_time();
...
@@ -633,7 +635,7 @@ int main(int argc, char** argv) {
...
@@ -633,7 +635,7 @@ int main(int argc, char** argv) {
cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream,
cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream,
SLAB_SIZE);
SLAB_SIZE);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Elapsed wall time
// Elapsed wall time
time_b = W_time() - time_a;
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b);
printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b);
...
@@ -644,31 +646,31 @@ int main(int argc, char** argv) {
...
@@ -644,31 +646,31 @@ int main(int argc, char** argv) {
// Dump the keys from the cache
// Dump the keys from the cache
cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Elapsed wall time
// Elapsed wall time
time_b = W_time() - time_a;
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b);
printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b);
// Copy the dump counter from device to host
// Copy the dump counter from device to host
CUDA_CHECK
(
cuda
MemcpyAsync
(
&
h_dump_counter
,
d_dump_counter
,
sizeof
(
size_t
),
CUDA_CHECK(
hip
MemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Check the dump counter
// Check the dump counter
assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1));
assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1));
// Query all the dumped keys from the cache
// Query all the dumped keys from the cache
cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys,
cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
d_missing_len, stream);
// Copy result from device to host
// Copy result from device to host
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_dump_keys
,
d_dump_keys
,
h_dump_counter
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
MemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_vals_retrieved
,
d_vals_retrieved
,
CUDA_CHECK(
hip
MemcpyAsync(h_vals_retrieved, d_vals_retrieved,
h_dump_counter * embedding_vec_size * sizeof(float),
h_dump_counter * embedding_vec_size * sizeof(float),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
&
h_missing_len
,
d_missing_len
,
sizeof
(
size_t
),
CUDA_CHECK(
hip
MemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Check result
// Check result
assert(h_missing_len == 0);
assert(h_missing_len == 0);
compare_key(h_dump_keys, h_acc_keys, h_dump_counter);
compare_key(h_dump_keys, h_acc_keys, h_dump_counter);
...
@@ -679,27 +681,27 @@ int main(int argc, char** argv) {
...
@@ -679,27 +681,27 @@ int main(int argc, char** argv) {
printf("Update and Dump API test all finished!\n");
printf("Update and Dump API test all finished!\n");
// 2nd test clean-up
// 2nd test clean-up
CUDA_CHECK
(
cuda
StreamDestroy
(
stream
));
CUDA_CHECK(
hip
StreamDestroy(stream));
free(h_keys);
free(h_keys);
free(h_vals);
free(h_vals);
free(h_new_vals);
free(h_new_vals);
CUDA_CHECK
(
cudaFreeHost
(
h_insert_keys
));
CUDA_CHECK(
hipHostFree
(h_insert_keys));
CUDA_CHECK
(
cudaFreeHost
(
h_insert_vals
));
CUDA_CHECK(
hipHostFree
(h_insert_vals));
CUDA_CHECK
(
cudaFreeHost
(
h_dump_keys
));
CUDA_CHECK(
hipHostFree
(h_dump_keys));
CUDA_CHECK
(
cudaFreeHost
(
h_vals_retrieved
));
CUDA_CHECK(
hipHostFree
(h_vals_retrieved));
CUDA_CHECK
(
cudaFreeHost
(
h_acc_keys
));
CUDA_CHECK(
hipHostFree
(h_acc_keys));
CUDA_CHECK
(
cuda
Free
(
d_keys
));
CUDA_CHECK(
hip
Free(d_keys));
CUDA_CHECK
(
cuda
Free
(
d_vals
));
CUDA_CHECK(
hip
Free(d_vals));
CUDA_CHECK
(
cuda
Free
(
d_insert_keys
));
CUDA_CHECK(
hip
Free(d_insert_keys));
CUDA_CHECK
(
cuda
Free
(
d_insert_vals
));
CUDA_CHECK(
hip
Free(d_insert_vals));
CUDA_CHECK
(
cuda
Free
(
d_dump_keys
));
CUDA_CHECK(
hip
Free(d_dump_keys));
CUDA_CHECK
(
cuda
Free
(
d_vals_retrieved
));
CUDA_CHECK(
hip
Free(d_vals_retrieved));
CUDA_CHECK
(
cuda
Free
(
d_dump_counter
));
CUDA_CHECK(
hip
Free(d_dump_counter));
CUDA_CHECK
(
cuda
Free
(
d_missing_index
));
CUDA_CHECK(
hip
Free(d_missing_index));
CUDA_CHECK
(
cuda
Free
(
d_missing_keys
));
CUDA_CHECK(
hip
Free(d_missing_keys));
CUDA_CHECK
(
cuda
Free
(
d_missing_len
));
CUDA_CHECK(
hip
Free(d_missing_len));
delete cache;
delete cache;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment