Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
1d28bf8b
You need to sign in or sign up before continuing.
Commit
1d28bf8b
authored
Sep 23, 2024
by
sangwzh
Browse files
update third_party/HugeCTR/gpu_cache codes to hip
parent
f119ea7c
Changes
13
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
1049 additions
and
1029 deletions
+1049
-1029
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp
+6
-4
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
+6
-4
third_party/HugeCTR/gpu_cache/include/nv_util.h
third_party/HugeCTR/gpu_cache/include/nv_util.h
+13
-12
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp
+5
-3
third_party/HugeCTR/gpu_cache/include/static_table.hpp
third_party/HugeCTR/gpu_cache/include/static_table.hpp
+5
-3
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp
+176
-174
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt
+6
-7
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.hip
+76
-72
third_party/HugeCTR/gpu_cache/src/static_hash_table.hip
third_party/HugeCTR/gpu_cache/src/static_hash_table.hip
+23
-22
third_party/HugeCTR/gpu_cache/src/static_table.hip
third_party/HugeCTR/gpu_cache/src/static_table.hip
+6
-4
third_party/HugeCTR/gpu_cache/src/uvm_table.hip
third_party/HugeCTR/gpu_cache/src/uvm_table.hip
+607
-606
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt
+4
-4
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.hip
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.hip
+116
-114
No files found.
third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
*
...
@@ -31,22 +33,22 @@ class gpu_cache_api {
...
@@ -31,22 +33,22 @@ class gpu_cache_api {
// Query API, i.e. A single read from the cache
// Query API, i.e. A single read from the cache
virtual
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
virtual
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
uint64_t
*
d_missing_index
,
key_type
*
d_missing_keys
,
size_t
*
d_missing_len
,
uint64_t
*
d_missing_index
,
key_type
*
d_missing_keys
,
size_t
*
d_missing_len
,
cuda
Stream_t
stream
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
virtual
void
Replace
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
virtual
void
Replace
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
// Update API, i.e. update the embeddings which exist in the cache
// Update API, i.e. update the embeddings which exist in the cache
virtual
void
Update
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
virtual
void
Update
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
=
0
;
// Dump API, i.e. dump some slabsets' keys from the cache
// Dump API, i.e. dump some slabsets' keys from the cache
virtual
void
Dump
(
key_type
*
d_keys
,
size_t
*
d_dump_counter
,
const
size_t
start_set_index
,
virtual
void
Dump
(
key_type
*
d_keys
,
size_t
*
d_dump_counter
,
const
size_t
start_set_index
,
const
size_t
end_set_index
,
cuda
Stream_t
stream
)
=
0
;
const
size_t
end_set_index
,
hip
Stream_t
stream
)
=
0
;
};
};
}
// namespace gpu_cache
}
// namespace gpu_cache
third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -61,20 +63,20 @@ class gpu_cache : public gpu_cache_api<key_type> {
...
@@ -61,20 +63,20 @@ class gpu_cache : public gpu_cache_api<key_type> {
// Query API, i.e. A single read from the cache
// Query API, i.e. A single read from the cache
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
uint64_t
*
d_missing_index
,
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
uint64_t
*
d_missing_index
,
key_type
*
d_missing_keys
,
size_t
*
d_missing_len
,
cuda
Stream_t
stream
,
key_type
*
d_missing_keys
,
size_t
*
d_missing_len
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
void
Replace
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
,
void
Replace
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
// Update API, i.e. update the embeddings which exist in the cache
// Update API, i.e. update the embeddings which exist in the cache
void
Update
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
,
void
Update
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
hip
Stream_t
stream
,
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
const
size_t
task_per_warp_tile
=
TASK_PER_WARP_TILE_MACRO
)
override
;
// Dump API, i.e. dump some slabsets' keys from the cache
// Dump API, i.e. dump some slabsets' keys from the cache
void
Dump
(
key_type
*
d_keys
,
size_t
*
d_dump_counter
,
const
size_t
start_set_index
,
void
Dump
(
key_type
*
d_keys
,
size_t
*
d_dump_counter
,
const
size_t
start_set_index
,
const
size_t
end_set_index
,
cuda
Stream_t
stream
)
override
;
const
size_t
end_set_index
,
hip
Stream_t
stream
)
override
;
public:
public:
using
slabset
=
slab_set
<
set_associativity
,
key_type
,
warp_size
>
;
using
slabset
=
slab_set
<
set_associativity
,
key_type
,
warp_size
>
;
...
...
third_party/HugeCTR/gpu_cache/include/nv_util.h
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -15,7 +16,7 @@
...
@@ -15,7 +16,7 @@
*/
*/
#pragma once
#pragma once
#include <
cuda
_runtime_api.h>
#include <
hip/hip
_runtime_api.h>
#include <stdexcept>
#include <stdexcept>
#include <string>
#include <string>
...
@@ -30,17 +31,17 @@ class CudaException : public std::runtime_error {
...
@@ -30,17 +31,17 @@ class CudaException : public std::runtime_error {
CudaException
(
const
std
::
string
&
what
)
:
runtime_error
(
what
)
{}
CudaException
(
const
std
::
string
&
what
)
:
runtime_error
(
what
)
{}
};
};
inline
void
cuda_check_
(
cuda
Error_t
val
,
const
char
*
file
,
int
line
)
{
inline
void
cuda_check_
(
hip
Error_t
val
,
const
char
*
file
,
int
line
)
{
if
(
val
!=
cuda
Success
)
{
if
(
val
!=
hip
Success
)
{
throw
CudaException
(
std
::
string
(
file
)
+
":"
+
std
::
to_string
(
line
)
+
": CUDA error "
+
throw
CudaException
(
std
::
string
(
file
)
+
":"
+
std
::
to_string
(
line
)
+
": CUDA error "
+
std
::
to_string
(
val
)
+
": "
+
cuda
GetErrorString
(
val
));
std
::
to_string
(
val
)
+
": "
+
hip
GetErrorString
(
val
));
}
}
}
}
class
CudaDeviceRestorer
{
class
CudaDeviceRestorer
{
public:
public:
CudaDeviceRestorer
()
{
CUDA_CHECK
(
cuda
GetDevice
(
&
dev_
));
}
CudaDeviceRestorer
()
{
CUDA_CHECK
(
hip
GetDevice
(
&
dev_
));
}
~
CudaDeviceRestorer
()
{
CUDA_CHECK
(
cuda
SetDevice
(
dev_
));
}
~
CudaDeviceRestorer
()
{
CUDA_CHECK
(
hip
SetDevice
(
dev_
));
}
void
check_device
(
int
device
)
const
{
void
check_device
(
int
device
)
const
{
if
(
device
!=
dev_
)
{
if
(
device
!=
dev_
)
{
throw
std
::
runtime_error
(
throw
std
::
runtime_error
(
...
@@ -54,14 +55,14 @@ class CudaDeviceRestorer {
...
@@ -54,14 +55,14 @@ class CudaDeviceRestorer {
};
};
inline
int
get_dev
(
const
void
*
ptr
)
{
inline
int
get_dev
(
const
void
*
ptr
)
{
cuda
PointerAttribute
s
attr
;
hip
PointerAttribute
_t
attr
;
CUDA_CHECK
(
cuda
PointerGetAttributes
(
&
attr
,
ptr
));
CUDA_CHECK
(
hip
PointerGetAttributes
(
&
attr
,
ptr
));
int
dev
=
-
1
;
int
dev
=
-
1
;
#if
CUDA
RT_VERSION >= 10000
#if
DTK
RT_VERSION >= 10000
if
(
attr
.
type
==
cuda
MemoryTypeDevice
)
if
(
attr
.
type
==
hip
MemoryTypeDevice
)
#else
#else
if
(
attr
.
memoryType
==
cuda
MemoryTypeDevice
)
if
(
attr
.
memoryType
==
hip
MemoryTypeDevice
)
#endif
#endif
{
{
dev
=
attr
.
device
;
dev
=
attr
.
device
;
...
@@ -72,7 +73,7 @@ inline int get_dev(const void* ptr) {
...
@@ -72,7 +73,7 @@ inline int get_dev(const void* ptr) {
inline
void
switch_to_dev
(
const
void
*
ptr
)
{
inline
void
switch_to_dev
(
const
void
*
ptr
)
{
int
dev
=
get_dev
(
ptr
);
int
dev
=
get_dev
(
ptr
);
if
(
dev
>=
0
)
{
if
(
dev
>=
0
)
{
CUDA_CHECK
(
cuda
SetDevice
(
dev
));
CUDA_CHECK
(
hip
SetDevice
(
dev
));
}
}
}
}
...
...
third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -50,17 +52,17 @@ class StaticHashTable {
...
@@ -50,17 +52,17 @@ class StaticHashTable {
return
keys_bytes
+
indices_bytes
+
values_bytes
;
return
keys_bytes
+
indices_bytes
+
values_bytes
;
}
}
void
clear
(
cuda
Stream_t
stream
=
0
);
void
clear
(
hip
Stream_t
stream
=
0
);
// Note:
// Note:
// 1. Please make sure the key to be inserted is not duplicated.
// 1. Please make sure the key to be inserted is not duplicated.
// 2. Please make sure the key to be inserted does not exist in the table.
// 2. Please make sure the key to be inserted does not exist in the table.
// 3. Please make sure (size() + num_keys) <= capacity().
// 3. Please make sure (size() + num_keys) <= capacity().
void
insert
(
const
key_type
*
keys
,
const
value_type
*
values
,
size_type
num_keys
,
void
insert
(
const
key_type
*
keys
,
const
value_type
*
values
,
size_type
num_keys
,
cuda
Stream_t
stream
=
0
);
hip
Stream_t
stream
=
0
);
void
lookup
(
const
key_type
*
keys
,
value_type
*
values
,
int
num_keys
,
value_type
default_value
=
0
,
void
lookup
(
const
key_type
*
keys
,
value_type
*
values
,
int
num_keys
,
value_type
default_value
=
0
,
cuda
Stream_t
stream
=
0
);
hip
Stream_t
stream
=
0
);
private:
private:
key_type
*
table_keys_
;
key_type
*
table_keys_
;
...
...
third_party/HugeCTR/gpu_cache/include/static_table.hpp
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -36,12 +38,12 @@ class static_table {
...
@@ -36,12 +38,12 @@ class static_table {
~
static_table
(){};
~
static_table
(){};
// Query API, i.e. A single read from the cache
// Query API, i.e. A single read from the cache
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
cuda
Stream_t
stream
);
void
Query
(
const
key_type
*
d_keys
,
const
size_t
len
,
float
*
d_values
,
hip
Stream_t
stream
);
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
// Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent
void
Init
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
cuda
Stream_t
stream
);
void
Init
(
const
key_type
*
d_keys
,
const
size_t
len
,
const
float
*
d_values
,
hip
Stream_t
stream
);
void
Clear
(
cuda
Stream_t
stream
);
void
Clear
(
hip
Stream_t
stream
);
private:
private:
StaticHashTable
<
key_type
,
float
>
static_hash_table_
;
StaticHashTable
<
key_type
,
float
>
static_hash_table_
;
...
...
third_party/HugeCTR/gpu_cache/include/uvm_table.hpp
View file @
1d28bf8b
/*
// !!! This is a file automatically generated by hipify!!!
* Copyright (c) 2023, NVIDIA CORPORATION.
#include "hip/hip_runtime.h"
*
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* Copyright (c) 2023, NVIDIA CORPORATION.
* you may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at
* Licensed under the Apache License, Version 2.0 (the "License");
*
* you may not use this file except in compliance with the License.
* http://www.apache.org/licenses/LICENSE-2.0
* You may obtain a copy of the License at
*
*
* Unless required by applicable law or agreed to in writing, software
* http://www.apache.org/licenses/LICENSE-2.0
* distributed under the License is distributed on an "AS IS" BASIS,
*
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* Unless required by applicable law or agreed to in writing, software
* See the License for the specific language governing permissions and
* distributed under the License is distributed on an "AS IS" BASIS,
* limitations under the License.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
* See the License for the specific language governing permissions and
#pragma once
* limitations under the License.
#include <nv_util.h>
*/
#pragma once
#include <thread>
#include <nv_util.h>
#include <unordered_map>
#include <vector>
#include <thread>
#include <unordered_map>
namespace
gpu_cache
{
#include <vector>
template
<
typename
key_type
,
typename
index_type
>
namespace
gpu_cache
{
class
HashBlock
{
public:
template
<
typename
key_type
,
typename
index_type
>
key_type
*
keys
;
class
HashBlock
{
size_t
num_sets
;
public:
size_t
capacity
;
key_type
*
keys
;
size_t
num_sets
;
HashBlock
(
size_t
expected_capacity
,
int
set_size
,
int
batch_size
);
size_t
capacity
;
~
HashBlock
();
void
add
(
const
key_type
*
new_keys
,
const
size_t
num_keys
,
key_type
*
missing_keys
,
HashBlock
(
size_t
expected_capacity
,
int
set_size
,
int
batch_size
);
int
*
num_missing_keys
,
cudaStream_t
stream
);
~
HashBlock
();
void
query
(
const
key_type
*
query_keys
,
const
size_t
num_keys
,
index_type
*
output_indices
,
void
add
(
const
key_type
*
new_keys
,
const
size_t
num_keys
,
key_type
*
missing_keys
,
key_type
*
missing_keys
,
int
*
missing_positions
,
int
*
num_missing_keys
,
int
*
num_missing_keys
,
hipStream_t
stream
);
cudaStream_t
stream
);
void
query
(
const
key_type
*
query_keys
,
const
size_t
num_keys
,
index_type
*
output_indices
,
void
query
(
const
key_type
*
query_keys
,
int
*
num_keys
,
index_type
*
output_indices
,
key_type
*
missing_keys
,
int
*
missing_positions
,
int
*
num_missing_keys
,
cudaStream_t
stream
);
hipStream_t
stream
);
void
clear
(
cudaStream_t
stream
);
void
query
(
const
key_type
*
query_keys
,
int
*
num_keys
,
index_type
*
output_indices
,
hipStream_t
stream
);
private:
void
clear
(
hipStream_t
stream
);
int
max_set_size_
;
int
batch_size_
;
private:
int
*
set_sizes_
;
int
max_set_size_
;
};
int
batch_size_
;
int
*
set_sizes_
;
template
<
typename
vec_type
>
};
class
H2HCopy
{
public:
template
<
typename
vec_type
>
H2HCopy
(
int
num_threads
)
:
num_threads_
(
num_threads
),
working_
(
num_threads
)
{
class
H2HCopy
{
for
(
int
i
=
0
;
i
<
num_threads_
;
i
++
)
{
public:
threads_
.
emplace_back
(
H2HCopy
(
int
num_threads
)
:
num_threads_
(
num_threads
),
working_
(
num_threads
)
{
[
&
](
int
idx
)
{
for
(
int
i
=
0
;
i
<
num_threads_
;
i
++
)
{
while
(
!
terminate_
)
{
threads_
.
emplace_back
(
if
(
working_
[
idx
].
load
(
std
::
memory_order_relaxed
))
{
[
&
](
int
idx
)
{
working_
[
idx
].
store
(
false
,
std
::
memory_order_relaxed
);
while
(
!
terminate_
)
{
if
(
num_keys_
==
0
)
continue
;
if
(
working_
[
idx
].
load
(
std
::
memory_order_relaxed
))
{
size_t
num_keys_this_thread
=
(
num_keys_
-
1
)
/
num_threads_
+
1
;
working_
[
idx
].
store
(
false
,
std
::
memory_order_relaxed
);
size_t
begin
=
idx
*
num_keys_this_thread
;
if
(
num_keys_
==
0
)
continue
;
if
(
idx
==
num_threads_
-
1
)
{
size_t
num_keys_this_thread
=
(
num_keys_
-
1
)
/
num_threads_
+
1
;
num_keys_this_thread
=
num_keys_
-
num_keys_this_thread
*
idx
;
size_t
begin
=
idx
*
num_keys_this_thread
;
}
if
(
idx
==
num_threads_
-
1
)
{
size_t
end
=
begin
+
num_keys_this_thread
;
num_keys_this_thread
=
num_keys_
-
num_keys_this_thread
*
idx
;
}
for
(
size_t
i
=
begin
;
i
<
end
;
i
++
)
{
size_t
end
=
begin
+
num_keys_this_thread
;
size_t
idx_vec
=
get_index_
(
i
);
if
(
idx_vec
==
std
::
numeric_limits
<
size_t
>::
max
())
{
for
(
size_t
i
=
begin
;
i
<
end
;
i
++
)
{
continue
;
size_t
idx_vec
=
get_index_
(
i
);
}
if
(
idx_vec
==
std
::
numeric_limits
<
size_t
>::
max
())
{
memcpy
(
dst_data_ptr_
+
i
*
vec_size_
,
src_data_ptr_
+
idx_vec
*
vec_size_
,
continue
;
sizeof
(
vec_type
)
*
vec_size_
);
}
}
memcpy
(
dst_data_ptr_
+
i
*
vec_size_
,
src_data_ptr_
+
idx_vec
*
vec_size_
,
num_finished_workers_
++
;
sizeof
(
vec_type
)
*
vec_size_
);
}
}
}
num_finished_workers_
++
;
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
microseconds
(
1
));
}
},
}
i
);
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
microseconds
(
1
));
}
},
};
i
);
}
void
copy
(
vec_type
*
dst_data_ptr
,
vec_type
*
src_data_ptr
,
size_t
num_keys
,
int
vec_size
,
};
std
::
function
<
size_t
(
size_t
)
>
get_index_func
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
submit_mutex_
);
void
copy
(
vec_type
*
dst_data_ptr
,
vec_type
*
src_data_ptr
,
size_t
num_keys
,
int
vec_size
,
dst_data_ptr_
=
dst_data_ptr
;
std
::
function
<
size_t
(
size_t
)
>
get_index_func
)
{
src_data_ptr_
=
src_data_ptr
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
submit_mutex_
);
get_index_
=
get_index_func
;
dst_data_ptr_
=
dst_data_ptr
;
num_keys_
=
num_keys
;
src_data_ptr_
=
src_data_ptr
;
vec_size_
=
vec_size
;
get_index_
=
get_index_func
;
num_finished_workers_
.
store
(
0
,
std
::
memory_order_acquire
);
num_keys_
=
num_keys
;
vec_size_
=
vec_size
;
for
(
auto
&
working
:
working_
)
{
num_finished_workers_
.
store
(
0
,
std
::
memory_order_acquire
);
working
.
store
(
true
,
std
::
memory_order_relaxed
);
}
for
(
auto
&
working
:
working_
)
{
working
.
store
(
true
,
std
::
memory_order_relaxed
);
while
(
num_finished_workers_
!=
num_threads_
)
{
}
continue
;
}
while
(
num_finished_workers_
!=
num_threads_
)
{
}
continue
;
}
~
H2HCopy
()
{
}
terminate_
=
true
;
for
(
auto
&
t
:
threads_
)
{
~
H2HCopy
()
{
t
.
join
();
terminate_
=
true
;
}
for
(
auto
&
t
:
threads_
)
{
}
t
.
join
();
}
private:
}
vec_type
*
src_data_ptr_
;
vec_type
*
dst_data_ptr_
;
private:
vec_type
*
src_data_ptr_
;
std
::
function
<
size_t
(
size_t
)
>
get_index_
;
vec_type
*
dst_data_ptr_
;
size_t
num_keys_
;
std
::
function
<
size_t
(
size_t
)
>
get_index_
;
int
vec_size_
;
size_t
num_keys_
;
std
::
mutex
submit_mutex_
;
int
vec_size_
;
const
int
num_threads_
;
std
::
vector
<
std
::
thread
>
threads_
;
std
::
mutex
submit_mutex_
;
std
::
vector
<
std
::
atomic
<
bool
>>
working_
;
const
int
num_threads_
;
volatile
bool
terminate_
{
false
};
std
::
vector
<
std
::
thread
>
threads_
;
std
::
atomic
<
int
>
num_finished_workers_
{
0
};
std
::
vector
<
std
::
atomic
<
bool
>>
working_
;
};
volatile
bool
terminate_
{
false
};
std
::
atomic
<
int
>
num_finished_workers_
{
0
};
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
=
float
>
};
class
UvmTable
{
public:
template
<
typename
key_type
,
typename
index_type
,
typename
vec_type
=
float
>
UvmTable
(
const
size_t
device_table_capacity
,
const
size_t
host_table_capacity
,
class
UvmTable
{
const
int
max_batch_size
,
const
int
vec_size
,
public:
const
vec_type
default_value
=
(
vec_type
)
0
);
UvmTable
(
const
size_t
device_table_capacity
,
const
size_t
host_table_capacity
,
~
UvmTable
();
const
int
max_batch_size
,
const
int
vec_size
,
void
query
(
const
key_type
*
d_keys
,
const
int
len
,
vec_type
*
d_vectors
,
cudaStream_t
stream
=
0
);
const
vec_type
default_value
=
(
vec_type
)
0
);
void
add
(
const
key_type
*
h_keys
,
const
vec_type
*
h_vectors
,
const
size_t
len
);
~
UvmTable
();
void
clear
(
cudaStream_t
stream
=
0
);
void
query
(
const
key_type
*
d_keys
,
const
int
len
,
vec_type
*
d_vectors
,
hipStream_t
stream
=
0
);
void
add
(
const
key_type
*
h_keys
,
const
vec_type
*
h_vectors
,
const
size_t
len
);
private:
void
clear
(
hipStream_t
stream
=
0
);
static
constexpr
int
num_buffers_
=
2
;
key_type
*
d_keys_buffer_
;
private:
vec_type
*
d_vectors_buffer_
;
static
constexpr
int
num_buffers_
=
2
;
vec_type
*
d_vectors_
;
key_type
*
d_keys_buffer_
;
vec_type
*
d_vectors_buffer_
;
index_type
*
d_output_indices_
;
vec_type
*
d_vectors_
;
index_type
*
d_output_host_indices_
;
index_type
*
h_output_host_indices_
;
index_type
*
d_output_indices_
;
index_type
*
d_output_host_indices_
;
key_type
*
d_missing_keys_
;
index_type
*
h_output_host_indices_
;
int
*
d_missing_positions_
;
int
*
d_missing_count_
;
key_type
*
d_missing_keys_
;
int
*
d_missing_positions_
;
std
::
vector
<
vec_type
>
h_vectors_
;
int
*
d_missing_count_
;
key_type
*
h_missing_keys_
;
std
::
vector
<
vec_type
>
h_vectors_
;
cudaStream_t
query_stream_
;
key_type
*
h_missing_keys_
;
cudaEvent_t
query_event_
;
hipStream_t
query_stream_
;
vec_type
*
h_cpy_buffers_
[
num_buffers_
];
hipEvent_t
query_event_
;
vec_type
*
d_cpy_buffers_
[
num_buffers_
];
cudaStream_t
cpy_streams_
[
num_buffers_
];
vec_type
*
h_cpy_buffers_
[
num_buffers_
];
cudaEvent_t
cpy_events_
[
num_buffers_
];
vec_type
*
d_cpy_buffers_
[
num_buffers_
];
hipStream_t
cpy_streams_
[
num_buffers_
];
std
::
unordered_map
<
key_type
,
index_type
>
h_final_missing_items_
;
hipEvent_t
cpy_events_
[
num_buffers_
];
int
max_batch_size_
;
std
::
unordered_map
<
key_type
,
index_type
>
h_final_missing_items_
;
int
vec_size_
;
size_t
num_set_
;
int
max_batch_size_
;
size_t
num_host_set_
;
int
vec_size_
;
size_t
table_capacity_
;
size_t
num_set_
;
std
::
vector
<
vec_type
>
default_vector_
;
size_t
num_host_set_
;
size_t
table_capacity_
;
HashBlock
<
key_type
,
index_type
>
device_table_
;
std
::
vector
<
vec_type
>
default_vector_
;
HashBlock
<
key_type
,
index_type
>
host_table_
;
};
HashBlock
<
key_type
,
index_type
>
device_table_
;
HashBlock
<
key_type
,
index_type
>
host_table_
;
};
}
// namespace gpu_cache
}
// namespace gpu_cache
\ No newline at end of file
third_party/HugeCTR/gpu_cache/src/CMakeLists.txt
View file @
1d28bf8b
...
@@ -15,15 +15,14 @@
...
@@ -15,15 +15,14 @@
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
file
(
GLOB gpu_cache_src
file
(
GLOB gpu_cache_src
nv_gpu_cache.
cu
nv_gpu_cache.
hip
static_table.
cu
static_table.
hip
static_hash_table.
cu
static_hash_table.
hip
uvm_table.
cu
uvm_table.
hip
)
)
add_library
(
gpu_cache SHARED
${
gpu_cache_src
}
)
add_library
(
gpu_cache SHARED
${
gpu_cache_src
}
)
target_compile_features
(
gpu_cache PUBLIC cxx_std_11
)
target_compile_features
(
gpu_cache PUBLIC cxx_std_11
)
set_target_properties
(
gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
set_target_properties
(
gpu_cache PROPERTIES HIP_RESOLVE_DEVICE_SYMBOLS ON
)
set_target_properties
(
gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
# set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF)
set_target_properties
(
gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF
)
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.
cu
→
third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.
hip
View file @
1d28bf8b
This diff is collapsed.
Click to expand it.
third_party/HugeCTR/gpu_cache/src/static_hash_table.
cu
→
third_party/HugeCTR/gpu_cache/src/static_hash_table.
hip
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -14,8 +15,8 @@
...
@@ -14,8 +15,8 @@
* limitations under the License.
* limitations under the License.
*/
*/
#include <cooperative_groups.h>
#include <
hip/hip_
cooperative_groups.h>
#include <
cuda
.h>
#include <
hip/hip_runtime
.h>
#include <stdint.h>
#include <stdint.h>
#include <stdio.h>
#include <stdio.h>
...
@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c
...
@@ -49,7 +50,7 @@ __device__ size_type insert(key_type *table, size_type capacity, key_type key, c
// otherwise return invalid_slot.
// otherwise return invalid_slot.
const size_type num_groups = capacity / group_size;
const size_type num_groups = capacity / group_size;
#if (
CUDA
_VERSION < 11060)
#if (
DTK
_VERSION < 11060)
unsigned long long num_threads_per_group = cg.size();
unsigned long long num_threads_per_group = cg.size();
#else
#else
unsigned long long num_threads_per_group = cg.num_threads();
unsigned long long num_threads_per_group = cg.num_threads();
...
@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c
...
@@ -152,7 +153,7 @@ __device__ size_type lookup(key_type *table, size_type capacity, key_type key, c
const size_type num_groups = capacity / group_size;
const size_type num_groups = capacity / group_size;
#if (
CUDA
_VERSION < 11060)
#if (
DTK
_VERSION < 11060)
unsigned long long num_threads_per_group = cg.size();
unsigned long long num_threads_per_group = cg.size();
#else
#else
unsigned long long num_threads_per_group = cg.num_threads();
unsigned long long num_threads_per_group = cg.num_threads();
...
@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash
...
@@ -300,19 +301,19 @@ StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::StaticHash
size_t align_m = 16;
size_t align_m = 16;
size_t num_keys = key_capacity_ + 1;
size_t num_keys = key_capacity_ + 1;
size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m;
size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m;
CUDA_CHECK
(
cuda
Malloc
(
&
table_keys_
,
sizeof
(
key_type
)
*
num_keys
));
CUDA_CHECK(
hip
Malloc(&table_keys_, sizeof(key_type) * num_keys));
CUDA_CHECK
(
cuda
Malloc
(
&
table_indices_
,
sizeof
(
size_type
)
*
num_keys
));
CUDA_CHECK(
hip
Malloc(&table_indices_, sizeof(size_type) * num_keys));
CUDA_CHECK
(
cuda
Malloc
(
&
table_values_
,
sizeof
(
value_type
)
*
num_values
));
CUDA_CHECK(
hip
Malloc(&table_values_, sizeof(value_type) * num_values));
// Initialize table_keys_
// Initialize table_keys_
CUDA_CHECK
(
cuda
Memset
(
table_keys_
,
0xff
,
sizeof
(
key_type
)
*
key_capacity_
));
CUDA_CHECK(
hip
Memset(table_keys_, 0xff, sizeof(key_type) * key_capacity_));
CUDA_CHECK
(
cuda
Memset
(
table_keys_
+
key_capacity_
,
0
,
sizeof
(
key_type
)));
CUDA_CHECK(
hip
Memset(table_keys_ + key_capacity_, 0, sizeof(key_type)));
}
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert(
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::insert(
const
key_type
*
keys
,
const
value_type
*
values
,
size_type
num_keys
,
cuda
Stream_t
stream
)
{
const key_type *keys, const value_type *values, size_type num_keys,
hip
Stream_t stream) {
if (num_keys == 0) {
if (num_keys == 0) {
return;
return;
}
}
...
@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
...
@@ -324,12 +325,12 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
// Insert keys
// Insert keys
constexpr int block = 256;
constexpr int block = 256;
int grid = (num_keys - 1) / block + 1;
int grid = (num_keys - 1) / block + 1;
InsertKeyKernel
<
tile_size
,
group_size
>
hipLaunchKernelGGL((
InsertKeyKernel<tile_size, group_size>
)
<<<
grid
,
block
,
0
,
stream
>>>
(
table_keys_
,
table_indices_
,
key_capacity_
,
keys
,
num_keys
,
, dim3(
grid
)
,
dim3(
block
)
, 0, stream
,
table_keys_, table_indices_, key_capacity_, keys, num_keys,
size_, hash_, empty_key, invalid_slot);
size_, hash_, empty_key, invalid_slot);
// Copy values
// Copy values
CUDA_CHECK
(
cuda
MemcpyAsync
(
table_values_
+
size_
*
value_dim_
,
values
,
CUDA_CHECK(
hip
MemcpyAsync(table_values_ + size_ * value_dim_, values,
sizeof
(
value_type
)
*
num_keys
*
value_dim_
,
cuda
MemcpyDeviceToDevice
,
sizeof(value_type) * num_keys * value_dim_,
hip
MemcpyDeviceToDevice,
stream));
stream));
size_ += num_keys;
size_ += num_keys;
}
}
...
@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
...
@@ -337,25 +338,25 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::inser
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear(
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::clear(
cuda
Stream_t
stream
)
{
hip
Stream_t stream) {
CUDA_CHECK
(
cuda
MemsetAsync
(
table_keys_
,
0xff
,
sizeof
(
key_type
)
*
key_capacity_
,
stream
));
CUDA_CHECK(
hip
MemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream));
CUDA_CHECK
(
cuda
MemsetAsync
(
table_keys_
+
key_capacity_
,
0
,
sizeof
(
key_type
),
stream
));
CUDA_CHECK(
hip
MemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream));
size_ = 0;
size_ = 0;
}
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
typename hasher>
StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() {
StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::~StaticHashTable() {
CUDA_CHECK
(
cuda
Free
(
table_keys_
));
CUDA_CHECK(
hip
Free(table_keys_));
CUDA_CHECK
(
cuda
Free
(
table_indices_
));
CUDA_CHECK(
hip
Free(table_indices_));
CUDA_CHECK
(
cuda
Free
(
table_values_
));
CUDA_CHECK(
hip
Free(table_values_));
}
}
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
template <typename key_type, typename value_type, unsigned int tile_size, unsigned int group_size,
typename hasher>
typename hasher>
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup(
void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::lookup(
const key_type *keys, value_type *values, int num_keys, value_type default_value,
const key_type *keys, value_type *values, int num_keys, value_type default_value,
cuda
Stream_t
stream
)
{
hip
Stream_t stream) {
if (num_keys == 0) {
if (num_keys == 0) {
return;
return;
}
}
...
@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku
...
@@ -363,7 +364,7 @@ void StaticHashTable<key_type, value_type, tile_size, group_size, hasher>::looku
constexpr int block = 256;
constexpr int block = 256;
const int grid = (num_keys - 1) / block + 1;
const int grid = (num_keys - 1) / block + 1;
// Lookup keys
// Lookup keys
LookupKernel
<
tile_size
,
group_size
>
<<<
grid
,
block
,
0
,
stream
>>>
(
hipLaunchKernelGGL((
LookupKernel<tile_size, group_size>
), dim3(
grid
)
,
dim3(
block
)
, 0, stream
,
table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values,
table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values,
hash_, empty_key, default_value, invalid_slot);
hash_, empty_key, default_value, invalid_slot);
}
}
...
...
third_party/HugeCTR/gpu_cache/src/static_table.
cu
→
third_party/HugeCTR/gpu_cache/src/static_table.
hip
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -14,7 +16,7 @@
...
@@ -14,7 +16,7 @@
* limitations under the License.
* limitations under the License.
*/
*/
#include <cooperative_groups.h>
#include <
hip/hip_
cooperative_groups.h>
#include <nv_util.h>
#include <nv_util.h>
#include <iostream>
#include <iostream>
...
@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed
...
@@ -38,18 +40,18 @@ static_table<key_type>::static_table(const size_t table_size, const size_t embed
template <typename key_type>
template <typename key_type>
void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values,
void static_table<key_type>::Query(const key_type* d_keys, const size_t len, float* d_values,
cuda
Stream_t
stream
)
{
hip
Stream_t stream) {
static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream);
static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream);
}
}
template <typename key_type>
template <typename key_type>
void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values,
void static_table<key_type>::Init(const key_type* d_keys, const size_t len, const float* d_values,
cuda
Stream_t
stream
)
{
hip
Stream_t stream) {
static_hash_table_.insert(d_keys, d_values, len, stream);
static_hash_table_.insert(d_keys, d_values, len, stream);
}
}
template <typename key_type>
template <typename key_type>
void
static_table
<
key_type
>::
Clear
(
cuda
Stream_t
stream
)
{
void static_table<key_type>::Clear(
hip
Stream_t stream) {
static_hash_table_.clear(stream);
static_hash_table_.clear(stream);
}
}
...
...
third_party/HugeCTR/gpu_cache/src/uvm_table.
cu
→
third_party/HugeCTR/gpu_cache/src/uvm_table.
hip
View file @
1d28bf8b
This diff is collapsed.
Click to expand it.
third_party/HugeCTR/gpu_cache/test/CMakeLists.txt
View file @
1d28bf8b
...
@@ -15,14 +15,14 @@
...
@@ -15,14 +15,14 @@
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
file
(
GLOB gpu_cache_test_src
file
(
GLOB gpu_cache_test_src
cache_op_sol_test.
cu
cache_op_sol_test.
hip
../../HugeCTR/src/hps/embedding_cache_gpu.
cu
../../HugeCTR/src/hps/embedding_cache_gpu.
hip
)
)
add_executable
(
cache_op_sol_test
${
gpu_cache_test_src
}
)
add_executable
(
cache_op_sol_test
${
gpu_cache_test_src
}
)
target_compile_features
(
cache_op_sol_test PUBLIC cxx_std_17
)
target_compile_features
(
cache_op_sol_test PUBLIC cxx_std_17
)
target_link_libraries
(
cache_op_sol_test PUBLIC gpu_cache
)
target_link_libraries
(
cache_op_sol_test PUBLIC gpu_cache
)
target_link_libraries
(
cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX
)
target_link_libraries
(
cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX
)
set_target_properties
(
cache_op_sol_test PROPERTIES
CUDA
_RESOLVE_DEVICE_SYMBOLS ON
)
set_target_properties
(
cache_op_sol_test PROPERTIES
HIP
_RESOLVE_DEVICE_SYMBOLS
ON
)
set_target_properties
(
cache_op_sol_test PROPERTIES
CUDA
_ARCHITECTURES OFF
)
set_target_properties
(
cache_op_sol_test PROPERTIES
HIP
_ARCHITECTURES OFF
)
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.
cu
→
third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.
hip
View file @
1d28bf8b
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
*
...
@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_
...
@@ -155,7 +157,7 @@ void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_
template <typename T>
template <typename T>
bool is_near(T a, T b) {
bool is_near(T a, T b) {
double diff = abs(a - b);
double diff = abs(a - b);
bool
ret
=
diff
<=
std
::
min
(
a
,
b
)
*
1e-6
;
bool ret = diff <= ::min(a, b) * 1e-6;
if (!ret) {
if (!ret) {
std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl;
std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl;
}
}
...
@@ -224,7 +226,7 @@ int main(int argc, char** argv) {
...
@@ -224,7 +226,7 @@ int main(int argc, char** argv) {
const size_t cache_type = atoi(argv[7]);
const size_t cache_type = atoi(argv[7]);
// Since cache is designed for single-gpu, all threads just use GPU 0
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK
(
cuda
SetDevice
(
0
));
CUDA_CHECK(
hip
SetDevice(0));
// Host side buffers shared between threads
// Host side buffers shared between threads
key_type* h_keys; // Buffer holding all keys in embedding table
key_type* h_keys; // Buffer holding all keys in embedding table
...
@@ -302,7 +304,7 @@ int main(int argc, char** argv) {
...
@@ -302,7 +304,7 @@ int main(int argc, char** argv) {
int thread_id = omp_get_thread_num();
int thread_id = omp_get_thread_num();
printf("Worker %d starts testing cache.\n", thread_id);
printf("Worker %d starts testing cache.\n", thread_id);
// Since cache is designed for single-gpu, all threads just use GPU 0
// Since cache is designed for single-gpu, all threads just use GPU 0
CUDA_CHECK
(
cuda
SetDevice
(
0
));
CUDA_CHECK(
hip
SetDevice(0));
// Thread-private host side buffers
// Thread-private host side buffers
size_t* h_query_keys_index; // Buffer holding index for keys to be queried
size_t* h_query_keys_index; // Buffer holding index for keys to be queried
...
@@ -324,32 +326,32 @@ int main(int argc, char** argv) {
...
@@ -324,32 +326,32 @@ int main(int argc, char** argv) {
// host-only buffers placed in normal host memory
// host-only buffers placed in normal host memory
h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t));
h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t));
// host-device interactive buffers placed in pinned memory
// host-device interactive buffers placed in pinned memory
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_query_keys
,
query_length
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_query_keys, query_length * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_vals_retrieved
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_missing_keys
,
query_length
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_missing_keys, query_length * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_missing_vals
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_missing_vals,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_missing_index
,
query_length
*
sizeof
(
uint64_t
),
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_missing_index, query_length * sizeof(uint64_t),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
// Allocate device side buffers
// Allocate device side buffers
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_query_keys
,
query_length
*
sizeof
(
key_type
)));
CUDA_CHECK(
hip
Malloc((void**)&d_query_keys, query_length * sizeof(key_type)));
CUDA_CHECK(
CUDA_CHECK(
cuda
Malloc
((
void
**
)
&
d_vals_retrieved
,
query_length
*
embedding_vec_size
*
sizeof
(
float
)));
hip
Malloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_keys
,
query_length
*
sizeof
(
key_type
)));
CUDA_CHECK(
hip
Malloc((void**)&d_missing_keys, query_length * sizeof(key_type)));
CUDA_CHECK(
CUDA_CHECK(
cuda
Malloc
((
void
**
)
&
d_missing_vals
,
query_length
*
embedding_vec_size
*
sizeof
(
float
)));
hip
Malloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_index
,
query_length
*
sizeof
(
uint64_t
)));
CUDA_CHECK(
hip
Malloc((void**)&d_missing_index, query_length * sizeof(uint64_t)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_len
,
sizeof
(
size_t
)));
CUDA_CHECK(
hip
Malloc((void**)&d_missing_len, sizeof(size_t)));
// Thread-private CUDA stream, all threads just use the #0 device
// Thread-private CUDA stream, all threads just use the #0 device
cuda
Stream_t
stream
;
hip
Stream_t stream;
CUDA_CHECK
(
cuda
StreamCreate
(
&
stream
));
CUDA_CHECK(
hip
StreamCreate(&stream));
// Timimg variables
// Timimg variables
double time_1;
double time_1;
...
@@ -382,33 +384,33 @@ int main(int argc, char** argv) {
...
@@ -382,33 +384,33 @@ int main(int argc, char** argv) {
std::cout << std::endl;
std::cout << std::endl;
// Copy the keys to GPU memory
// Copy the keys to GPU memory
CUDA_CHECK
(
cuda
MemcpyAsync
(
d_query_keys
,
h_query_keys
,
query_length
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
MemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Record time
// Record time
time_1 = W_time();
time_1 = W_time();
// Get pairs from hashtable
// Get pairs from hashtable
cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys,
cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
d_missing_len, stream);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Elapsed wall time
// Elapsed wall time
time_2 = W_time() - time_1;
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n",
printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n",
thread_id, i, time_2);
thread_id, i, time_2);
// Copy the data back to host
// Copy the data back to host
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_vals_retrieved
,
d_vals_retrieved
,
CUDA_CHECK(
hip
MemcpyAsync(h_vals_retrieved, d_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_missing_index
,
d_missing_index
,
query_length
*
sizeof
(
uint64_t
),
CUDA_CHECK(
hip
MemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_missing_keys
,
d_missing_keys
,
query_length
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
MemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
&
h_missing_len
,
d_missing_len
,
sizeof
(
size_t
),
CUDA_CHECK(
hip
MemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i,
printf("Worker %d : %zu round : Missing key: %zu. Hit rate: %f %%.\n", thread_id, i,
h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f));
h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f));
...
@@ -433,13 +435,13 @@ int main(int argc, char** argv) {
...
@@ -433,13 +435,13 @@ int main(int argc, char** argv) {
thread_id, i, time_2);
thread_id, i, time_2);
// Copy the missing value to device
// Copy the missing value to device
CUDA_CHECK
(
cuda
MemcpyAsync
(
d_missing_vals
,
h_missing_vals
,
CUDA_CHECK(
hip
MemcpyAsync(d_missing_vals, h_missing_vals,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
d_vals_retrieved
,
h_vals_retrieved
,
CUDA_CHECK(
hip
MemcpyAsync(d_vals_retrieved, h_vals_retrieved,
query_length * embedding_vec_size * sizeof(float),
query_length * embedding_vec_size * sizeof(float),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Record time
// Record time
time_1 = W_time();
time_1 = W_time();
...
@@ -449,7 +451,7 @@ int main(int argc, char** argv) {
...
@@ -449,7 +451,7 @@ int main(int argc, char** argv) {
else
else
cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream);
cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Elapsed wall time
// Elapsed wall time
time_2 = W_time() - time_1;
time_2 = W_time() - time_1;
printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n",
printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n",
...
@@ -466,20 +468,20 @@ int main(int argc, char** argv) {
...
@@ -466,20 +468,20 @@ int main(int argc, char** argv) {
printf("Worker %d : All Finished!\n", thread_id);
printf("Worker %d : All Finished!\n", thread_id);
// Clean-up
// Clean-up
cuda
StreamDestroy
(
stream
);
hip
StreamDestroy(stream);
free(h_query_keys_index);
free(h_query_keys_index);
CUDA_CHECK
(
cudaFreeHost
(
h_query_keys
));
CUDA_CHECK(
hipHostFree
(h_query_keys));
CUDA_CHECK
(
cudaFreeHost
(
h_vals_retrieved
));
CUDA_CHECK(
hipHostFree
(h_vals_retrieved));
CUDA_CHECK
(
cudaFreeHost
(
h_missing_keys
));
CUDA_CHECK(
hipHostFree
(h_missing_keys));
CUDA_CHECK
(
cudaFreeHost
(
h_missing_vals
));
CUDA_CHECK(
hipHostFree
(h_missing_vals));
CUDA_CHECK
(
cudaFreeHost
(
h_missing_index
));
CUDA_CHECK(
hipHostFree
(h_missing_index));
CUDA_CHECK
(
cuda
Free
(
d_query_keys
));
CUDA_CHECK(
hip
Free(d_query_keys));
CUDA_CHECK
(
cuda
Free
(
d_vals_retrieved
));
CUDA_CHECK(
hip
Free(d_vals_retrieved));
CUDA_CHECK
(
cuda
Free
(
d_missing_keys
));
CUDA_CHECK(
hip
Free(d_missing_keys));
CUDA_CHECK
(
cuda
Free
(
d_missing_vals
));
CUDA_CHECK(
hip
Free(d_missing_vals));
CUDA_CHECK
(
cuda
Free
(
d_missing_index
));
CUDA_CHECK(
hip
Free(d_missing_index));
CUDA_CHECK
(
cuda
Free
(
d_missing_len
));
CUDA_CHECK(
hip
Free(d_missing_len));
}
}
// 1st test Clean-up
// 1st test Clean-up
...
@@ -547,57 +549,57 @@ int main(int argc, char** argv) {
...
@@ -547,57 +549,57 @@ int main(int argc, char** argv) {
key_type* d_missing_keys;
key_type* d_missing_keys;
size_t* d_missing_len;
size_t* d_missing_len;
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_insert_keys
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_insert_keys,
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_insert_vals
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_dump_keys
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_dump_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
(
CUDA_CHECK(
hip
Host
Ma
lloc(
(void**)&h_vals_retrieved,
(void**)&h_vals_retrieved,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Host
A
lloc
((
void
**
)
&
h_acc_keys
,
CUDA_CHECK(
hip
Host
Ma
lloc((void**)&h_acc_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cuda
Host
A
llocPortable
));
hip
Host
Ma
llocPortable));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_keys
,
CUDA_CHECK(
hip
Malloc((void**)&d_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_vals
,
SLAB_SIZE
*
SET_ASSOCIATIVITY
*
cache_capacity_in_set
*
CUDA_CHECK(
hip
Malloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set *
embedding_vec_size * sizeof(float)));
embedding_vec_size * sizeof(float)));
CUDA_CHECK(
CUDA_CHECK(
cuda
Malloc
((
void
**
)
&
d_insert_keys
,
SLAB_SIZE
*
cache_capacity_in_set
*
sizeof
(
key_type
)));
hip
Malloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_insert_vals
,
CUDA_CHECK(
hip
Malloc((void**)&d_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_dump_keys
,
CUDA_CHECK(
hip
Malloc((void**)&d_dump_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK
(
cuda
Malloc
(
CUDA_CHECK(
hip
Malloc(
(void**)&d_vals_retrieved,
(void**)&d_vals_retrieved,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_dump_counter
,
sizeof
(
size_t
)));
CUDA_CHECK(
hip
Malloc((void**)&d_dump_counter, sizeof(size_t)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_index
,
CUDA_CHECK(
hip
Malloc((void**)&d_missing_index,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_keys
,
CUDA_CHECK(
hip
Malloc((void**)&d_missing_keys,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)));
CUDA_CHECK
(
cuda
Malloc
((
void
**
)
&
d_missing_len
,
sizeof
(
size_t
)));
CUDA_CHECK(
hip
Malloc((void**)&d_missing_len, sizeof(size_t)));
// CUDA stream
// CUDA stream
cuda
Stream_t
stream
;
hip
Stream_t stream;
CUDA_CHECK
(
cuda
StreamCreate
(
&
stream
));
CUDA_CHECK(
hip
StreamCreate(&stream));
// Copy all keys and values from host to device
// Copy all keys and values from host to device
CUDA_CHECK
(
cuda
MemcpyAsync
(
CUDA_CHECK(
hip
MemcpyAsync(
d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
CUDA_CHECK(
hip
MemcpyAsync(
d_vals, h_new_vals,
d_vals, h_new_vals,
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Each time insert 1 slab per slabset into the cache and check result
// Each time insert 1 slab per slabset into the cache and check result
for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) {
for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) {
...
@@ -615,17 +617,17 @@ int main(int argc, char** argv) {
...
@@ -615,17 +617,17 @@ int main(int argc, char** argv) {
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type));
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type));
// Copy the <k,v> pairs from host to device
// Copy the <k,v> pairs from host to device
CUDA_CHECK
(
cuda
MemcpyAsync
(
d_insert_keys
,
h_insert_keys
,
CUDA_CHECK(
hip
MemcpyAsync(d_insert_keys, h_insert_keys,
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
SLAB_SIZE * cache_capacity_in_set * sizeof(key_type),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
CUDA_CHECK(
CUDA_CHECK(
cuda
MemcpyAsync
(
d_insert_vals
,
h_insert_vals
,
hip
MemcpyAsync(d_insert_vals, h_insert_vals,
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float),
cuda
MemcpyHostToDevice
,
stream
));
hip
MemcpyHostToDevice, stream));
// Insert the <k,v> pairs into the cache
// Insert the <k,v> pairs into the cache
cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream);
cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Record time
// Record time
time_a = W_time();
time_a = W_time();
...
@@ -633,7 +635,7 @@ int main(int argc, char** argv) {
...
@@ -633,7 +635,7 @@ int main(int argc, char** argv) {
cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream,
cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream,
SLAB_SIZE);
SLAB_SIZE);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Elapsed wall time
// Elapsed wall time
time_b = W_time() - time_a;
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b);
printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b);
...
@@ -644,31 +646,31 @@ int main(int argc, char** argv) {
...
@@ -644,31 +646,31 @@ int main(int argc, char** argv) {
// Dump the keys from the cache
// Dump the keys from the cache
cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream);
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Elapsed wall time
// Elapsed wall time
time_b = W_time() - time_a;
time_b = W_time() - time_a;
printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b);
printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b);
// Copy the dump counter from device to host
// Copy the dump counter from device to host
CUDA_CHECK
(
cuda
MemcpyAsync
(
&
h_dump_counter
,
d_dump_counter
,
sizeof
(
size_t
),
CUDA_CHECK(
hip
MemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Check the dump counter
// Check the dump counter
assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1));
assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1));
// Query all the dumped keys from the cache
// Query all the dumped keys from the cache
cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys,
cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys,
d_missing_len, stream);
d_missing_len, stream);
// Copy result from device to host
// Copy result from device to host
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_dump_keys
,
d_dump_keys
,
h_dump_counter
*
sizeof
(
key_type
),
CUDA_CHECK(
hip
MemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
h_vals_retrieved
,
d_vals_retrieved
,
CUDA_CHECK(
hip
MemcpyAsync(h_vals_retrieved, d_vals_retrieved,
h_dump_counter * embedding_vec_size * sizeof(float),
h_dump_counter * embedding_vec_size * sizeof(float),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
CUDA_CHECK
(
cuda
MemcpyAsync
(
&
h_missing_len
,
d_missing_len
,
sizeof
(
size_t
),
CUDA_CHECK(
hip
MemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t),
cuda
MemcpyDeviceToHost
,
stream
));
hip
MemcpyDeviceToHost, stream));
// Wait for stream to complete
// Wait for stream to complete
CUDA_CHECK
(
cuda
StreamSynchronize
(
stream
));
CUDA_CHECK(
hip
StreamSynchronize(stream));
// Check result
// Check result
assert(h_missing_len == 0);
assert(h_missing_len == 0);
compare_key(h_dump_keys, h_acc_keys, h_dump_counter);
compare_key(h_dump_keys, h_acc_keys, h_dump_counter);
...
@@ -679,27 +681,27 @@ int main(int argc, char** argv) {
...
@@ -679,27 +681,27 @@ int main(int argc, char** argv) {
printf("Update and Dump API test all finished!\n");
printf("Update and Dump API test all finished!\n");
// 2nd test clean-up
// 2nd test clean-up
CUDA_CHECK
(
cuda
StreamDestroy
(
stream
));
CUDA_CHECK(
hip
StreamDestroy(stream));
free(h_keys);
free(h_keys);
free(h_vals);
free(h_vals);
free(h_new_vals);
free(h_new_vals);
CUDA_CHECK
(
cudaFreeHost
(
h_insert_keys
));
CUDA_CHECK(
hipHostFree
(h_insert_keys));
CUDA_CHECK
(
cudaFreeHost
(
h_insert_vals
));
CUDA_CHECK(
hipHostFree
(h_insert_vals));
CUDA_CHECK
(
cudaFreeHost
(
h_dump_keys
));
CUDA_CHECK(
hipHostFree
(h_dump_keys));
CUDA_CHECK
(
cudaFreeHost
(
h_vals_retrieved
));
CUDA_CHECK(
hipHostFree
(h_vals_retrieved));
CUDA_CHECK
(
cudaFreeHost
(
h_acc_keys
));
CUDA_CHECK(
hipHostFree
(h_acc_keys));
CUDA_CHECK
(
cuda
Free
(
d_keys
));
CUDA_CHECK(
hip
Free(d_keys));
CUDA_CHECK
(
cuda
Free
(
d_vals
));
CUDA_CHECK(
hip
Free(d_vals));
CUDA_CHECK
(
cuda
Free
(
d_insert_keys
));
CUDA_CHECK(
hip
Free(d_insert_keys));
CUDA_CHECK
(
cuda
Free
(
d_insert_vals
));
CUDA_CHECK(
hip
Free(d_insert_vals));
CUDA_CHECK
(
cuda
Free
(
d_dump_keys
));
CUDA_CHECK(
hip
Free(d_dump_keys));
CUDA_CHECK
(
cuda
Free
(
d_vals_retrieved
));
CUDA_CHECK(
hip
Free(d_vals_retrieved));
CUDA_CHECK
(
cuda
Free
(
d_dump_counter
));
CUDA_CHECK(
hip
Free(d_dump_counter));
CUDA_CHECK
(
cuda
Free
(
d_missing_index
));
CUDA_CHECK(
hip
Free(d_missing_index));
CUDA_CHECK
(
cuda
Free
(
d_missing_keys
));
CUDA_CHECK(
hip
Free(d_missing_keys));
CUDA_CHECK
(
cuda
Free
(
d_missing_len
));
CUDA_CHECK(
hip
Free(d_missing_len));
delete cache;
delete cache;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment