OpenDAS / dgl
Commit 833803f3, authored Sep 23, 2024 by sangwzh
Parent: 1d28bf8b

Commit message: update dgl codes to hip
Changes: 36 in total; this page shows 16 changed files with 92 additions and 93 deletions (+92 −93).
include/dgl/packed_func_ext.h           +4  −3
include/dgl/runtime/c_object_api.h      +2  −1
include/dgl/runtime/c_runtime_api.h     +1  −0
include/dgl/runtime/device_api.h        +4  −3
include/dgl/runtime/module.h            +1  −0
include/dgl/runtime/ndarray.h           +24 −11
include/dgl/runtime/tensordispatch.h    +10 −9
python/dgl/_ffi/runtime_ctypes.py       +2  −2
python/dgl/backend/pytorch/tensor.py    +1  −1
src/array/cuda/spmat_op_impl_csr.hip    +3  −28
src/runtime/cuda/cuda_device_api.cc     +5  −4
tensoradapter/include/tensoradapter.h   +6  −5
tensoradapter/pytorch/CMakeLists.txt    +3  −1
tensoradapter/pytorch/build.sh          +5  −5
tensoradapter/pytorch/torch.cpp         +20 −19
tests/cpp/message_queue_test.cc         +1  −1
include/dgl/packed_func_ext.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file packed_func_ext.h
 ...
@@ -12,9 +13,9 @@
 #include <string>
 #include <type_traits>
-#include "./runtime/container.h"
+#include "runtime/container.h"
-#include "./runtime/object.h"
+#include "runtime/object.h"
-#include "./runtime/packed_func.h"
+#include "runtime/packed_func.h"
 namespace dgl {
 namespace runtime {
 ...
include/dgl/runtime/c_object_api.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file dgl/runtime/c_object_api.h
 ...
@@ -10,7 +11,7 @@
 #ifndef DGL_RUNTIME_C_OBJECT_API_H_
 #define DGL_RUNTIME_C_OBJECT_API_H_
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 #ifdef __cplusplus
 extern "C" {
 ...
include/dgl/runtime/c_runtime_api.h

@@ -57,6 +57,7 @@ typedef enum {
   /** @brief CUDA GPU device */
   kDGLCUDA = 2,
   // add more devices once supported
+  kDGLROCM = 10,
 } DGLDeviceType;
 /**
 ...
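Elsewhere in this commit the new kDGLROCM code is folded straight back into the CUDA path: DeviceTypeCode2Str returns "cuda" for it, operator== compares 10 as 2, and the Python side maps "rocm" to code 2. A minimal sketch of that convention (EffectiveDeviceType is a hypothetical helper for illustration, not part of the diff):

#include <cstdint>

// Device type 10 (kDGLROCM) is meant to behave exactly like 2 (kDGLCUDA).
enum DGLDeviceTypeSketch : int32_t { kCPU = 1, kCUDA = 2, kROCM = 10 };

inline DGLDeviceTypeSketch EffectiveDeviceType(DGLDeviceTypeSketch t) {
  // Fold ROCm into the CUDA code path, as the rest of the commit does.
  return t == kROCM ? kCUDA : t;
}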
include/dgl/runtime/device_api.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2016 by Contributors
  * @file dgl/runtime/device_api.h
 ...
@@ -174,7 +175,7 @@ class DeviceAPI {
       DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst);
   /**
-   * @brief Pin host memory using cudaHostRegister().
+   * @brief Pin host memory using hipHostRegister().
    *
    * @param ptr The host memory pointer to be pinned.
    * @param nbytes The size to be pinned.
 ...
@@ -183,7 +184,7 @@ class DeviceAPI {
   DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);
   /**
-   * @brief Unpin host memory using cudaHostUnregister().
+   * @brief Unpin host memory using hipHostUnregister().
    *
    * @param ptr The host memory pointer to be unpinned.
    */
 ...
@@ -203,7 +204,7 @@ class DeviceAPI {
   /**
    * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
-   * @note It avoids unnecessary cudaFreeHost calls and puts the memory
+   * @note It avoids unnecessary hipHostFree calls and puts the memory
    * block into CachingHostAllocator's free list.
    * @param deleter Pointer to the deleter function from PyTorch's
    * CachingHostAllocator.
 ...
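The pinning API keeps its shape across the port: hipHostRegister/hipHostUnregister are drop-in counterparts of cudaHostRegister/cudaHostUnregister. A minimal standalone sketch of the register/unregister pattern these doc comments describe (error handling reduced to return-code checks):

#include <hip/hip_runtime.h>
#include <vector>

// Pin an existing host buffer so async device copies can use DMA,
// then unpin it before the buffer goes away.
bool PinUnpinDemo() {
  std::vector<char> buf(1 << 20);
  if (hipHostRegister(buf.data(), buf.size(), hipHostRegisterDefault) !=
      hipSuccess)
    return false;  // pinning failed (no device, already pinned, ...)
  // ... hipMemcpyAsync to/from buf.data() would go here ...
  return hipHostUnregister(buf.data()) == hipSuccess;
}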
include/dgl/runtime/module.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017 by Contributors
  * @file dgl/runtime/module.h
 ...
include/dgl/runtime/ndarray.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017-2022 by Contributors
  * @file dgl/runtime/ndarray.h
 ...
@@ -18,13 +19,20 @@
 #include "shared_mem.h"
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
-#define BF16_ENABLED (defined(CUDART_VERSION) && CUDART_VERSION >= 11000)
+// #define BF16_ENABLED (defined(DTKRT_VERSION) && DTKRT_VERSION >= 11000)
+#if defined(DTKRT_VERSION)
+#define DTKRT_VERSION_CHECK (DTKRT_VERSION >= 11000)
+#else
+#define DTKRT_VERSION_CHECK 0
+#endif
+#define BF16_ENABLED DTKRT_VERSION_CHECK
-#include <cuda_fp16.h>
+#include <hip/hip_fp16.h>
 #if BF16_ENABLED
-#include <cuda_bf16.h>
+#include <hip/hip_bf16.h>
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
 ...
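The BF16 guard changes shape as well as name: evaluating defined() inside an object-like macro body is undefined behavior, so the port hoists the version test into a real #if/#else that yields a plain 0/1 constant, keyed on DTKRT_VERSION (presumably the DTK runtime's version macro). The same pattern in isolation, with FOO_VERSION as a stand-in name:

// Non-portable: defined() inside a macro expansion is undefined behavior.
// #define FEATURE_ENABLED (defined(FOO_VERSION) && FOO_VERSION >= 11000)

// Portable: resolve the test once, in a real preprocessor conditional.
#if defined(FOO_VERSION)
#define FEATURE_VERSION_CHECK (FOO_VERSION >= 11000)
#else
#define FEATURE_VERSION_CHECK 0
#endif
#define FEATURE_ENABLED FEATURE_VERSION_CHECK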
@@ -60,7 +68,7 @@ GEN_DGLDATATYPETRAITS_FOR(uint64_t, kDGLInt, 64);
 #ifdef DGL_USE_CUDA
 GEN_DGLDATATYPETRAITS_FOR(__half, kDGLFloat, 16);
 #if BF16_ENABLED
-GEN_DGLDATATYPETRAITS_FOR(__nv_bfloat16, kDGLBfloat, 16);
+GEN_DGLDATATYPETRAITS_FOR(__hip_bfloat16, kDGLBfloat, 16);
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
 GEN_DGLDATATYPETRAITS_FOR(float, kDGLFloat, 32);
 ...
@@ -185,7 +193,7 @@ class NDArray {
    * CachingHostAllocator for allocating pinned memory and copying data
    * from the current NDAarray. As a result, PyTorch is responsible for
    * managing the lifecycle of the returned NDArray, including deciding
-   * when to flush the data for reuse or call cudaFreeHost. The current
+   * when to flush the data for reuse or call hipHostFree. The current
    * context must be kDGLCPU, otherwise, an error will be thrown.
    */
  inline NDArray PinMemory();
 ...
@@ -194,7 +202,7 @@ class NDArray {
    * @brief In-place method to pin the current array by calling PinContainer
    * on the underlying NDArray:Container.
    * @note This is an in-place method that flags the memory as page-locked by
-   * utilizing cudaHostRegister at the underlying level to pin the current
+   * utilizing hipHostRegister at the underlying level to pin the current
    * instance of NDArray. The current context must be kDGLCPU, otherwise,
    * an error will be thrown.
    */
 ...
@@ -523,7 +531,7 @@ inline void NDArray::CopyFrom(const NDArray& other) {
   // Pinned by PyTorch
   if (cpu_data->pinned_by_pytorch_) {
     // To ensure correct behavior, the event must be recorded after
-    // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
+    // hipMemcpyAsync as long as the memory is pinned by PyTorch.
     void* pytorch_ctx = cpu_data->pytorch_ctx_;
     RecordedCopyFromTo(
         &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx);
 ...
@@ -549,7 +557,7 @@ inline void NDArray::CopyTo(const NDArray& other) const {
   // pinned by PyTorch
   if (cpu_data->pinned_by_pytorch_) {
     // To ensure correct behavior, the event must be recorded after
-    // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
+    // hipMemcpyAsync as long as the memory is pinned by PyTorch.
     void* pytorch_ctx = cpu_data->pytorch_ctx_;
     RecordedCopyFromTo(
         &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx);
 ...
@@ -716,6 +724,8 @@ inline const char* DeviceTypeCode2Str(DGLDeviceType device_type) {
       return "cpu";
     case kDGLCUDA:
       return "cuda";
+    case kDGLROCM:
+      return "cuda";
     default:
       LOG(FATAL) << "Unsupported device type code="
                  << static_cast<int>(device_type);
 ...
@@ -871,8 +881,11 @@ inline std::ostream& operator<<(std::ostream& os, DGLDataType t) {
 /** @brief Check whether two device contexts are the same.*/
 inline bool operator==(const DGLContext& ctx1, const DGLContext& ctx2) {
-  return ctx1.device_type == ctx2.device_type &&
-         ctx1.device_id == ctx2.device_id;
+  // printf("**************** debug compare DGLContext, %d, %d\n",ctx1.device_type,ctx2.device_type);
+  int ct1 = ctx1.device_type == 10 ? 2 : ctx1.device_type;
+  int ct2 = ctx2.device_type == 10 ? 2 : ctx2.device_type;
+  return ct1 == ct2 && int(ctx1.device_id) == int(ctx2.device_id);
 }
 /** @brief Check whether two device contexts are different.*/
 ...
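With the patched operator==, a context carrying device_type 10 (ROCm) compares equal to one carrying 2 (CUDA) whenever the device ids match. A small self-contained illustration of the intended behavior (standalone struct, not the DGL headers):

#include <cassert>

struct Ctx { int device_type; int device_id; };

// Mirrors the patched operator==: fold ROCm (10) into CUDA (2) first.
bool SameCtx(const Ctx& a, const Ctx& b) {
  int ta = a.device_type == 10 ? 2 : a.device_type;
  int tb = b.device_type == 10 ? 2 : b.device_type;
  return ta == tb && a.device_id == b.device_id;
}

int main() {
  assert(SameCtx({2, 0}, {10, 0}));   // CUDA ctx equals ROCm ctx on device 0
  assert(!SameCtx({2, 0}, {10, 1}));  // different device ids still differ
}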
include/dgl/runtime/tensordispatch.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file array/tensordispatch.h
 ...
@@ -34,7 +35,7 @@
 #include <windows.h>
 #endif  // WIN32
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 #include "ndarray.h"
 ...
@@ -97,14 +98,14 @@ class TensorDispatcher {
    * Used in CUDADeviceAPI::AllocWorkspace().
    *
    * @note THCCachingAllocator specify the device to allocate on
-   * via cudaGetDevice(). Make sure to call cudaSetDevice()
+   * via hipGetDevice(). Make sure to call hipSetDevice()
    * before invoking this function.
    *
    * @param nbytes The size to be allocated.
    * @param stream The stream to be allocated on.
    * @return Pointer to the allocated memory.
    */
-  inline void* CUDAAllocWorkspace(size_t nbytes, cudaStream_t stream) {
+  inline void* CUDAAllocWorkspace(size_t nbytes, hipStream_t stream) {
    auto entry = entrypoints_[Op::kCUDARawAlloc];
    return FUNCCAST(tensoradapter::CUDARawAlloc, entry)(nbytes, stream);
  }
 ...
@@ -122,15 +123,15 @@ class TensorDispatcher {
   /**
    * @brief Find the current PyTorch CUDA stream
-   * Used in runtime::getCurrentCUDAStream().
+   * Used in runtime::getCurrentHIPStreamMasqueradingAsCUDA().
    *
    * @note PyTorch pre-allocates/sets the current CUDA stream
-   * on current device via cudaGetDevice(). Make sure to call cudaSetDevice()
+   * on current device via hipGetDevice(). Make sure to call hipSetDevice()
    * before invoking this function.
    *
-   * @return cudaStream_t stream handle
+   * @return hipStream_t stream handle
    */
-  inline cudaStream_t CUDAGetCurrentStream() {
+  inline hipStream_t CUDAGetCurrentStream() {
    auto entry = entrypoints_[Op::kCUDACurrentStream];
    return FUNCCAST(tensoradapter::CUDACurrentStream, entry)();
  }
 ...
@@ -183,7 +184,7 @@ class TensorDispatcher {
    * @param device_id Device of the tensor.
    */
   inline void CUDARecordHostAlloc(
-      void* data, void* ctx, cudaStream_t stream, int device_id) {
+      void* data, void* ctx, hipStream_t stream, int device_id) {
    auto entry = entrypoints_[Op::kCUDARecordHostAlloc];
    auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry);
    recorded_alloc(data, ctx, stream, device_id);
 ...
@@ -212,7 +213,7 @@ class TensorDispatcher {
 #ifdef DGL_USE_CUDA
    auto entry = entrypoints_[Op::kRecordStream];
    FUNCCAST(tensoradapter::RecordStream, entry)
-       (ptr, static_cast<cudaStream_t>(stream), device_id);
+       (ptr, static_cast<hipStream_t>(stream), device_id);
 #endif
  }
 ...
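As the @note lines above stress, both entry points assume the right device is already current. A hedged usage sketch of that workflow; TensorDispatcher::Global() is assumed from the surrounding DGL code base and is not shown in this diff:

#include <hip/hip_runtime.h>

// Allocate scratch space from PyTorch's caching allocator on a given device.
void* AllocWorkspaceOn(int device_id, size_t nbytes) {
  hipSetDevice(device_id);  // must precede both dispatcher calls (see @note)
  auto* td = dgl::runtime::TensorDispatcher::Global();  // assumed accessor
  hipStream_t s = td->CUDAGetCurrentStream();  // PyTorch stream, HIP underneath
  return td->CUDAAllocWorkspace(nbytes, s);
}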
python/dgl/_ffi/runtime_ctypes.py

@@ -123,7 +123,7 @@ class DGLContext(ctypes.Structure):
         7: "vulkan",
         8: "metal",
         9: "vpi",
-        10: "rocm",
+        10: "gpu",
         11: "opengl",
         12: "ext_dev",
     }
 ...
@@ -142,7 +142,7 @@ class DGLContext(ctypes.Structure):
         "vulkan": 7,
         "metal": 8,
         "vpi": 9,
-        "rocm": 10,
+        "rocm": 2,
         "opengl": 11,
         "ext_dev": 12,
     }
 ...
python/dgl/backend/pytorch/tensor.py

@@ -116,7 +116,7 @@ def to_backend_ctx(dglctx):
     dev_type = dglctx.device_type
     if dev_type == 1:
         return th.device("cpu")
-    elif dev_type == 2:
+    elif dev_type == 2 or dev_type == 10:
         return th.device("cuda", dglctx.device_id)
     else:
         raise ValueError("Unsupported DGL device context:", dglctx)
 ...
src/array/cuda/spmat_op_impl_csr.hip

@@ -548,8 +548,7 @@ __global__ void _SegmentMaskColKernel(
       mask[idx] = 1;
     }
   }
   IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
-  printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
   if (laneid == 0) {
     count[out_row] = reduce_count;
   }
 ...
@@ -567,9 +566,6 @@ CSRMatrix CSRSliceMatrix(
   const int64_t new_nrows = rows->shape[0];
   const int64_t new_ncols = cols->shape[0];
-  std::cout << "new_nrows : " << new_nrows << std::endl;
-  std::cout << "new_ncols : " << new_ncols << std::endl;
   if (new_nrows == 0 || new_ncols == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
 ...
@@ -578,7 +574,6 @@ CSRMatrix CSRSliceMatrix(
   // First slice rows
   csr = CSRSliceRows(csr, rows);
-  std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
   if (csr.indices->shape[0] == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
 ...
@@ -588,11 +583,9 @@ CSRMatrix CSRSliceMatrix(
   IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
   // A count for how many masked values per row.
   IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
-  std::cout << "1 IdArray count : " << count << std::endl;
   CUDA_CALL(
       hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
-  std::cout << "2 IdArray count : " << count << std::endl;
   // Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
   // For performance, the load factor of the hashmap is in (0.25, 0.5);
   // Because num_cols is usually less than 1 Million (on GPU), the
 ...
@@ -618,45 +611,29 @@ CSRMatrix CSRSliceMatrix(
   // Execute SegmentMaskColKernel
   const int64_t num_rows = csr.num_rows;
-  constexpr int WARP_SIZE = 32;
+  constexpr int WARP_SIZE = 64;
   // With a simple fine-tuning, TILE_SIZE=16 gives a good performance.
-  constexpr int TILE_SIZE = 16;
+  constexpr int TILE_SIZE = 32;
   constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE;
   IdType nb =
       dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
   const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
   const dim3 nblks(nb);
-  std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
-  std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
-  std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << "TILE_SIZE : " << std::endl;
-  std::cout << "indptr_data : " << indptr_data << std::endl;
-  std::cout << "indices_data : " << indices_data << std::endl;
-  std::cout << "num_rows : " << num_rows << std::endl;
-  std::cout << "buffer_size : " << buffer_size << std::endl;
-  std::cout << "mask : " << mask << std::endl;
-  std::cout << "count : " << count << std::endl;
-  std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
   CUDA_KERNEL_CALL(
       (_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
       nthrs, 0, stream, indptr_data, indices_data, num_rows,
       hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
       count.Ptr<IdType>());
-  std::cout << "3 IdArray count : " << count << std::endl;
   IdArray idx = AsNumBits(NonZero(mask), nbits);
-  std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
   if (idx->shape[0] == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
         NullArray(dtype, ctx), NullArray(dtype, ctx));
   // Indptr needs to be adjusted according to the new nnz per row.
-  std::cout << " count : " << count << std::endl;
   IdArray ret_indptr = CumSum(count, true);
-  std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
   // Column & data can be obtained by index select.
   IdArray ret_col = IndexSelect(csr.indices, idx);
 ...
@@ -667,8 +644,6 @@ CSRMatrix CSRSliceMatrix(
   Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
   ret_col = IndexSelect(col_hash, ret_col);
-  // std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
   return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
 }
 ...
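Besides stripping the debug output, the only functional change here is the launch geometry: AMD GPUs execute 64-lane wavefronts, so WARP_SIZE moves from 32 to 64, and TILE_SIZE doubles to 32 so each block still covers a similar number of rows. A quick arithmetic sketch of the resulting launch shape, assuming CUDA_MAX_NUM_THREADS is 1024 (upstream DGL's value; not shown in this diff):

#include <cstdio>

int main() {
  constexpr int kMaxThreads = 1024;  // assumed CUDA_MAX_NUM_THREADS
  constexpr int WARP_SIZE = 64;      // AMD wavefront width
  constexpr int TILE_SIZE = 32;      // rows covered per warp-tile
  constexpr int BLOCK_WARPS = kMaxThreads / WARP_SIZE;  // 16 warps per block
  const long long num_rows = 100000;
  // Grid size before FindNumBlocks applies its device cap.
  const long long nblocks = (num_rows + TILE_SIZE - 1) / TILE_SIZE;
  std::printf("block = (%d, %d), grid = %lld\n", WARP_SIZE, BLOCK_WARPS,
              nblocks);  // block = (64, 16), grid = 3125
}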
src/runtime/cuda/cuda_device_api.cc

@@ -74,7 +74,7 @@ class CUDADeviceAPI final : public DeviceAPI {
       hipDeviceProp_t props;
       CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
       *rv = std::string(props.name);
-      // printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
+      printf("******* debug: device.name:%s\n ",std::string(props.name).c_str());
       return;
     }
     case kMaxClockRate: {
 ...
@@ -136,7 +136,8 @@ class CUDADeviceAPI final : public DeviceAPI {
     hipStream_t cu_stream = static_cast<hipStream_t>(stream);
     from = static_cast<const char*>(from) + from_offset;
     to = static_cast<char*>(to) + to_offset;
-    if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA ||
-        ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    // if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    if ((ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) &&
+        (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
       CUDA_CALL(hipSetDevice(ctx_from.device_id));
       if (ctx_from.device_id == ctx_to.device_id) {
         GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
 ...
@@ -145,7 +146,7 @@ class CUDADeviceAPI final : public DeviceAPI {
             to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
       }
     } else if (
-        (ctx_from.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM) &&
+        (ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) &&
         ctx_to.device_type == kDGLCPU) {
       CUDA_CALL(hipSetDevice(ctx_from.device_id));
       GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
     } else if (
 ...
@@ -153,7 +154,7 @@ class CUDADeviceAPI final : public DeviceAPI {
       CUDA_CALL(hipSetDevice(ctx_to.device_id));
       GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
     } else {
-      LOG(FATAL) << "expect copy from/to GPU or between GPU";
+      LOG(FATAL) << "expect copy from/to GPU or between GPU. ctx_from.device_type: "
+                 << ctx_from.device_type << ", ctx_to.device_type: "
+                 << ctx_to.device_type;
     }
   }
 ...
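The rewritten device-to-device condition is worth a second look. The old form leaned on && binding tighter than ||, so `A && B || C && D` parsed as `(A && B) || (C && D)`: only pure CUDA-to-CUDA or pure ROCM-to-ROCM copies passed. The parenthesized form `(A || C) && (B || D)` also admits mixed pairs, which is the point once both type codes name the same physical device. A tiny check of the two parses:

#include <cassert>

// A, C: "from" is CUDA / ROCM;  B, D: "to" is CUDA / ROCM.
bool OldForm(bool A, bool B, bool C, bool D) { return (A && B) || (C && D); }
bool NewForm(bool A, bool B, bool C, bool D) { return (A || C) && (B || D); }

int main() {
  // CUDA -> ROCM copy: rejected by the old condition, accepted by the new one.
  assert(!OldForm(true, false, false, true));
  assert(NewForm(true, false, false, true));
}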
tensoradapter/include/tensoradapter.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file tensoradapter.h
 ...
@@ -11,7 +12,7 @@
 #define TENSORADAPTER_H_
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 namespace tensoradapter {
 ...
@@ -43,7 +44,7 @@ void CPURawDelete(void* ptr);
  * @param stream The stream to be allocated on.
  * @return Pointer to the allocated memory.
  */
-void* CUDARawAlloc(size_t nbytes, cudaStream_t stream);
+void* CUDARawAlloc(size_t nbytes, hipStream_t stream);
 /**
  * @brief Free the GPU memory.
 ...
@@ -55,7 +56,7 @@ void CUDARawDelete(void* ptr);
 /**
  * @brief Get the current CUDA stream.
  */
-cudaStream_t CUDACurrentStream();
+hipStream_t CUDACurrentStream();
 /**
  * @brief Let the caching allocator know which streams are using this tensor.
 ...
@@ -64,7 +65,7 @@ cudaStream_t CUDACurrentStream();
  * @param stream The stream that is using this tensor.
  * @param device_id Device of the tensor.
  */
-void RecordStream(void* ptr, cudaStream_t stream, int device_id);
+void RecordStream(void* ptr, hipStream_t stream, int device_id);
 /**
  * @brief Allocate a piece of pinned CPU memory via
 ...
@@ -98,7 +99,7 @@ void CUDARawHostDelete(void** raw_deleter);
  * @param device_id Device of the tensor.
  */
 void CUDARecordHostAlloc(
-    void* data, void* ctx, cudaStream_t stream, int device_id);
+    void* data, void* ctx, hipStream_t stream, int device_id);
 /**
  * @brief Release cached pinned memory allocations via cudaHostFree.
 ...
tensoradapter/pytorch/CMakeLists.txt

@@ -17,7 +17,8 @@ list(GET TORCH_PREFIX_VER 0 TORCH_PREFIX)
 list(GET TORCH_PREFIX_VER 1 TORCH_VER)
 message(STATUS "Configuring for PyTorch ${TORCH_VER}")
-if(USE_CUDA)
+if(USE_HIP)
+  message(STATUS "<<<<<<<<<<<<<< PYTORCH USE_HIP: ${USE_HIP}")
   add_definitions(-DDGL_USE_CUDA)
 endif()
 ...
@@ -30,6 +31,7 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb")
 set(TORCH_TARGET_NAME "tensoradapter_pytorch_${TORCH_VER}")
 file(GLOB TA_TORCH_SRC *.cpp)
 add_library(${TORCH_TARGET_NAME} SHARED "${TA_TORCH_SRC}")
+message(STATUS " <<<<<<<<< pytorch source: ${TA_TORCH_SRC}")
 # use the library name rather than the path
 set(TENSORADAPTER_TORCH_LIBS torch)
 ...
tensoradapter/pytorch/build.sh

@@ -12,19 +12,19 @@ else
 	CPSOURCE=*.so
 fi
-CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -DUSE_CUDA=$USE_CUDA"
+CMAKE_FLAGS="-DUSE_HIP=$USE_HIP"
 if [ $# -eq 0 ]; then
-	$CMAKE_COMMAND $CMAKE_FLAGS ..
+	CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS ..
-	make -j
+	make -j VERBOSE=1
 	cp -v $CPSOURCE $BINDIR/tensoradapter/pytorch
 else
 	for PYTHON_INTERP in $@; do
 		TORCH_VER=$($PYTHON_INTERP -c 'import torch; print(torch.__version__.split("+")[0])')
 		mkdir -p $TORCH_VER
 		cd $TORCH_VER
-		$CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
+		CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
-		make -j
+		make -j VERBOSE=1
 		cp -v $CPSOURCE $BINDIR/tensoradapter/pytorch
 		cd ..
 	done
 ...
tensoradapter/pytorch/torch.cpp

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file torch/torch.cpp
 ...
@@ -7,11 +8,11 @@
 #include <c10/core/CPUAllocator.h>
 #include <tensoradapter_exports.h>
 #ifdef DGL_USE_CUDA
-#include <ATen/cuda/CUDAContext.h>
+#include <ATen/hip/HIPContext.h>
-#include <ATen/cuda/CachingHostAllocator.h>
+#include <ATen/hip/CachingHostAllocator.h>
-#include <c10/cuda/CUDACachingAllocator.h>
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
-#include <c10/cuda/CUDAStream.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA
 namespace tensoradapter {
 ...
@@ -27,29 +28,29 @@ TA_EXPORTS void CPURawDelete(void* ptr) {
 }
 #ifdef DGL_USE_CUDA
-TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) {
+TA_EXPORTS void* CUDARawAlloc(size_t nbytes, hipStream_t stream) {
   at::globalContext().lazyInitCUDA();
-  return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream);
+  return c10::hip::HIPCachingAllocator::raw_alloc_with_stream(nbytes, stream);
 }
 TA_EXPORTS void CUDARawDelete(void* ptr) {
-  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
+  c10::hip::HIPCachingAllocator::raw_delete(ptr);
 }
-TA_EXPORTS cudaStream_t CUDACurrentStream() {
-  return at::cuda::getCurrentCUDAStream();
+TA_EXPORTS hipStream_t CUDACurrentStream() {
+  return at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
 }
-TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) {
+TA_EXPORTS void RecordStream(void* ptr, hipStream_t stream, int device_id) {
   c10::DataPtr data_ptr{
-      ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(),
+      ptr, ptr, c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::get()->raw_deleter(),
       c10::Device(c10::DeviceType::CUDA, device_id)};
-  c10::cuda::CUDACachingAllocator::recordStream(
+  c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(
       data_ptr,
-      // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it
+      // getStreamFromExternalMasqueradingAsCUDA doesn't exist before PyTorch 1.10, just copy it
       // here
-      c10::cuda::CUDAStream(
-          c10::cuda::CUDAStream::UNCHECKED,
+      c10::hip::HIPStreamMasqueradingAsCUDA(
+          c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED,
           c10::Stream(
               c10::Stream::UNSAFE,
               c10::Device(c10::DeviceType::CUDA, device_id),
 ...
@@ -86,11 +87,11 @@ TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) {
 }
 TA_EXPORTS void CUDARecordHostAlloc(
-    void* ptr, void* ctx, cudaStream_t stream, int device_id) {
+    void* ptr, void* ctx, hipStream_t stream, int device_id) {
   at::cuda::CachingHostAllocator_recordEvent(
       ptr, ctx,
-      c10::cuda::CUDAStream(
-          c10::cuda::CUDAStream::UNCHECKED,
+      c10::hip::HIPStreamMasqueradingAsCUDA(
+          c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED,
           c10::Stream(
               c10::Stream::UNSAFE,
               c10::Device(c10::DeviceType::CUDA, device_id),
 ...
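The "MasqueradingAsCUDA" names are PyTorch's own convention for its ROCm build: the sources are hipified, but streams and allocations still advertise c10::DeviceType::CUDA so that torch.device("cuda") keeps working on AMD hardware. A minimal sketch of what the masquerade means at the libtorch level (assumes a ROCm build of libtorch is installed):

#include <torch/torch.h>
#include <cassert>

int main() {
  if (torch::cuda::is_available()) {  // true on ROCm builds as well
    // The tensor lives in HIP memory, but its advertised device type is
    // still kCUDA; that is the masquerade the headers above implement.
    auto t = torch::zeros({4}, torch::device(torch::kCUDA));
    assert(t.device().type() == torch::kCUDA);
  }
}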
tests/cpp/message_queue_test.cc

@@ -9,7 +9,7 @@
 #include <thread>
 #include <vector>
-#include "../src/rpc/network/msg_queue.h"
+#include "../../src/rpc/network/msg_queue.h"
 using dgl::network::Message;
 using dgl::network::MessageQueue;
 ...