OpenDAS / dgl

Commit 833803f3, authored Sep 23, 2024 by sangwzh

update dgl codes to hip

parent 1d28bf8b
Changes: 36 files in total; this page shows 16 changed files with 92 additions and 93 deletions (+92 / -93).
include/dgl/packed_func_ext.h           +4   -3
include/dgl/runtime/c_object_api.h      +2   -1
include/dgl/runtime/c_runtime_api.h     +1   -0
include/dgl/runtime/device_api.h        +4   -3
include/dgl/runtime/module.h            +1   -0
include/dgl/runtime/ndarray.h           +24  -11
include/dgl/runtime/tensordispatch.h    +10  -9
python/dgl/_ffi/runtime_ctypes.py       +2   -2
python/dgl/backend/pytorch/tensor.py    +1   -1
src/array/cuda/spmat_op_impl_csr.hip    +3   -28
src/runtime/cuda/cuda_device_api.cc     +5   -4
tensoradapter/include/tensoradapter.h   +6   -5
tensoradapter/pytorch/CMakeLists.txt    +3   -1
tensoradapter/pytorch/build.sh          +5   -5
tensoradapter/pytorch/torch.cpp         +20  -19
tests/cpp/message_queue_test.cc         +1   -1
include/dgl/packed_func_ext.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file packed_func_ext.h
...
@@ -12,9 +13,9 @@
 #include <string>
 #include <type_traits>
-#include "./runtime/container.h"
-#include "./runtime/object.h"
-#include "./runtime/packed_func.h"
+#include "runtime/container.h"
+#include "runtime/object.h"
+#include "runtime/packed_func.h"

 namespace dgl {
 namespace runtime {
...
include/dgl/runtime/c_object_api.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2019 by Contributors
  * @file dgl/runtime/c_object_api.h
...
@@ -10,7 +11,7 @@
 #ifndef DGL_RUNTIME_C_OBJECT_API_H_
 #define DGL_RUNTIME_C_OBJECT_API_H_
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"

 #ifdef __cplusplus
 extern "C" {
...
include/dgl/runtime/c_runtime_api.h

@@ -57,6 +57,7 @@ typedef enum {
   /** @brief CUDA GPU device */
   kDGLCUDA = 2,
   // add more devices once supported
+  kDGLROCM = 10,
 } DGLDeviceType;

 /**
...
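Note that kDGLROCM is added alongside kDGLCUDA rather than replacing it; the later hunks in this commit fold code 10 back onto code 2 wherever device types are compared or dispatched. A minimal standalone sketch of that normalization pattern (NormalizeDeviceType is a hypothetical helper, not part of the commit):

#include <cstdio>

typedef enum {
  kDGLCPU = 1,
  kDGLCUDA = 2,
  // add more devices once supported
  kDGLROCM = 10,
} DGLDeviceType;

// Hypothetical helper: treat ROCm contexts as CUDA for dispatch purposes,
// mirroring the `device_type == 10 ? 2 : device_type` folding used below.
static DGLDeviceType NormalizeDeviceType(DGLDeviceType t) {
  return t == kDGLROCM ? kDGLCUDA : t;
}

int main() {
  printf("%d\n", NormalizeDeviceType(kDGLROCM));  // prints 2
  return 0;
}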
include/dgl/runtime/device_api.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2016 by Contributors
  * @file dgl/runtime/device_api.h
...
@@ -174,7 +175,7 @@ class DeviceAPI {
       DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst);

   /**
-   * @brief Pin host memory using cudaHostRegister().
+   * @brief Pin host memory using hipHostRegister().
    *
    * @param ptr The host memory pointer to be pinned.
    * @param nbytes The size to be pinned.
...
@@ -183,7 +184,7 @@ class DeviceAPI {
   DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);

   /**
-   * @brief Unpin host memory using cudaHostUnregister().
+   * @brief Unpin host memory using hipHostUnregister().
    *
    * @param ptr The host memory pointer to be unpinned.
    */
...
@@ -203,7 +204,7 @@ class DeviceAPI {
   /**
    * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
-   * @note It avoids unnecessary cudaFreeHost calls and puts the memory
+   * @note It avoids unnecessary hipHostFree calls and puts the memory
    * block into CachingHostAllocator's free list.
    * @param deleter Pointer to the deleter function from PyTorch's
    * CachingHostAllocator.
...
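The comments now name hipHostRegister()/hipHostUnregister(), HIP's counterparts of cudaHostRegister()/cudaHostUnregister(). A minimal sketch of the pin/unpin pattern these APIs implement, assuming a ROCm toolchain (compile with hipcc); an illustration, not the DeviceAPI implementation:

#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  std::vector<char> buf(1 << 20);
  // Page-lock existing host memory so asynchronous copies can use DMA.
  if (hipHostRegister(buf.data(), buf.size(), hipHostRegisterDefault) != hipSuccess) {
    printf("pinning failed\n");
    return 1;
  }
  // ... hipMemcpyAsync() to/from buf.data() would go here ...
  hipHostUnregister(buf.data());  // counterpart of hipHostRegister()
  return 0;
}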
include/dgl/runtime/module.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017 by Contributors
  * @file dgl/runtime/module.h
...
include/dgl/runtime/ndarray.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017-2022 by Contributors
  * @file dgl/runtime/ndarray.h
...
@@ -18,13 +19,20 @@
 #include "shared_mem.h"

 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>

-#define BF16_ENABLED (defined(CUDART_VERSION) && CUDART_VERSION >= 11000)
+// #define BF16_ENABLED (defined(DTKRT_VERSION) && DTKRT_VERSION >= 11000)
+#if defined(DTKRT_VERSION)
+#define DTKRT_VERSION_CHECK (DTKRT_VERSION >= 11000)
+#else
+#define DTKRT_VERSION_CHECK 0
+#endif
+#define BF16_ENABLED DTKRT_VERSION_CHECK

-#include <cuda_fp16.h>
+#include <hip/hip_fp16.h>
 #if BF16_ENABLED
-#include <cuda_bf16.h>
+#include <hip/hip_bf16.h>
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
...
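The BF16_ENABLED rework is more than a rename. The old one-liner expanded defined(CUDART_VERSION) at each use site, and a `defined` produced by macro expansion is undefined behavior; the replacement resolves the check once with #if/#else into a plain 0/1 constant (DTKRT_VERSION is the DTK runtime version macro this port assumes). A minimal self-contained sketch of the pattern:

#include <cstdio>

// Resolve the version check at definition time instead of hiding
// `defined(...)` inside a macro that is expanded later.
#if defined(DTKRT_VERSION)
#define DTKRT_VERSION_CHECK (DTKRT_VERSION >= 11000)
#else
#define DTKRT_VERSION_CHECK 0
#endif
#define BF16_ENABLED DTKRT_VERSION_CHECK

int main() {
#if BF16_ENABLED
  printf("bf16 enabled\n");
#else
  printf("bf16 disabled\n");
#endif
  return 0;
}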
@@ -60,7 +68,7 @@ GEN_DGLDATATYPETRAITS_FOR(uint64_t, kDGLInt, 64);
 #ifdef DGL_USE_CUDA
 GEN_DGLDATATYPETRAITS_FOR(__half, kDGLFloat, 16);
 #if BF16_ENABLED
-GEN_DGLDATATYPETRAITS_FOR(__nv_bfloat16, kDGLBfloat, 16);
+GEN_DGLDATATYPETRAITS_FOR(__hip_bfloat16, kDGLBfloat, 16);
 #endif  // BF16_ENABLED
 #endif  // DGL_USE_CUDA
 GEN_DGLDATATYPETRAITS_FOR(float, kDGLFloat, 32);
...
@@ -185,7 +193,7 @@ class NDArray {
  * CachingHostAllocator for allocating pinned memory and copying data
  * from the current NDAarray. As a result, PyTorch is responsible for
  * managing the lifecycle of the returned NDArray, including deciding
- * when to flush the data for reuse or call cudaFreeHost. The current
+ * when to flush the data for reuse or call hipHostFree. The current
  * context must be kDGLCPU, otherwise, an error will be thrown.
  */
 inline NDArray PinMemory();
...
@@ -194,7 +202,7 @@ class NDArray {
  * @brief In-place method to pin the current array by calling PinContainer
  * on the underlying NDArray:Container.
  * @note This is an in-place method that flags the memory as page-locked by
- * utilizing cudaHostRegister at the underlying level to pin the current
+ * utilizing hipHostRegister at the underlying level to pin the current
  * instance of NDArray. The current context must be kDGLCPU, otherwise,
  * an error will be thrown.
  */
...
@@ -523,7 +531,7 @@ inline void NDArray::CopyFrom(const NDArray& other) {
   // Pinned by PyTorch
   if (cpu_data->pinned_by_pytorch_) {
     // To ensure correct behavior, the event must be recorded after
-    // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
+    // hipMemcpyAsync as long as the memory is pinned by PyTorch.
     void* pytorch_ctx = cpu_data->pytorch_ctx_;
     RecordedCopyFromTo(
         &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx);
...
@@ -549,7 +557,7 @@ inline void NDArray::CopyTo(const NDArray& other) const {
   // pinned by PyTorch
   if (cpu_data->pinned_by_pytorch_) {
     // To ensure correct behavior, the event must be recorded after
-    // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
+    // hipMemcpyAsync as long as the memory is pinned by PyTorch.
     void* pytorch_ctx = cpu_data->pytorch_ctx_;
     RecordedCopyFromTo(
         &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx);
...
@@ -716,6 +724,8 @@ inline const char* DeviceTypeCode2Str(DGLDeviceType device_type) {
       return "cpu";
     case kDGLCUDA:
       return "cuda";
+    case kDGLROCM:
+      return "cuda";
     default:
       LOG(FATAL) << "Unsupported device type code="
                  << static_cast<int>(device_type);
...
@@ -871,8 +881,11 @@ inline std::ostream& operator<<(std::ostream& os, DGLDataType t) {
 /** @brief Check whether two device contexts are the same.*/
 inline bool operator==(const DGLContext& ctx1, const DGLContext& ctx2) {
-  return ctx1.device_type == ctx2.device_type && ctx1.device_id == ctx2.device_id;
+  // printf("**************** debug compare DGLContext, %d, %d\n", ctx1.device_type, ctx2.device_type);
+  int ct1 = ctx1.device_type == 10 ? 2 : ctx1.device_type;
+  int ct2 = ctx2.device_type == 10 ? 2 : ctx2.device_type;
+  return ct1 == ct2 && int(ctx1.device_id) == int(ctx2.device_id);
 }

 /** @brief Check whether two device contexts are different.*/
...
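The relaxed operator== is what lets an array whose context carries device type 10 (ROCm) compare equal to one reporting type 2 (CUDA), so the two codes interoperate. A minimal standalone sketch of the comparison (a stand-in struct, not the real DGLContext):

#include <cstdio>

struct Ctx { int device_type; int device_id; };  // stand-in for DGLContext

static bool SameContext(const Ctx& a, const Ctx& b) {
  int ta = a.device_type == 10 ? 2 : a.device_type;  // fold ROCm onto CUDA
  int tb = b.device_type == 10 ? 2 : b.device_type;
  return ta == tb && a.device_id == b.device_id;
}

int main() {
  Ctx rocm{10, 0}, cuda{2, 0};
  printf("%s\n", SameContext(rocm, cuda) ? "equal" : "different");  // equal
  return 0;
}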
include/dgl/runtime/tensordispatch.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file array/tensordispatch.h
...
@@ -34,7 +35,7 @@
 #include <windows.h>
 #endif  // WIN32
 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA

 #include "ndarray.h"
...
@@ -97,14 +98,14 @@ class TensorDispatcher {
    * Used in CUDADeviceAPI::AllocWorkspace().
    *
    * @note THCCachingAllocator specify the device to allocate on
-   * via cudaGetDevice(). Make sure to call cudaSetDevice()
+   * via hipGetDevice(). Make sure to call hipSetDevice()
    * before invoking this function.
    *
    * @param nbytes The size to be allocated.
    * @param stream The stream to be allocated on.
    * @return Pointer to the allocated memory.
    */
-  inline void* CUDAAllocWorkspace(size_t nbytes, cudaStream_t stream) {
+  inline void* CUDAAllocWorkspace(size_t nbytes, hipStream_t stream) {
     auto entry = entrypoints_[Op::kCUDARawAlloc];
     return FUNCCAST(tensoradapter::CUDARawAlloc, entry)(nbytes, stream);
   }
...
@@ -122,15 +123,15 @@ class TensorDispatcher {
   /**
    * @brief Find the current PyTorch CUDA stream
-   * Used in runtime::getCurrentCUDAStream().
+   * Used in runtime::getCurrentHIPStreamMasqueradingAsCUDA().
    *
    * @note PyTorch pre-allocates/sets the current CUDA stream
-   * on current device via cudaGetDevice(). Make sure to call cudaSetDevice()
+   * on current device via hipGetDevice(). Make sure to call hipSetDevice()
    * before invoking this function.
    *
-   * @return cudaStream_t stream handle
+   * @return hipStream_t stream handle
    */
-  inline cudaStream_t CUDAGetCurrentStream() {
+  inline hipStream_t CUDAGetCurrentStream() {
     auto entry = entrypoints_[Op::kCUDACurrentStream];
     return FUNCCAST(tensoradapter::CUDACurrentStream, entry)();
   }
...
@@ -183,7 +184,7 @@ class TensorDispatcher {
    * @param device_id Device of the tensor.
    */
   inline void CUDARecordHostAlloc(
-      void* data, void* ctx, cudaStream_t stream, int device_id) {
+      void* data, void* ctx, hipStream_t stream, int device_id) {
     auto entry = entrypoints_[Op::kCUDARecordHostAlloc];
     auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry);
     recorded_alloc(data, ctx, stream, device_id);
...
@@ -212,7 +213,7 @@ class TensorDispatcher {
 #ifdef DGL_USE_CUDA
     auto entry = entrypoints_[Op::kRecordStream];
     FUNCCAST(tensoradapter::RecordStream, entry)
-        (ptr, static_cast<cudaStream_t>(stream), device_id);
+        (ptr, static_cast<hipStream_t>(stream), device_id);
 #endif
   }
...
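For context on these hunks: TensorDispatcher does not link against the tensoradapter library; it keeps a table of raw entry points (entrypoints_) resolved at load time and casts each back to a typed function with FUNCCAST before calling. A minimal sketch of that pattern with simplified stand-ins (the real FUNCCAST macro and entry resolution differ):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for DGL's FUNCCAST: recover the typed signature
// from an untyped entry-point pointer.
#define FUNCCAST(func, entry) reinterpret_cast<decltype(&func)>(entry)

void* RawAlloc(std::size_t nbytes) { return std::malloc(nbytes); }  // stand-in entry

int main() {
  // In DGL this table is filled from the tensoradapter shared library.
  void* entrypoints[1] = {reinterpret_cast<void*>(&RawAlloc)};
  void* p = FUNCCAST(RawAlloc, entrypoints[0])(64);
  printf("%p\n", p);
  std::free(p);
  return 0;
}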
python/dgl/_ffi/runtime_ctypes.py

@@ -123,7 +123,7 @@ class DGLContext(ctypes.Structure):
         7: "vulkan",
         8: "metal",
         9: "vpi",
-        10: "rocm",
+        10: "gpu",
         11: "opengl",
         12: "ext_dev",
     }
...
@@ -142,7 +142,7 @@ class DGLContext(ctypes.Structure):
         "vulkan": 7,
         "metal": 8,
         "vpi": 9,
-        "rocm": 10,
+        "rocm": 2,
         "opengl": 11,
         "ext_dev": 12,
     }
...
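These two hunks are deliberately asymmetric: the string-to-code table now sends "rocm" to 2 so Python-side contexts land on the existing CUDA path, while the code-to-string table keeps a readable label for anything still reporting code 10. A small sketch of the resulting lookup behavior (plain std::map stand-ins, not DGL code):

#include <cstdio>
#include <map>
#include <string>

int main() {
  // string -> code: "rocm" is folded onto the CUDA code.
  std::map<std::string, int> str2code{{"cpu", 1}, {"cuda", 2}, {"rocm", 2}};
  // code -> string: the ROCm code keeps a readable label.
  std::map<int, std::string> code2str{{1, "cpu"}, {2, "cuda"}, {10, "gpu"}};
  printf("rocm -> %d\n", str2code["rocm"]);    // 2: reuses the CUDA path
  printf("10 -> %s\n", code2str[10].c_str());  // "gpu"
  return 0;
}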
python/dgl/backend/pytorch/tensor.py

@@ -116,7 +116,7 @@ def to_backend_ctx(dglctx):
     dev_type = dglctx.device_type
     if dev_type == 1:
         return th.device("cpu")
-    elif dev_type == 2:
+    elif dev_type == 2 or dev_type == 10:
         return th.device("cuda", dglctx.device_id)
     else:
         raise ValueError("Unsupported DGL device context:", dglctx)
...
src/array/cuda/spmat_op_impl_csr.hip

@@ -549,7 +549,6 @@ __global__ void _SegmentMaskColKernel(
     }
   }
   IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count);
-  printf("out_row = %d , reduce_count = %d \n", out_row, reduce_count);
   if (laneid == 0) {
     count[out_row] = reduce_count;
   }
...
@@ -567,9 +566,6 @@ CSRMatrix CSRSliceMatrix(
   const int64_t new_nrows = rows->shape[0];
   const int64_t new_ncols = cols->shape[0];
-  std::cout << "new_nrows : " << new_nrows << std::endl;
-  std::cout << "new_ncols : " << new_ncols << std::endl;
   if (new_nrows == 0 || new_ncols == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
...
@@ -578,7 +574,6 @@ CSRMatrix CSRSliceMatrix(
   // First slice rows
   csr = CSRSliceRows(csr, rows);
-  std::cout << "csr.indices->shape[0] : " << csr.indices->shape[0] << std::endl;
   if (csr.indices->shape[0] == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
...
@@ -588,11 +583,9 @@ CSRMatrix CSRSliceMatrix(
   IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
   // A count for how many masked values per row.
   IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
-  std::cout << "1 IdArray count : " << count << std::endl;
   CUDA_CALL(
       hipMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
-  std::cout << "2 IdArray count : " << count << std::endl;
   // Generate a NodeQueryHashmap buffer. The key of the hashmap is col.
   // For performance, the load factor of the hashmap is in (0.25, 0.5);
   // Because num_cols is usually less than 1 Million (on GPU), the
...
@@ -618,45 +611,29 @@ CSRMatrix CSRSliceMatrix(
   // Execute SegmentMaskColKernel
   const int64_t num_rows = csr.num_rows;
-  constexpr int WARP_SIZE = 32;
+  constexpr int WARP_SIZE = 64;
   // With a simple fine-tuning, TILE_SIZE=16 gives a good performance.
-  constexpr int TILE_SIZE = 16;
+  constexpr int TILE_SIZE = 32;
   constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE;
   IdType nb =
       dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE);
   const dim3 nthrs(WARP_SIZE, BLOCK_WARPS);
   const dim3 nblks(nb);
-  std::cout << "nthrs.x : " << nthrs.x << " nthrs.y : " << nthrs.y << " nthrs.z : " << nthrs.z << std::endl;
-  std::cout << "nblks.x : " << nblks.x << " nblks.y : " << nblks.y << " nblks.z : " << nblks.z << std::endl;
-  std::cout << "WARP_SIZE : " << WARP_SIZE << " BLOCK_WARPS : " << BLOCK_WARPS << "TILE_SIZE : " << std::endl;
-  std::cout << "indptr_data : " << indptr_data << std::endl;
-  std::cout << "indices_data : " << indices_data << std::endl;
-  std::cout << "num_rows : " << num_rows << std::endl;
-  std::cout << "buffer_size : " << buffer_size << std::endl;
-  std::cout << "mask : " << mask << std::endl;
-  std::cout << "count : " << count << std::endl;
-  std::cout << "hashmap_buffer : " << hashmap_buffer << std::endl;
   CUDA_KERNEL_CALL(
       (_SegmentMaskColKernel<IdType, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>), nblks,
       nthrs, 0, stream, indptr_data, indices_data, num_rows,
       hashmap_buffer.Ptr<IdType>(), buffer_size, mask.Ptr<IdType>(),
       count.Ptr<IdType>());
-  std::cout << "3 IdArray count : " << count << std::endl;
   IdArray idx = AsNumBits(NonZero(mask), nbits);
-  std::cout << "idx->shape[0] : " << idx->shape[0] << std::endl;
   if (idx->shape[0] == 0)
     return CSRMatrix(
         new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx),
         NullArray(dtype, ctx), NullArray(dtype, ctx));
   // Indptr needs to be adjusted according to the new nnz per row.
-  std::cout << " count : " << count << std::endl;
   IdArray ret_indptr = CumSum(count, true);
-  std::cout << " IdArray ret_indptr : " << ret_indptr << std::endl;
   // Column & data can be obtained by index select.
   IdArray ret_col = IndexSelect(csr.indices, idx);
...
@@ -667,8 +644,6 @@ CSRMatrix CSRSliceMatrix(
   Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash);
   ret_col = IndexSelect(col_hash, ret_col);
-  // std::cout << "new_nrows : " << new_nrows << " new_ncols : " << new_ncols << " ret_indptr : " << ret_indptr << " ret_col : " << ret_col << " ret_data : " << std::endl;
   return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data);
 }
...
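The WARP_SIZE change from 32 to 64 reflects hardware rather than tuning: AMD GCN/CDNA wavefronts are 64 lanes wide, and warp-level primitives such as WarpReduce must be instantiated with the real width (TILE_SIZE is doubled alongside it; note the stale "TILE_SIZE=16" comment survives the change). A minimal sketch that queries the width at runtime, assuming a ROCm toolchain:

#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  hipDeviceProp_t props;
  if (hipGetDeviceProperties(&props, 0) != hipSuccess) return 1;
  // 64 on AMD GCN/CDNA GPUs, 32 on NVIDIA.
  printf("warpSize = %d\n", props.warpSize);
  return 0;
}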
src/runtime/cuda/cuda_device_api.cc

@@ -74,7 +74,7 @@ class CUDADeviceAPI final : public DeviceAPI {
         hipDeviceProp_t props;
         CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
         *rv = std::string(props.name);
-        // printf("******* debug: device.name:%s\n ", std::string(props.name).c_str());
+        printf("******* debug: device.name:%s\n ", std::string(props.name).c_str());
         return;
       }
       case kMaxClockRate: {
...
@@ -136,7 +136,8 @@ class CUDADeviceAPI final : public DeviceAPI {
     hipStream_t cu_stream = static_cast<hipStream_t>(stream);
     from = static_cast<const char*>(from) + from_offset;
     to = static_cast<char*>(to) + to_offset;
-    if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA ||
-        ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    // if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM && ctx_to.device_type == kDGLROCM) {
+    if ((ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) &&
+        (ctx_to.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM)) {
       CUDA_CALL(hipSetDevice(ctx_from.device_id));
       if (ctx_from.device_id == ctx_to.device_id) {
         GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
...
@@ -145,7 +146,7 @@ class CUDADeviceAPI final : public DeviceAPI {
             to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
       }
     } else if (
-        (ctx_from.device_type == kDGLCUDA || ctx_to.device_type == kDGLROCM) &&
+        (ctx_from.device_type == kDGLCUDA || ctx_from.device_type == kDGLROCM) &&
         ctx_to.device_type == kDGLCPU) {
       CUDA_CALL(hipSetDevice(ctx_from.device_id));
       GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
     } else if (
...
@@ -153,7 +154,7 @@ class CUDADeviceAPI final : public DeviceAPI {
       CUDA_CALL(hipSetDevice(ctx_to.device_id));
       GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
     } else {
-      LOG(FATAL) << "expect copy from/to GPU or between GPU";
+      LOG(FATAL) << "expect copy from/to GPU or between GPU. ctx_from.device_type: " << ctx_from.device_type << ", ctx_to.device_type: " << ctx_to.device_type;
     }
   }
...
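The rewritten condition fixes two real bugs rather than style: the old form relied on && binding tighter than ||, so it only accepted pure CUDA-to-CUDA or ROCM-to-ROCM pairs, and the old GPU-to-CPU branch tested ctx_to where it meant ctx_from. A minimal sketch of the first fix:

#include <cstdio>

enum { kDGLCPU = 1, kDGLCUDA = 2, kDGLROCM = 10 };

int main() {
  int from = kDGLCUDA, to = kDGLROCM;  // a mixed pair
  // Old shape: (A && B) || (C && D) — && binds tighter than ||.
  bool old_cond = (from == kDGLCUDA && to == kDGLCUDA) ||
                  (from == kDGLROCM && to == kDGLROCM);
  // New shape: either endpoint may independently be CUDA or ROCM.
  bool new_cond = (from == kDGLCUDA || from == kDGLROCM) &&
                  (to == kDGLCUDA || to == kDGLROCM);
  printf("old: %d, new: %d\n", old_cond, new_cond);  // old: 0, new: 1
  return 0;
}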
tensoradapter/include/tensoradapter.h

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file tensoradapter.h
...
@@ -11,7 +12,7 @@
 #define TENSORADAPTER_H_

 #ifdef DGL_USE_CUDA
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA

 namespace tensoradapter {
...
@@ -43,7 +44,7 @@ void CPURawDelete(void* ptr);
  * @param stream The stream to be allocated on.
  * @return Pointer to the allocated memory.
  */
-void* CUDARawAlloc(size_t nbytes, cudaStream_t stream);
+void* CUDARawAlloc(size_t nbytes, hipStream_t stream);

 /**
  * @brief Free the GPU memory.
...
@@ -55,7 +56,7 @@ void CUDARawDelete(void* ptr);
 /**
  * @brief Get the current CUDA stream.
  */
-cudaStream_t CUDACurrentStream();
+hipStream_t CUDACurrentStream();

 /**
  * @brief Let the caching allocator know which streams are using this tensor.
...
@@ -64,7 +65,7 @@ cudaStream_t CUDACurrentStream();
  * @param stream The stream that is using this tensor.
  * @param device_id Device of the tensor.
  */
-void RecordStream(void* ptr, cudaStream_t stream, int device_id);
+void RecordStream(void* ptr, hipStream_t stream, int device_id);

 /**
  * @brief Allocate a piece of pinned CPU memory via
...
@@ -98,7 +99,7 @@ void CUDARawHostDelete(void** raw_deleter);
  * @param device_id Device of the tensor.
  */
 void CUDARecordHostAlloc(
-    void* data, void* ctx, cudaStream_t stream, int device_id);
+    void* data, void* ctx, hipStream_t stream, int device_id);

 /**
  * @brief Release cached pinned memory allocations via cudaHostFree.
...
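The declarations keep their CUDA-flavored names (CUDARawAlloc, CUDACurrentStream, RecordStream, ...) even though every stream parameter is now hipStream_t: DGL resolves these symbols from the tensoradapter shared library by name, so renaming them would break the lookup. A minimal sketch of an alternative that keeps one set of declarations for both toolchains (DGLStream_t and USE_ROCM are hypothetical, not part of this commit):

#include <cstddef>

// Hypothetical portability alias; the commit instead hipifies the
// signatures in place and keeps the exported names unchanged.
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
typedef hipStream_t DGLStream_t;
#else
#include <cuda_runtime.h>
typedef cudaStream_t DGLStream_t;
#endif

// Name kept CUDA-flavored so symbol lookup stays stable.
void* CUDARawAlloc(std::size_t nbytes, DGLStream_t stream);

int main() { return 0; }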
tensoradapter/pytorch/CMakeLists.txt

@@ -17,7 +17,8 @@ list(GET TORCH_PREFIX_VER 0 TORCH_PREFIX)
 list(GET TORCH_PREFIX_VER 1 TORCH_VER)
 message(STATUS "Configuring for PyTorch ${TORCH_VER}")

-if(USE_CUDA)
+if(USE_HIP)
+  message(STATUS "<<<<<<<<<<<<<< PYTORCH USE_HIP: ${USE_HIP}")
   add_definitions(-DDGL_USE_CUDA)
 endif()
...
@@ -30,6 +31,7 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb")
 set(TORCH_TARGET_NAME "tensoradapter_pytorch_${TORCH_VER}")
 file(GLOB TA_TORCH_SRC *.cpp)
 add_library(${TORCH_TARGET_NAME} SHARED "${TA_TORCH_SRC}")
+message(STATUS " <<<<<<<<< pytorch source: ${TA_TORCH_SRC}")
 # use the library name rather than the path
 set(TENSORADAPTER_TORCH_LIBS torch)
...
tensoradapter/pytorch/build.sh

@@ -12,19 +12,19 @@ else
   CPSOURCE=*.so
 fi

-CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -DUSE_CUDA=$USE_CUDA"
+CMAKE_FLAGS="-DUSE_HIP=$USE_HIP"

 if [ $# -eq 0 ]; then
-  $CMAKE_COMMAND $CMAKE_FLAGS ..
-  make -j
+  CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS ..
+  make -j VERBOSE=1
   cp -v $CPSOURCE $BINDIR/tensoradapter/pytorch
 else
   for PYTHON_INTERP in $@; do
     TORCH_VER=$($PYTHON_INTERP -c 'import torch; print(torch.__version__.split("+")[0])')
     mkdir -p $TORCH_VER
     cd $TORCH_VER
-    $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
-    make -j
+    CC=hipcc CXX=hipcc $CMAKE_COMMAND $CMAKE_FLAGS -DPYTHON_INTERP=$PYTHON_INTERP ../..
+    make -j VERBOSE=1
     cp -v $CPSOURCE $BINDIR/tensoradapter/pytorch
     cd ..
   done
...
tensoradapter/pytorch/torch.cpp

+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2020-2022 by Contributors
  * @file torch/torch.cpp
...
@@ -7,11 +8,11 @@
 #include <c10/core/CPUAllocator.h>
 #include <tensoradapter_exports.h>
 #ifdef DGL_USE_CUDA
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CachingHostAllocator.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAStream.h>
-#include <cuda_runtime.h>
+#include <ATen/hip/HIPContext.h>
+#include <ATen/hip/CachingHostAllocator.h>
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <hip/hip_runtime.h>
 #endif  // DGL_USE_CUDA

 namespace tensoradapter {
...
@@ -27,29 +28,29 @@ TA_EXPORTS void CPURawDelete(void* ptr) {
 }

 #ifdef DGL_USE_CUDA
-TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) {
+TA_EXPORTS void* CUDARawAlloc(size_t nbytes, hipStream_t stream) {
   at::globalContext().lazyInitCUDA();
-  return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream);
+  return c10::hip::HIPCachingAllocator::raw_alloc_with_stream(nbytes, stream);
 }

 TA_EXPORTS void CUDARawDelete(void* ptr) {
-  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
+  c10::hip::HIPCachingAllocator::raw_delete(ptr);
 }

-TA_EXPORTS cudaStream_t CUDACurrentStream() {
-  return at::cuda::getCurrentCUDAStream();
+TA_EXPORTS hipStream_t CUDACurrentStream() {
+  return at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
 }

-TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) {
+TA_EXPORTS void RecordStream(void* ptr, hipStream_t stream, int device_id) {
   c10::DataPtr data_ptr{
-      ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(),
+      ptr, ptr, c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::get()->raw_deleter(),
       c10::Device(c10::DeviceType::CUDA, device_id)};
-  c10::cuda::CUDACachingAllocator::recordStream(
+  c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(
       data_ptr,
-      // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it
+      // getStreamFromExternalMasqueradingAsCUDA doesn't exist before PyTorch 1.10, just copy it
       // here
-      c10::cuda::CUDAStream(
-          c10::cuda::CUDAStream::UNCHECKED,
+      c10::hip::HIPStreamMasqueradingAsCUDA(
+          c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED,
           c10::Stream(
               c10::Stream::UNSAFE,
               c10::Device(c10::DeviceType::CUDA, device_id),
...
@@ -86,11 +87,11 @@ TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) {
 }

 TA_EXPORTS void CUDARecordHostAlloc(
-    void* ptr, void* ctx, cudaStream_t stream, int device_id) {
+    void* ptr, void* ctx, hipStream_t stream, int device_id) {
   at::cuda::CachingHostAllocator_recordEvent(
       ptr, ctx,
-      c10::cuda::CUDAStream(
-          c10::cuda::CUDAStream::UNCHECKED,
+      c10::hip::HIPStreamMasqueradingAsCUDA(
+          c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED,
           c10::Stream(
               c10::Stream::UNSAFE,
               c10::Device(c10::DeviceType::CUDA, device_id),
...
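The "MasqueradingAsCUDA" suffixes are PyTorch's own ROCm convention: a ROCm build keeps CUDA as the public device type, so HIP streams and the HIP caching allocator are wrapped in types that keep reporting DeviceType::CUDA, which is also why the c10::Device constructed above still names DeviceType::CUDA. A minimal stand-in sketch of the idea (not PyTorch's actual classes):

#include <cstdio>

enum class DeviceType { CPU, CUDA };  // ROCm builds reuse CUDA as the public type

// Stand-in for c10::hip::HIPStreamMasqueradingAsCUDA: HIP-backed, but it
// keeps reporting the CUDA device type so CUDA-oriented checks still pass.
struct HipStreamMasqueradingAsCuda {
  void* hip_stream;  // would be a hipStream_t underneath
  DeviceType device_type() const { return DeviceType::CUDA; }
};

int main() {
  HipStreamMasqueradingAsCuda s{nullptr};
  printf("reports cuda: %d\n", s.device_type() == DeviceType::CUDA);
  return 0;
}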
tests/cpp/message_queue_test.cc

@@ -9,7 +9,7 @@
 #include <thread>
 #include <vector>
-#include "../src/rpc/network/msg_queue.h"
+#include "../../src/rpc/network/msg_queue.h"

 using dgl::network::Message;
 using dgl::network::MessageQueue;
...