OpenDAS / dgl · Commits · 62b5f50a

Unverified commit 62b5f50a, authored Apr 10, 2023 by Chang Liu, committed by GitHub on Apr 10, 2023.

[Feature] Import PyTorch's `pin_memory()` method for DGL graph structure (#5366)

Parent: 27b008b9
Showing 18 changed files with 719 additions and 56 deletions (+719, -56).
include/dgl/aten/coo.h                          +17   -0
include/dgl/aten/csr.h                          +16   -0
include/dgl/runtime/device_api.h                +57   -6
include/dgl/runtime/ndarray.h                   +113  -11
include/dgl/runtime/tensordispatch.h            +75   -4
python/dgl/heterograph_index.py                 +23   -3
src/graph/heterograph.cc                        +25   -0
src/graph/heterograph.h                         +10   -0
src/graph/heterograph_capi.cc                   +7    -0
src/graph/unit_graph.cc                         +49   -0
src/graph/unit_graph.h                          +9    -0
src/runtime/c_runtime_api.cc                    +10   -0
src/runtime/cpu_device_api.cc                   +21   -8
src/runtime/cuda/cuda_device_api.cc             +66   -15
src/runtime/ndarray.cc                          +53   -9
tensoradapter/include/tensoradapter.h           +40   -0
tensoradapter/pytorch/torch.cpp                 +45   -0
tests/python/common/test_heterograph-index.py   +83   -0
include/dgl/aten/coo.h

@@ -64,6 +64,9 @@ struct COOMatrix {
        data(darr),
        row_sorted(rsorted),
        col_sorted(csorted) {
    is_pinned = (aten::IsNullArray(row) || row.IsPinned()) &&
                (aten::IsNullArray(col) || col.IsPinned()) &&
                (aten::IsNullArray(data) || data.IsPinned());
    CheckValidity();
  }

@@ -133,6 +136,20 @@ struct COOMatrix {
        col_sorted);
  }

  /** @brief Return a copy of this matrix in pinned (page-locked) memory. */
  inline COOMatrix PinMemory() {
    if (is_pinned) return *this;
    auto new_coo = COOMatrix(
        num_rows, num_cols, row.PinMemory(), col.PinMemory(),
        aten::IsNullArray(data) ? data : data.PinMemory(), row_sorted,
        col_sorted);
    CHECK(new_coo.is_pinned)
        << "An internal DGL error has occured while trying to pin a COO "
           "matrix. Please file a bug at 'https://github.com/dmlc/dgl/issues' "
           "with the above stacktrace.";
    return new_coo;
  }

  /**
   * @brief Pin the row, col and data (if not Null) of the matrix.
   * @note This is an in-place method. Behavior depends on the current context,
   ...
include/dgl/aten/csr.h

@@ -60,6 +60,9 @@ struct CSRMatrix {
        indices(iarr),
        data(darr),
        sorted(sorted_flag) {
    is_pinned = (aten::IsNullArray(indptr) || indptr.IsPinned()) &&
                (aten::IsNullArray(indices) || indices.IsPinned()) &&
                (aten::IsNullArray(data) || data.IsPinned());
    CheckValidity();
  }

@@ -126,6 +129,19 @@ struct CSRMatrix {
        aten::IsNullArray(data) ? data : data.CopyTo(ctx), sorted);
  }

  /** @brief Return a copy of this matrix in pinned (page-locked) memory. */
  inline CSRMatrix PinMemory() {
    if (is_pinned) return *this;
    auto new_csr = CSRMatrix(
        num_rows, num_cols, indptr.PinMemory(), indices.PinMemory(),
        aten::IsNullArray(data) ? data : data.PinMemory(), sorted);
    CHECK(new_csr.is_pinned)
        << "An internal DGL error has occured while trying to pin a CSR "
           "matrix. Please file a bug at 'https://github.com/dmlc/dgl/issues' "
           "with the above stacktrace.";
    return new_csr;
  }

  /**
   * @brief Pin the indptr, indices and data (if not Null) of the matrix.
   * @note This is an in-place method. Behavior depends on the current context,
   ...
include/dgl/runtime/device_api.h

@@ -50,11 +50,13 @@ class DeviceAPI {
   * @brief Check whether the device is available.
   */
  virtual bool IsAvailable() { return true; }

  /**
   * @brief Set the environment device id to ctx
   * @param ctx The context to be set.
   */
  virtual void SetDevice(DGLContext ctx) = 0;

  /**
   * @brief Get attribute of specified device.
   * @param ctx The device context

@@ -64,6 +66,7 @@ class DeviceAPI {
   */
  virtual void GetAttr(
      DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) = 0;

  /**
   * @brief Allocate a data space on device.
   * @param ctx The device context to perform operation.

@@ -76,28 +79,51 @@ class DeviceAPI {
  virtual void* AllocDataSpace(
      DGLContext ctx, size_t nbytes, size_t alignment,
      DGLDataType type_hint) = 0;

  /**
   * @brief Free a data space on device.
   * @param ctx The device context to perform operation.
   * @param ptr The data space.
   */
  virtual void FreeDataSpace(DGLContext ctx, void* ptr) = 0;

  /**
   * @brief copy data from one place to another
   * @param from The source array.
   * @param from_offset The byte offeset in the from.
   * @param to The target array.
   * @param to_offset The byte offset in the to.
   * @param num_bytes The size of the memory in bytes.
   * @param ctx_from The source context.
   * @param ctx_to The target context.
   * @param type_hint The type of elements, only needed by certain backends,
   *        can be useful for cross device endian converison.
   */
  virtual void CopyDataFromTo(
      const void* from, size_t from_offset, void* to, size_t to_offset,
      size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to,
      DGLDataType type_hint) = 0;

  /**
   * @brief copy data between device and CPU while recording the event.
   * @param from The source array.
   * @param from_offset The byte offeset in the from.
   * @param to The target array.
   * @param to_offset The byte offset in the to.
   * @param num_bytes The size of the memory in bytes.
   * @param ctx_from The source context.
   * @param ctx_to The target context.
   * @param type_hint The type of elements, only needed by certain backends,
   *        can be useful for cross device endian converison.
   * @param pytorch_ctx The context pointer from PyTorch's CachingHostAllocator.
   * @note This function only works when PyTorch CachingHostAllocator is
   *       available.
   */
  virtual void RecordedCopyDataFromTo(
      void* from, size_t from_offset, void* to, size_t to_offset,
      size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to,
      DGLDataType type_hint, void* pytorch_ctx) = 0;

  /**
   * @brief Create a new stream of execution.
   *

@@ -119,16 +145,19 @@ class DeviceAPI {
   * @param stream The stream to be sync.
   */
  virtual void StreamSync(DGLContext ctx, DGLStreamHandle stream) = 0;

  /**
   * @brief Set the stream
   * @param ctx The context to set stream.
   * @param stream The stream to be set.
   */
  virtual void SetStream(DGLContext ctx, DGLStreamHandle stream) {}

  /**
   * @brief Get the stream
   */
  virtual DGLStreamHandle GetStream() const { return nullptr; }

  /**
   * @brief Synchronize 2 streams of execution.
   *

@@ -160,6 +189,27 @@ class DeviceAPI {
   */
  DGL_DLL virtual void UnpinData(void* ptr);

  /**
   * @brief Allocate the pinned memory using PyTorch CachingHostAllocator.
   *
   * @param nbytes The size to be pinned.
   * @param ctx Pointer to the context pointer from PyTorch's
   *        CachingHostAllocator.
   * @param deleter Pointer to the deleter function from PyTorch's
   *        CachingHostAllocator.
   */
  DGL_DLL virtual void* AllocPinnedDataSpace(
      size_t nbytes, void** ctx, void** deleter);

  /**
   * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
   * @note It avoids unnecessary cudaFreeHost calls and puts the memory
   *       block into CachingHostAllocator's free list.
   * @param deleter Pointer to the deleter function from PyTorch's
   *        CachingHostAllocator.
   */
  DGL_DLL virtual void FreePinnedDataSpace(void** deleter);

  /**
   * @brief Check whether the memory is in pinned memory.
   */

@@ -184,6 +234,7 @@ class DeviceAPI {
   */
  DGL_DLL virtual void* AllocWorkspace(
      DGLContext ctx, size_t nbytes, DGLDataType type_hint = {});

  /**
   * @brief Free temporal workspace in backend execution.
   *

@@ -201,7 +252,7 @@ class DeviceAPI {
  DGL_DLL static DeviceAPI* Get(DGLContext ctx, bool allow_missing = false);

  /**
   * @brief Get device API based on device type.
   * @param dev_type The device type
   * @param allow_missing Whether allow missing
   * @return The corresponding device API.
   ...
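Taken together, AllocPinnedDataSpace, RecordedCopyDataFromTo, and FreePinnedDataSpace form a small protocol around PyTorch's CachingHostAllocator: borrow a pinned block, copy through it while recording the consuming stream, then hand the block back to the allocator's free list instead of calling cudaFreeHost. The sketch below walks that sequence at the DeviceAPI level; it is illustrative only, the caller scaffolding and variable names are not part of the diff, and error handling is omitted.

// Illustrative only: drives the new DeviceAPI pinned-memory protocol end to
// end. Assumes a CUDA build where DeviceAPI::Get(kDGLCUDA) returns the CUDA
// implementation added in this commit.
#include <dgl/runtime/device_api.h>

#include <cstddef>

using namespace dgl::runtime;

void PinnedRoundTrip(DGLContext cpu_ctx, DGLContext cuda_ctx, size_t nbytes) {
  DeviceAPI* cuda_api = DeviceAPI::Get(kDGLCUDA);

  // 1. Ask PyTorch's CachingHostAllocator (via the tensoradapter) for a
  //    pinned host buffer; it hands back an opaque storage ctx and deleter.
  void* pytorch_ctx = nullptr;
  void* pytorch_deleter = nullptr;
  void* host_buf =
      cuda_api->AllocPinnedDataSpace(nbytes, &pytorch_ctx, &pytorch_deleter);

  // 2. Device buffer allocated through the usual path.
  void* dev_buf = cuda_api->AllocDataSpace(cuda_ctx, nbytes, 256, DGLDataType{});

  // 3. Copy host -> device and record the consuming stream on the pinned
  //    block so the allocator knows when the block can be recycled.
  cuda_api->RecordedCopyDataFromTo(
      host_buf, 0, dev_buf, 0, nbytes, cpu_ctx, cuda_ctx, DGLDataType{},
      pytorch_ctx);

  // 4. "Free" returns the block to the CachingHostAllocator free list
  //    instead of calling cudaFreeHost.
  cuda_api->FreePinnedDataSpace(&pytorch_deleter);
  cuda_api->FreeDataSpace(cuda_ctx, dev_buf);
}

In the commit itself this sequence is not written out in one place; it is split across NDArray::PinnedEmpty, the NDArray copy helpers, and the container deleter shown further down the page.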
include/dgl/runtime/ndarray.h

@@ -154,6 +154,7 @@ class NDArray {
    else
      return static_cast<T*>(operator->()->data);
  }

  /**
   * @brief Copy data content from/into another array.
   * @param other The source array to be copied from.

@@ -171,19 +172,34 @@ class NDArray {
   * @return The array under another context.
   */
  inline NDArray CopyTo(const DGLContext& ctx) const;

  /**
   * @brief Return a new array with a copy of the content.
   */
  inline NDArray Clone() const;

  /**
   * @brief Return a copy of the current instance of NDArray in pinned
   *        (page-locked) memory.
   * @note This is an out-of-place method, which utilizes PyTorch's
   *       CachingHostAllocator for allocating pinned memory and copying data
   *       from the current NDAarray. As a result, PyTorch is responsible for
   *       managing the lifecycle of the returned NDArray, including deciding
   *       when to flush the data for reuse or call cudaFreeHost. The current
   *       context must be kDGLCPU, otherwise, an error will be thrown.
   */
  inline NDArray PinMemory();

  /**
   * @brief In-place method to pin the current array by calling PinContainer
   *        on the underlying NDArray:Container.
   * @note This is an in-place method that flags the memory as page-locked by
   *       utilizing cudaHostRegister at the underlying level to pin the current
   *       instance of NDArray. The current context must be kDGLCPU, otherwise,
   *       an error will be thrown.
   */
  inline void PinMemory_();

  /**
   * @brief In-place method to unpin the current array by calling UnpinContainer
   *        on the underlying NDArray:Container.

@@ -192,26 +208,31 @@ class NDArray {
   *       others: directly return.
   */
  inline void UnpinMemory_();

  /**
   * @brief Check if the array is pinned.
   */
  inline bool IsPinned() const;

  /**
   * @brief Record streams that are using the underlying tensor.
   * @param stream The stream that is using the underlying tensor.
   */
  inline void RecordStream(DGLStreamHandle stream) const;

  /**
   * @brief Load NDArray from stream
   * @param stream The input data stream
   * @return Whether load is successful
   */
  bool Load(dmlc::Stream* stream);

  /**
   * @brief Save NDArray to stream
   * @param stream The output data stream
   */
  void Save(dmlc::Stream* stream) const;

  /**
   * @brief Create a NDArray that shares the data memory with the current one.
   * @param shape The shape of the new array.

@@ -221,27 +242,40 @@ class NDArray {
   */
  DGL_DLL NDArray CreateView(
      std::vector<int64_t> shape, DGLDataType dtype, int64_t offset = 0);

  /**
   * @brief Create an empty NDArray.
   * @param shape The shape of the new array.
   * @param dtype The data type of the new array.
   * @param ctx The context of the array.
   * @return The created Array
   */
  DGL_DLL static NDArray Empty(
      std::vector<int64_t> shape, DGLDataType dtype, DGLContext ctx);

  /**
   * @brief Create an empty NDArray in pinned memory.
   * @param shape The shape of the new array.
   * @param dtype The data type of the new array.
   * @param ctx The context of the array.
   * @return The created array.
   */
  DGL_DLL static NDArray PinnedEmpty(
      std::vector<int64_t> shape, DGLDataType dtype, DGLContext ctx);

  /**
   * @brief Create an empty NDArray with shared memory.
   * @param name The name of shared memory.
   * @param shape The shape of the new array.
   * @param dtype The data type of the new array.
   * @param ctx The context of the array.
   * @param is_create whether to create shared memory.
   * @return The created Array
   */
  DGL_DLL static NDArray EmptyShared(
      const std::string& name, std::vector<int64_t> shape, DGLDataType dtype,
      DGLContext ctx, bool is_create);

  /**
   * @brief Get the size of the array in the number of bytes.
   */

@@ -288,6 +322,18 @@ class NDArray {
  DGL_DLL static void CopyFromTo(
      DGLArray* from, DGLArray* to, DGLStreamHandle stream);

  /**
   * @brief Function to copy data between device and CPU while recording the
   *        event.
   * @param from The source array.
   * @param to The target array.
   * @param pytorch_ctx The context pointer from PyTorch's CachingHostAllocator.
   * @note This function fuses data-copy and event recording to ensure
   *       CachingHostAllocator works properly.
   */
  DGL_DLL static void RecordedCopyFromTo(
      DGLArray* from, DGLArray* to, void* pytorch_ctx);

  /**
   * @brief Function to pin the DGLArray of a Container.
   * @param ptr The container to be pinned.

@@ -428,7 +474,20 @@ struct NDArray::Container {
  /** @brief The internal array object */
  std::atomic<int> ref_counter_{0};

  /** @brief Whether underlying dl_tensor is pinned by DGL. */
  bool pinned_by_dgl_{false};

  /** @brief Whether underlying dl_tensor is pinned by PyTorch
   *  (CachingHostAllocator). */
  bool pinned_by_pytorch_{false};

  /** @brief The PyTorch storage ctx ptr if pinned_by_pytorch_ = True. */
  void* pytorch_ctx_{nullptr};

  /** @brief Pointer to the corresp. PyTorch deleter if pinned_by_pytorch_ =
   *  True.
   */
  void* pytorch_raw_deleter_{nullptr};
};

// implementations of inline functions

@@ -455,6 +514,22 @@ inline void NDArray::CopyFrom(DGLArray* other) {
inline void NDArray::CopyFrom(const NDArray& other) {
  CHECK(other.data_ != nullptr);
  // Copy between two devices
  if (data_->dl_tensor.ctx.device_type !=
      other.data_->dl_tensor.ctx.device_type) {
    CHECK(data_ != nullptr);
    auto to_ctx_type = data_->dl_tensor.ctx.device_type;
    auto cpu_data = (to_ctx_type == kDGLCPU ? data_ : other.data_);
    // Pinned by PyTorch
    if (cpu_data->pinned_by_pytorch_) {
      // To ensure correct behavior, the event must be recorded after
      // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
      void* pytorch_ctx = cpu_data->pytorch_ctx_;
      RecordedCopyFromTo(
          &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx);
      return;
    }
  }
  CopyFrom(&(other.data_->dl_tensor));
}

@@ -465,23 +540,50 @@ inline void NDArray::CopyTo(DGLArray* other) const {
inline void NDArray::CopyTo(const NDArray& other) const {
  CHECK(other.data_ != nullptr);
  // copy between two devices
  if (data_->dl_tensor.ctx.device_type !=
      other.data_->dl_tensor.ctx.device_type) {
    CHECK(data_ != nullptr);
    auto from_ctx_type = data_->dl_tensor.ctx.device_type;
    auto cpu_data = (from_ctx_type == kDGLCPU ? data_ : other.data_);
    // pinned by PyTorch
    if (cpu_data->pinned_by_pytorch_) {
      // To ensure correct behavior, the event must be recorded after
      // cudaMemcpyAsync as long as the memory is pinned by PyTorch.
      void* pytorch_ctx = cpu_data->pytorch_ctx_;
      RecordedCopyFromTo(
          &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx);
      return;
    }
  }
  CopyTo(&(other.data_->dl_tensor));
}

inline NDArray NDArray::CopyTo(const DGLContext& ctx) const {
  CHECK(data_ != nullptr);
  const DGLArray* array = operator->();
  NDArray ret = Empty(
      std::vector<int64_t>(array->shape, array->shape + array->ndim),
      array->dtype, ctx);
  this->CopyTo(ret);
  return ret;
}

inline NDArray NDArray::Clone() const {
  CHECK(data_ != nullptr);
  const DGLArray* array = operator->();
  return this->CopyTo(array->ctx);
}

inline NDArray NDArray::PinMemory() {
  CHECK(data_ != nullptr);
  const DGLArray* array = operator->();
  auto ctx = array->ctx;
  NDArray ret = PinnedEmpty(
      std::vector<int64_t>(array->shape, array->shape + array->ndim),
      array->dtype, ctx);
  this->CopyTo(ret);
  return ret;
}

inline void NDArray::PinMemory_() {
  ...
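The header now exposes two distinct pinning paths: PinMemory() is out-of-place and delegates allocation (and lifetime) to PyTorch's CachingHostAllocator, while PinMemory_() keeps the existing buffer and page-locks it in place via cudaHostRegister. Below is a minimal sketch of the difference, assuming a CPU-resident array and a CUDA-enabled build; the surrounding function and variable names are illustrative, not part of the diff.

// Illustrative contrast of the two pinning paths on a CPU-resident NDArray.
// `features` is a placeholder array; both calls require a CUDA-enabled build.
#include <dgl/runtime/ndarray.h>
#include <dmlc/logging.h>

using dgl::runtime::NDArray;

void PinBothWays(NDArray features) {
  // Out-of-place: allocates a new pinned buffer from PyTorch's
  // CachingHostAllocator and copies the data into it. The original array is
  // left untouched; the returned array's lifetime is tracked by PyTorch.
  NDArray pinned_copy = features.PinMemory();

  // In-place: page-locks the existing buffer with cudaHostRegister. No new
  // allocation, but the array must already live on kDGLCPU.
  features.PinMemory_();

  // Both now report as pinned.
  CHECK(pinned_copy.IsPinned() && features.IsPinned());
}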
include/dgl/runtime/tensordispatch.h

@@ -134,6 +134,70 @@ class TensorDispatcher {
    auto entry = entrypoints_[Op::kCUDACurrentStream];
    return FUNCCAST(tensoradapter::CUDACurrentStream, entry)();
  }

  /**
   * @brief Allocate a piece of pinned CPU memory via PyTorch
   *        CachingHostAllocator.
   * @note Used in CUDADeviceAPI::AllocPinnedDataSpace().
   * @param nbytes The size to be allocated.
   * @param ctx Pointer to the PyTorch storage ctx ptr returned from the
   *        allocator.
   * @param deleter Pointer to the delete function ptr returned from the
   *        allocator.
   * @return Raw pointer to the allocated memory.
   */
  inline void* CUDAAllocHostWorkspace(
      size_t nbytes, void** ctx, void** deleter) {
    auto entry = entrypoints_[Op::kCUDARawHostAlloc];
    auto alloc_func = FUNCCAST(tensoradapter::CUDARawHostAlloc, entry);
    return alloc_func(nbytes, ctx, deleter);
  }

  /**
   * @brief Insert the pinned memory block (allocated via PyTorch
   *        CachingHostAllocator) back to the free list for future usage. (ref:
   *        pytorch/pytorch/blob/master/aten/src/ATen/cuda/CachingHostAllocator.cpp).
   * @note Used in CUDADeviceAPI::FreePinnedDataSpace().
   * @param deleter Pointer to the delete function ptr returned from the
   *        allocator.
   */
  inline void CUDAFreeHostWorkspace(void** deleter) {
    auto entry = entrypoints_[Op::kCUDARawHostDelete];
    FUNCCAST(tensoradapter::CUDARawHostDelete, entry)(deleter);
  }

  /**
   * @brief Invoke the record_event function call from PyTorch
   *        CachingHostAllocator.
   * @note This function assoicates a CUDA stream (used by a copy kernel) to the
   *       pinned data. In the free path of this data, which is achieved by
   *       calling CUDAFreeHostWorkspace, the set of associated streams is then
   *       consumed to ensure proper functionlity. (ref:
   *       pytorch/pytorch/blob/master/aten/src/ATen/cuda/CachingHostAllocator.cpp).
   *       Used in CUDADeviceAPI::RecordedCopyDataFromTo().
   *
   * @param data Pointer of the tensor to be recorded.
   * @param ctx PyTorch storage ctx ptr returned from the allocator.
   * @param stream The stream that currently consumes this tensor.
   * @param device_id Device of the tensor.
   */
  inline void CUDARecordHostAlloc(
      void* data, void* ctx, cudaStream_t stream, int device_id) {
    auto entry = entrypoints_[Op::kCUDARecordHostAlloc];
    auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry);
    recorded_alloc(data, ctx, stream, device_id);
  }

  /**
   * @brief Release cached pinned memory allocations via cudaHostFree.
   * @note Used in CUDADeviceAPI::PinData() before pinning any host memory by
   *       DGL.
   */
  inline void CUDAHostAllocatorEmptyCache() {
    auto entry = entrypoints_[Op::kCUDAHostAllocatorEmptyCache];
    FUNCCAST(tensoradapter::CUDAHostAllocatorEmptyCache, entry)();
  }
#endif  // DGL_USE_CUDA

  /**

@@ -149,7 +213,7 @@ class TensorDispatcher {
    auto entry = entrypoints_[Op::kRecordStream];
    FUNCCAST(tensoradapter::RecordStream, entry)
    (ptr, static_cast<cudaStream_t>(stream), device_id);
#endif
  }

 private:

@@ -164,9 +228,12 @@ class TensorDispatcher {
   * Must match the functions in tensoradapter/include/tensoradapter.h.
   */
  static constexpr const char* names_[] = {
      "CPURawAlloc",         "CPURawDelete",
#ifdef DGL_USE_CUDA
      "CUDARawAlloc",        "CUDARawDelete",
      "CUDACurrentStream",   "RecordStream",
      "CUDARawHostAlloc",    "CUDARawHostDelete",
      "CUDARecordHostAlloc", "CUDAHostAllocatorEmptyCache",
#endif  // DGL_USE_CUDA
  };

@@ -180,6 +247,10 @@ class TensorDispatcher {
  static constexpr int kCUDARawDelete = 3;
  static constexpr int kCUDACurrentStream = 4;
  static constexpr int kRecordStream = 5;
  static constexpr int kCUDARawHostAlloc = 6;
  static constexpr int kCUDARawHostDelete = 7;
  static constexpr int kCUDARecordHostAlloc = 8;
  static constexpr int kCUDAHostAllocatorEmptyCache = 9;
#endif  // DGL_USE_CUDA
};

@@ -190,7 +261,7 @@ class TensorDispatcher {
  void* entrypoints_[num_entries_] = {
      nullptr, nullptr,
#ifdef DGL_USE_CUDA
      nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
#endif  // DGL_USE_CUDA
  };
  ...
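The four new entries follow the dispatcher's existing pattern: each tensoradapter symbol is resolved once by the name listed in names_, stored as a void* in entrypoints_, and cast back to its real signature with FUNCCAST at the call site. The stripped-down sketch below shows that table-plus-cast idea with generic stand-in names; it is not DGL's actual loader code.

// Minimal sketch of a name-indexed entrypoint table in the spirit of
// TensorDispatcher: symbols are resolved once from the plugin library and
// cast back to their real signatures at each call site. The class, loader
// hook, and constants here are generic stand-ins.
#include <cstddef>

// Signature of one plugin entrypoint (mirrors tensoradapter::CUDARawHostAlloc).
using RawHostAllocFn = void* (*)(size_t nbytes, void** ctx, void** deleter);

class MiniDispatcher {
 public:
  static constexpr int kRawHostAlloc = 0;
  static constexpr int kNumEntries = 1;

  // In DGL the table is filled by looking up names_[] in the tensoradapter
  // shared library; here the pointer is injected directly.
  void SetEntry(int op, void* fn) { entrypoints_[op] = fn; }

  bool IsAvailable() const { return entrypoints_[kRawHostAlloc] != nullptr; }

  void* AllocHostWorkspace(size_t nbytes, void** ctx, void** deleter) {
    // The equivalent of FUNCCAST(tensoradapter::CUDARawHostAlloc, entry).
    auto fn = reinterpret_cast<RawHostAllocFn>(entrypoints_[kRawHostAlloc]);
    return fn(nbytes, ctx, deleter);
  }

 private:
  void* entrypoints_[kNumEntries] = {nullptr};
};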
python/dgl/heterograph_index.py

@@ -254,12 +254,32 @@ class HeteroGraphIndex(ObjectBase):
        """
        return _CAPI_DGLHeteroCopyTo(self, ctx.device_type, ctx.device_id)

    def pin_memory(self):
        """Copies the graph structure to pinned memory, if it's not already
        pinned.

        NOTE: This function is similar to PyTorch's Tensor.pin_memory(), but
        tailored for graphs. It utilizes the same pin_memory allocator as
        PyTorch, so the lifecycle of the graph is also managed by PyTorch.
        If a batch includes a DGL graph object (HeteroGraphIndex),
        PyTorch's DataLoader memory pinning logic will detect it and
        automatically activate this function when pin_memory=True.

        Returns
        -------
        HeteroGraphIndex
            The pinned graph index.
        """
        return _CAPI_DGLHeteroPinMemory(self)

    def pin_memory_(self):
        """Pin this graph to the page-locked memory.

        NOTE: This is an inplace method to pin the current graph index, i.e.,
        it does not require new memory allocation but simply flags the
        existing graph structure to be page-locked. The graph structure
        must be on CPU to be pinned. If the graph struture is already
        pinned, the function directly returns it.

        Returns
        -------
        ...
src/graph/heterograph.cc

@@ -277,6 +277,31 @@ HeteroGraphPtr HeteroGraph::CopyTo(HeteroGraphPtr g, const DGLContext& ctx) {
      hgindex->meta_graph_, rel_graphs, hgindex->num_verts_per_type_));
}

HeteroGraphPtr HeteroGraph::PinMemory(HeteroGraphPtr g) {
  auto casted_ptr = std::dynamic_pointer_cast<HeteroGraph>(g);
  CHECK_NOTNULL(casted_ptr);
  auto relation_graphs = casted_ptr->relation_graphs_;
  auto it = std::find_if_not(
      relation_graphs.begin(), relation_graphs.end(),
      [](auto& underlying_g) { return underlying_g->IsPinned(); });
  // All underlying relation graphs are pinned, return the input hetero-graph
  // directly.
  if (it == relation_graphs.end()) return g;

  std::vector<HeteroGraphPtr> pinned_relation_graphs(relation_graphs.size());
  for (size_t i = 0; i < pinned_relation_graphs.size(); ++i) {
    if (!relation_graphs[i]->IsPinned()) {
      pinned_relation_graphs[i] = relation_graphs[i]->PinMemory();
    } else {
      pinned_relation_graphs[i] = relation_graphs[i];
    }
  }
  return HeteroGraphPtr(new HeteroGraph(
      casted_ptr->meta_graph_, pinned_relation_graphs,
      casted_ptr->num_verts_per_type_));
}

void HeteroGraph::PinMemory_() {
  for (auto g : relation_graphs_) g->PinMemory_();
}
  ...
src/graph/heterograph.h

@@ -249,6 +249,16 @@ class HeteroGraph : public BaseHeteroGraph {
   */
  void UnpinMemory_();

  /**
   * @brief Copy the current graph to pinned memory managed by
   *        PyTorch CachingHostAllocator for each relation graph.
   * @note If any of the underlying relation graphs are already pinned, the
   *       function will utilize their existing copies. If all of them are
   *       pinned, the function will return the original input hetero-graph
   *       directly.
   */
  static HeteroGraphPtr PinMemory(HeteroGraphPtr g);

  /**
   * @brief Record stream for this graph.
   * @param stream The stream that is using the graph
   ...
src/graph/heterograph_capi.cc

@@ -489,6 +489,13 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCopyTo")
      *rv = HeteroGraphRef(hg_new);
    });

DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPinMemory")
    .set_body([](DGLArgs args, DGLRetValue* rv) {
      HeteroGraphRef hg = args[0];
      HeteroGraphPtr hg_new = HeteroGraph::PinMemory(hg.sptr());
      *rv = HeteroGraphRef(hg_new);
    });

DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPinMemory_")
    .set_body([](DGLArgs args, DGLRetValue* rv) {
      HeteroGraphRef hg = args[0];
      ...
src/graph/unit_graph.cc

@@ -140,6 +140,15 @@ class UnitGraph::COO : public BaseHeteroGraph {
    return COO(meta_graph_, adj_.CopyTo(ctx));
  }

  /**
   * @brief Copy the adj_ to pinned memory.
   * @return COOMatrix of the COO graph.
   */
  COO PinMemory() {
    if (adj_.is_pinned) return *this;
    return COO(meta_graph_, adj_.PinMemory());
  }

  /** @brief Pin the adj_: COOMatrix of the COO graph. */
  void PinMemory_() { adj_.PinMemory_(); }

@@ -535,6 +544,15 @@ class UnitGraph::CSR : public BaseHeteroGraph {
    }
  }

  /**
   * @brief Copy the adj_ to pinned memory.
   * @return CSRMatrix of the CSR graph.
   */
  CSR PinMemory() {
    if (adj_.is_pinned) return *this;
    return CSR(meta_graph_, adj_.PinMemory());
  }

  /** @brief Pin the adj_: CSRMatrix of the CSR graph. */
  void PinMemory_() { adj_.PinMemory_(); }

@@ -1259,6 +1277,37 @@ HeteroGraphPtr UnitGraph::CopyTo(HeteroGraphPtr g, const DGLContext& ctx) {
  }
}

HeteroGraphPtr UnitGraph::PinMemory() {
  CSRPtr pinned_in_csr, pinned_out_csr;
  COOPtr pinned_coo;

  if (this->in_csr_->defined() && this->in_csr_->IsPinned()) {
    pinned_in_csr = this->in_csr_;
  } else if (this->in_csr_->defined()) {
    pinned_in_csr = CSRPtr(new CSR(this->in_csr_->PinMemory()));
  } else {
    pinned_in_csr = nullptr;
  }

  if (this->out_csr_->defined() && this->out_csr_->IsPinned()) {
    pinned_out_csr = this->out_csr_;
  } else if (this->out_csr_->defined()) {
    pinned_out_csr = CSRPtr(new CSR(this->out_csr_->PinMemory()));
  } else {
    pinned_out_csr = nullptr;
  }

  if (this->coo_->defined() && this->coo_->IsPinned()) {
    pinned_coo = this->coo_;
  } else if (this->coo_->defined()) {
    pinned_coo = COOPtr(new COO(this->coo_->PinMemory()));
  } else {
    pinned_coo = nullptr;
  }

  return HeteroGraphPtr(new UnitGraph(
      meta_graph(), pinned_in_csr, pinned_out_csr, pinned_coo,
      this->formats_));
}

void UnitGraph::PinMemory_() {
  if (this->in_csr_->defined()) this->in_csr_->PinMemory_();
  if (this->out_csr_->defined()) this->out_csr_->PinMemory_();
  ...
src/graph/unit_graph.h

@@ -222,6 +222,15 @@ class UnitGraph : public BaseHeteroGraph {
   */
  void UnpinMemory_();

  /**
   * @brief Create a copy of the current graph in pinned memory.
   * @note The graph will be pinned outplace through PyTorch
   *       CachingHostAllocator, if available. Otherwise, an error will be
   *       thrown. If any of the underlying structures (incsr, outcsr, coo)
   *       are already pinned, the function will simply use its original copy.
   */
  HeteroGraphPtr PinMemory();

  /**
   * @brief Record stream for this graph.
   * @param stream The stream that is using the graph
   ...
src/runtime/c_runtime_api.cc

@@ -126,6 +126,16 @@ bool DeviceAPI::PinData(void* ptr, size_t nbytes) {
  return false;
}

void* DeviceAPI::AllocPinnedDataSpace(
    size_t nbytes, void** ctx, void** deleter) {
  LOG(FATAL) << "Device does not support cudaHostAlloc api.";
  return nullptr;
}

void DeviceAPI::FreePinnedDataSpace(void** deleter) {
  LOG(FATAL) << "Device does not support cudaHostFree api.";
}

void DeviceAPI::UnpinData(void* ptr) {
  LOG(FATAL) << "Device does not support cudaHostUnregister api.";
}
  ...
src/runtime/cpu_device_api.cc

@@ -26,8 +26,9 @@ class CPUDeviceAPI final : public DeviceAPI {
  void* AllocDataSpace(
      DGLContext ctx, size_t nbytes, size_t alignment,
      DGLDataType type_hint) final {
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable())
      return tensor_dispatcher->CPUAllocWorkspace(nbytes);

    void* ptr;
#if _MSC_VER || defined(__MINGW32__)

@@ -44,8 +45,9 @@ class CPUDeviceAPI final : public DeviceAPI {
  }

  void FreeDataSpace(DGLContext ctx, void* ptr) final {
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable())
      return tensor_dispatcher->CPUFreeWorkspace(ptr);

#if _MSC_VER || defined(__MINGW32__)
    _aligned_free(ptr);

@@ -63,6 +65,13 @@ class CPUDeviceAPI final : public DeviceAPI {
        static_cast<const char*>(from) + from_offset, size);
  }

  void RecordedCopyDataFromTo(
      void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
      DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
      void* pytorch_ctx) final {
    BUG_IF_FAIL(false) << "This piece of code should not be reached.";
  }

  DGLStreamHandle CreateStream(DGLContext) final { return nullptr; }

  void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {}

@@ -84,16 +93,20 @@ struct CPUWorkspacePool : public WorkspacePool {
void* CPUDeviceAPI::AllocWorkspace(
    DGLContext ctx, size_t size, DGLDataType type_hint) {
  TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
  if (tensor_dispatcher->IsAvailable()) {
    return tensor_dispatcher->CPUAllocWorkspace(size);
  }

  return dmlc::ThreadLocalStore<CPUWorkspacePool>::Get()->AllocWorkspace(
      ctx, size);
}

void CPUDeviceAPI::FreeWorkspace(DGLContext ctx, void* data) {
  TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
  if (tensor_dispatcher->IsAvailable()) {
    return tensor_dispatcher->CPUFreeWorkspace(data);
  }

  dmlc::ThreadLocalStore<CPUWorkspacePool>::Get()->FreeWorkspace(ctx, data);
}
  ...
src/runtime/cuda/cuda_device_api.cc

@@ -107,10 +107,11 @@ class CUDADeviceAPI final : public DeviceAPI {
      DGLDataType type_hint) final {
    SetDevice(ctx);
    // Redirect to PyTorch's allocator when available.
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable()) {
      return tensor_dispatcher->CUDAAllocWorkspace(
          nbytes, getCurrentCUDAStream());
    }
    CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes";
    void* ret;
    CUDA_CALL(cudaMalloc(&ret, nbytes));

@@ -119,9 +120,10 @@ class CUDADeviceAPI final : public DeviceAPI {
  void FreeDataSpace(DGLContext ctx, void* ptr) final {
    SetDevice(ctx);
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable()) {
      return tensor_dispatcher->CUDAFreeWorkspace(ptr);
    }
    CUDA_CALL(cudaFree(ptr));
  }

@@ -163,6 +165,28 @@ class CUDADeviceAPI final : public DeviceAPI {
        stream);
  }

  // To ensure correct behavior, `record_event` must be invoked anytime a
  // pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync
  // call. It provides a way to re-use freed pinned (page-locked) memory
  // allocations and avoid device sync due to cudaFreeHost calls.
  void RecordedCopyDataFromTo(
      void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
      DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
      void* pytorch_ctx) final {
    auto stream = GetStream();
    CopyDataFromTo(
        from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint,
        stream);
    auto tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable()) {
      auto custream = static_cast<cudaStream_t>(stream);
      void* ptr = ctx_to.device_type == kDGLCPU ? to : from;
      int id = ctx_to.device_type == kDGLCPU ? ctx_from.device_id
                                             : ctx_to.device_id;
      tensor_dispatcher->CUDARecordHostAlloc(ptr, pytorch_ctx, custream, id);
    }
  }

  DGLStreamHandle CreateStream(DGLContext ctx) {
    CUDA_CALL(cudaSetDevice(ctx.device_id));
    cudaStream_t retval;

@@ -214,6 +238,12 @@ class CUDADeviceAPI final : public DeviceAPI {
  bool PinData(void* ptr, size_t nbytes) override {
    // prevent users from pinning empty tensors or graphs
    if (ptr == nullptr || nbytes == 0) return false;
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    // Minimize the pinned memory pool allocated by backend (via tensoradapter)
    // to preserve enough memory for DGL inherited in-place pin-memory operation
    if (tensor_dispatcher->IsAvailable()) {
      tensor_dispatcher->CUDAHostAllocatorEmptyCache();
    }
    CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
    return true;
  }

@@ -223,6 +253,25 @@ class CUDADeviceAPI final : public DeviceAPI {
    CUDA_CALL(cudaHostUnregister(ptr));
  }

  void* AllocPinnedDataSpace(
      size_t nbytes, void** ctx, void** deleter) override {
    // prevent pinning empty tensors or graphs
    if (nbytes == 0) return nullptr;
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    CHECK(tensor_dispatcher->IsAvailable())
        << "CachingHostAllocator is not available in the current backend "
           "PyTorch. Please update the PyTorch version to 1.11+";
    return tensor_dispatcher->CUDAAllocHostWorkspace(nbytes, ctx, deleter);
  }

  void FreePinnedDataSpace(void** deleter) override {
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    CHECK(tensor_dispatcher->IsAvailable())
        << "CachingHostAllocator is not available in the current backend "
           "PyTorch. Please update the PyTorch version to 1.11+";
    tensor_dispatcher->CUDAFreeHostWorkspace(deleter);
  }

  bool IsPinned(const void* ptr) override {
    // can't be a pinned tensor if CUDA context is unavailable.
    if (!is_available_) return false;

@@ -264,17 +313,19 @@ class CUDADeviceAPI final : public DeviceAPI {
      DGLContext ctx, size_t size, DGLDataType type_hint) final {
    SetDevice(ctx);
    // Redirect to PyTorch's allocator when available.
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable())
      return tensor_dispatcher->CUDAAllocWorkspace(
          size, getCurrentCUDAStream());

    return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
  }

  void FreeWorkspace(DGLContext ctx, void* data) final {
    SetDevice(ctx);
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable())
      return tensor_dispatcher->CUDAFreeWorkspace(data);

    CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data);
  }

@@ -309,9 +360,9 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
}

cudaStream_t getCurrentCUDAStream() {
  TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
  if (tensor_dispatcher->IsAvailable())
    return tensor_dispatcher->CUDAGetCurrentStream();
  else  // return the default stream when TA is not available
    return nullptr;
}
  ...
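The comment above RecordedCopyDataFromTo is the heart of the change: because cudaMemcpyAsync returns before the copy finishes, something must mark when a CachingHostAllocator block is actually safe to recycle, and that is what record_event does. The plain-CUDA sketch below illustrates the same mechanism with a raw event; it is an analogy for what PyTorch's allocator does internally, not code from this commit.

// Plain-CUDA analogy for the record-after-copy rule: an async H2D copy from a
// pinned buffer only finishes later on the stream, so an event recorded right
// after the memcpy is what tells the owner when the buffer may be reused.
#include <cuda_runtime.h>

void AsyncCopyWithEvent(
    const void* pinned_src, void* dev_dst, size_t nbytes, cudaStream_t stream,
    cudaEvent_t ready) {
  // Enqueue the copy; it returns immediately because the source is pinned.
  cudaMemcpyAsync(dev_dst, pinned_src, nbytes, cudaMemcpyHostToDevice, stream);
  // Record an event on the same stream. Until cudaEventQuery(ready) reports
  // cudaSuccess, handing pinned_src to another consumer would race with the
  // in-flight copy -- which is exactly what CachingHostAllocator's
  // record_event / free-list bookkeeping prevents.
  cudaEventRecord(ready, stream);
}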
src/runtime/ndarray.cc

@@ -68,8 +68,16 @@ void NDArray::Internal::DefaultDeleter(NDArray::Container* ptr) {
  } else if (ptr->dl_tensor.data != nullptr) {
    // if the array is still pinned before freeing, unpin it.
    if (ptr->pinned_by_dgl_) UnpinContainer(ptr);
    if (ptr->pinned_by_pytorch_) {
      DeviceAPI::Get(kDGLCUDA)->FreePinnedDataSpace(
          &(ptr->pytorch_raw_deleter_));
      CHECK(ptr->pytorch_raw_deleter_ == nullptr);
      ptr->pinned_by_pytorch_ = false;
      ptr->pytorch_ctx_ = nullptr;
    } else {
      dgl::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx)
          ->FreeDataSpace(ptr->dl_tensor.ctx, ptr->dl_tensor.data);
    }
  }
  delete ptr;
}

@@ -159,7 +167,6 @@ NDArray NDArray::EmptyShared(
    const std::string& name, std::vector<int64_t> shape, DGLDataType dtype,
    DGLContext ctx, bool is_create) {
  NDArray ret = Internal::Create(shape, dtype, ctx);
  size_t size = GetDataSize(ret.data_->dl_tensor);
  auto mem = std::make_shared<SharedMemory>(name);
  if (is_create) {

@@ -175,7 +182,6 @@ NDArray NDArray::EmptyShared(
NDArray NDArray::Empty(
    std::vector<int64_t> shape, DGLDataType dtype, DGLContext ctx) {
  NDArray ret = Internal::Create(shape, dtype, ctx);
  size_t size = GetDataSize(ret.data_->dl_tensor);
  size_t alignment = GetDataAlignment(ret.data_->dl_tensor);
  if (size > 0)

@@ -206,6 +212,44 @@ void NDArray::CopyFromTo(DGLArray* from, DGLArray* to) {
      from->dtype);
}

void NDArray::RecordedCopyFromTo(
    DGLArray* from, DGLArray* to, void* pytorch_ctx) {
  size_t from_size = GetDataSize(*from);
  size_t to_size = GetDataSize(*to);
  CHECK_EQ(from_size, to_size)
      << "DGLArrayCopyFromTo: The size must exactly match.";
  CHECK(from->ctx.device_type != to->ctx.device_type)
      << "Recoding event is only called for the copy between CPU and GPU.";
  CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLCUDA)
      << "At least one CUDA ctx needs to be involved.";
  DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo(
      from->data, static_cast<size_t>(from->byte_offset), to->data,
      static_cast<size_t>(to->byte_offset), from_size, from->ctx, to->ctx,
      from->dtype, pytorch_ctx);
}

NDArray NDArray::PinnedEmpty(
    std::vector<int64_t> shape, DGLDataType dtype, DGLContext ctx) {
  CHECK_EQ(ctx.device_type, kDGLCPU) << "Only NDArray on CPU can be pinned";
  NDArray ret = Internal::Create(shape, dtype, ctx);
  size_t size = GetDataSize(ret.data_->dl_tensor);
  if (size > 0) {
    ret.data_->dl_tensor.data = DeviceAPI::Get(kDGLCUDA)->AllocPinnedDataSpace(
        size, &(ret.data_->pytorch_ctx_), &(ret.data_->pytorch_raw_deleter_));
    CHECK(
        ret.data_->pytorch_ctx_ != nullptr &&
        ret.data_->pytorch_raw_deleter_ != nullptr)
        << "The allocation failed in PyTorch's CachingHostAllocator. "
        << "The returned context pointer is " << ret.data_->pytorch_ctx_
        << " and the function deleter is " << ret.data_->pytorch_raw_deleter_;
    ret.data_->pinned_by_pytorch_ = true;
  }
  return ret;
}

void NDArray::PinContainer(NDArray::Container* ptr) {
  if (IsContainerPinned(ptr)) return;
  auto* tensor = &(ptr->dl_tensor);

@@ -229,13 +273,13 @@ void NDArray::UnpinContainer(NDArray::Container* ptr) {
}

void NDArray::RecordStream(DGLArray* tensor, DGLStreamHandle stream) {
  TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
  CHECK(tensor_dispatcher->IsAvailable())
      << "RecordStream only works when TensorAdapter is available.";
  CHECK_EQ(tensor->ctx.device_type, kDGLCUDA)
      << "RecordStream only works with GPU tensors.";
  tensor_dispatcher->RecordStream(tensor->data, stream, tensor->ctx.device_id);
}

template <typename T>

@@ -300,7 +344,7 @@ std::shared_ptr<SharedMemory> NDArray::GetSharedMem() const {
}

bool NDArray::IsContainerPinned(NDArray::Container* ptr) {
  if (ptr->pinned_by_dgl_ || ptr->pinned_by_pytorch_) return true;
  auto* tensor = &(ptr->dl_tensor);
  // Can only be pinned if on CPU...
  if (tensor->ctx.device_type != kDGLCPU) return false;
  ...
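With the new Container fields in place, DefaultDeleter has to route a buffer's release according to how it was obtained: unpin first if DGL registered it in place, return it to PyTorch's free list if it came from the CachingHostAllocator, and only otherwise free it through the owning device API. The sketch below reduces that dispatch to a standalone function with hypothetical types; the real logic is the hunk at the top of this file.

// Sketch of the release-path dispatch performed by DefaultDeleter, reduced to
// a standalone function. The BufferRecord type and its callbacks are made-up
// stand-ins for NDArray::Container and the DeviceAPI calls.
#include <functional>

struct BufferRecord {
  void* data = nullptr;
  bool pinned_by_dgl = false;      // pinned in place via cudaHostRegister
  bool pinned_by_pytorch = false;  // allocated from CachingHostAllocator
  std::function<void()> unpin;              // stands in for UnpinContainer
  std::function<void()> return_to_pytorch;  // stands in for FreePinnedDataSpace
  std::function<void()> free_device_space;  // stands in for FreeDataSpace
};

void ReleaseBuffer(BufferRecord* rec) {
  if (rec->data == nullptr) return;
  if (rec->pinned_by_dgl) rec->unpin();
  if (rec->pinned_by_pytorch) {
    rec->return_to_pytorch();  // no cudaFreeHost; block goes to the free list
    rec->pinned_by_pytorch = false;
  } else {
    rec->free_device_space();  // ordinary DGL-owned memory
  }
  rec->data = nullptr;
}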
tensoradapter/include/tensoradapter.h

@@ -65,6 +65,46 @@ cudaStream_t CUDACurrentStream();
 * @param device_id Device of the tensor.
 */
void RecordStream(void* ptr, cudaStream_t stream, int device_id);

/**
 * @brief Allocate a piece of pinned CPU memory via
 *        PyTorch's CachingHostAllocator.
 *
 * @param nbytes The size to be allocated.
 * @param ctx Pointer to the PyTorch storage ctx ptr returned from the
 *        allocator.
 * @param deleter Pointer to the delete function ptr returned from the
 *        allocator.
 * @return Raw pointer to the allocated memory.
 */
void* CUDARawHostAlloc(size_t nbytes, void** ctx, void** raw_deleter);

/**
 * @brief 'Free' the pinned CPU memory via
 *        inserting the memory block back to the free list.
 *
 * @param deleter Pointer to the delete function ptr returned from the
 *        allocator.
 */
void CUDARawHostDelete(void** raw_deleter);

/**
 * @brief 'Record' a CUDA stream (usually from a copy kernel) for the pinned
 *        memory via PyTorch's CachingHostAllocator.
 *
 * @param data Pointer of the tensor to be recorded.
 * @param ctx PyTorch storage ctx ptr returned from the allocator.
 * @param stream The stream that currently consumes this tensor.
 * @param device_id Device of the tensor.
 */
void CUDARecordHostAlloc(
    void* data, void* ctx, cudaStream_t stream, int device_id);

/**
 * @brief Release cached pinned memory allocations via cudaHostFree.
 */
void CUDAHostAllocatorEmptyCache();

#endif  // DGL_USE_CUDA
}
  ...
tensoradapter/pytorch/torch.cpp

@@ -8,6 +8,7 @@
#include <tensoradapter_exports.h>
#ifdef DGL_USE_CUDA
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CachingHostAllocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>

@@ -55,6 +56,50 @@ TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) {
          reinterpret_cast<int64_t>(stream))));
  data_ptr.release_context();
}

class CUDAHostDeleter {
 public:
  explicit CUDAHostDeleter(std::unique_ptr<void, c10::DeleterFnPtr> ptr)
      : ptr_(std::move(ptr)) {}

 private:
  std::unique_ptr<void, c10::DeleterFnPtr> ptr_;
};

TA_EXPORTS void* CUDARawHostAlloc(
    size_t nbytes, void** ctx, void** raw_deleter) {
  auto data_ptr = at::cuda::getCachingHostAllocator()->allocate(nbytes);
  auto raw = data_ptr.get();
  // Return the raw ctx ptr for recording event.
  *ctx = data_ptr.get_context();
  // Transfer ownership to raw_deleter.
  auto* data_deleter = new CUDAHostDeleter(data_ptr.move_context());
  *raw_deleter = static_cast<void*>(data_deleter);
  return raw;
}

// Designated CUDAHostDeleter for CUDARawHostAlloc.
TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) {
  delete static_cast<CUDAHostDeleter*>(*raw_deleter);
  *raw_deleter = nullptr;
}

TA_EXPORTS void CUDARecordHostAlloc(
    void* ptr, void* ctx, cudaStream_t stream, int device_id) {
  at::cuda::CachingHostAllocator_recordEvent(
      ptr, ctx,
      c10::cuda::CUDAStream(
          c10::cuda::CUDAStream::UNCHECKED,
          c10::Stream(
              c10::Stream::UNSAFE,
              c10::Device(c10::DeviceType::CUDA, device_id),
              reinterpret_cast<int64_t>(stream))));
}

TA_EXPORTS void CUDAHostAllocatorEmptyCache() {
  at::cuda::CachingHostAllocator_emptyCache();
}

#endif  // DGL_USE_CUDA
};
  ...
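CUDARawHostAlloc has to move ownership of PyTorch's at::DataPtr context across a plain-C-pointer interface, which is why it boxes the owning unique_ptr in a heap-allocated CUDAHostDeleter and returns the box's address through raw_deleter; CUDARawHostDelete later destroys the box, which runs the original deleter exactly once. The generic sketch below shows the same "RAII handle smuggled through a void*" pattern with made-up names; it is not the tensoradapter code itself.

// Generic sketch of the ownership hand-off used by CUDARawHostAlloc /
// CUDARawHostDelete: an RAII owner is boxed on the heap, its address crosses
// the C-style interface as a void*, and a matching delete call later runs the
// destructor (and thus the real deleter).
#include <memory>
#include <utility>

using DeleterFn = void (*)(void*);

class OwnerBox {
 public:
  explicit OwnerBox(std::unique_ptr<void, DeleterFn> owned)
      : owned_(std::move(owned)) {}

 private:
  std::unique_ptr<void, DeleterFn> owned_;  // destructor releases the buffer
};

// Producer side: wrap the owning pointer and pass the box out through `handle`.
void* ExportOwnership(std::unique_ptr<void, DeleterFn> owned, void** handle) {
  void* raw = owned.get();
  *handle = new OwnerBox(std::move(owned));
  return raw;  // caller may use `raw` until the handle is destroyed
}

// Consumer side: destroying the box runs the original deleter exactly once.
void ReleaseOwnership(void** handle) {
  delete static_cast<OwnerBox*>(*handle);
  *handle = nullptr;
}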
tests/python/common/test_heterograph-index.py (new file, 0 → 100644)

import unittest

import backend as F

import dgl
import pytest
from dgl import DGLError
from utils import parametrize_idtype


def create_test_heterograph(idtype):
    # 3 users, 2 games, 2 developers
    # metagraph:
    #  ('user', 'follows', 'user'),
    #  ('user', 'plays', 'game'),
    #  ('user', 'wishes', 'game'),
    #  ('developer', 'develops', 'game')])
    g = dgl.heterograph(
        {
            ("user", "follows", "user"): ([0, 1], [1, 2]),
            ("user", "plays", "game"): ([0, 1, 2, 1], [0, 0, 1, 1]),
            ("user", "wishes", "game"): ([0, 2], [1, 0]),
            ("developer", "develops", "game"): ([0, 1], [0, 1]),
        },
        idtype=idtype,
        device=F.ctx(),
    )
    assert g.idtype == idtype
    assert g.device == F.ctx()
    return g


@unittest.skipIf(
    F._default_context_str == "cpu", reason="Need gpu for this test"
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch",
    reason="Pinning graph outplace only supported for PyTorch",
)
@parametrize_idtype
def test_pin_memory(idtype):
    g = create_test_heterograph(idtype)
    g.nodes["user"].data["h"] = F.ones((3, 5))
    g.nodes["game"].data["i"] = F.ones((2, 5))
    g.edges["plays"].data["e"] = F.ones((4, 4))
    g = g.to(F.cpu())
    assert not g.is_pinned()

    # Test pinning a CPU graph.
    g._graph.pin_memory()
    assert not g.is_pinned()
    g._graph = g._graph.pin_memory()
    assert g.is_pinned()
    assert g.device == F.cpu()

    # when clone with a new (different) formats, e.g., g.formats("csc")
    # ensure the new graphs are not pinned
    assert not g.formats("csc").is_pinned()
    assert not g.formats("csr").is_pinned()
    # 'coo' formats is the default and thus not cloned
    assert g.formats("coo").is_pinned()

    # Test pinning a GPU graph will cause error raised.
    g1 = g.to(F.cuda())
    with pytest.raises(DGLError):
        g1._graph.pin_memory()

    # Test pinning an empty homograph
    g2 = dgl.graph(([], []))
    g2._graph = g2._graph.pin_memory()
    assert g2.is_pinned()

    # Test pinning heterograph with 0 edge of one relation type
    g3 = dgl.heterograph(
        {("a", "b", "c"): ([0, 1], [1, 2]), ("c", "d", "c"): ([], [])}
    ).astype(idtype)
    g3._graph = g3._graph.pin_memory()
    assert g3.is_pinned()


if __name__ == "__main__":
    pass