"git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "4208ce2b9ef11a8d477cd551d9ff8ad444eeadbf"
Unverified Commit d453d72d authored by nv-dlasalle's avatar nv-dlasalle Committed by GitHub
Browse files

[Doc][Dataloading] Expand documentation of AsyncTransferer (#2313)

* Update docs

* Make non-default streams non-blocking
parent f673fc25
...@@ -56,6 +56,11 @@ Async Copying to/from GPUs ...@@ -56,6 +56,11 @@ Async Copying to/from GPUs
Data can be copied from the CPU to the GPU, or from the GPU to the CPU, Data can be copied from the CPU to the GPU, or from the GPU to the CPU,
while the GPU is being used for while the GPU is being used for
computation, using the :class:`AsyncTransferer`. computation, using the :class:`AsyncTransferer`.
For the transfer to be fully asynchronous, the context the
:class:`AsyncTransferer`
is created with must be a GPU context, and the input tensor must be in
pinned memory.
.. autoclass:: AsyncTransferer .. autoclass:: AsyncTransferer
:members: __init__, async_copy :members: __init__, async_copy
......
...@@ -38,7 +38,21 @@ class Transfer(object): ...@@ -38,7 +38,21 @@ class Transfer(object):
class AsyncTransferer(object): class AsyncTransferer(object):
""" Class for initiating asynchronous copies to/from the GPU on a second """ Class for initiating asynchronous copies to/from the GPU on a second
GPU stream. """ GPU stream.
To initiate a transfer to a GPU:
>>> tensor_cpu = torch.ones(100000).pin_memory()
>>> transferer = dgl.dataloading.AsyncTransferer(torch.device(0))
>>> future = transferer.async_copy(tensor_cpu, torch.device(0))
And then to wait for the transfer to finish and get a copy of the tensor on
the GPU.
>>> tensor_gpu = future.wait()
"""
def __init__(self, device): def __init__(self, device):
""" Create a new AsyncTransferer object. """ Create a new AsyncTransferer object.
...@@ -55,7 +69,12 @@ class AsyncTransferer(object): ...@@ -55,7 +69,12 @@ class AsyncTransferer(object):
self._handle = _CAPI_DGLAsyncTransfererCreate(ctx) self._handle = _CAPI_DGLAsyncTransfererCreate(ctx)
def async_copy(self, tensor, device): def async_copy(self, tensor, device):
""" Initiate an asynchronous copy on the internal stream. """ Initiate an asynchronous copy on the internal stream. For this call
to be asynchronous, the context the AsyncTranserer is created with must
be a GPU context, and the input tensor must be in pinned memory.
Currently, transfers from the GPU to the CPU, and CPU to CPU, will
be synchronous.
Parameters Parameters
---------- ----------
......
...@@ -138,7 +138,8 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -138,7 +138,8 @@ class CUDADeviceAPI final : public DeviceAPI {
DGLStreamHandle CreateStream(DGLContext ctx) {
  // Switch to the device named by the context before creating the stream.
  CUDA_CALL(cudaSetDevice(ctx.device_id));
  // Create the stream with the non-blocking flag so that the legacy
  // default stream will not implicitly synchronize with it.
  cudaStream_t stream;
  CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  return static_cast<DGLStreamHandle>(stream);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment