Unverified commit 738e8318, authored by Xin Yao, committed by GitHub
Browse files

[Feature] CUDA UVA sampling for MultiLayerNeighborSampler (#3674)



* implement pin_memory/unpin_memory/is_pinned for dgl.graph

* update python docstring

* update c++ docstring

* add test

* fix the broken UnifiedTensor

* XPU_SWITCH for kDLCPUPinned

* a rough version ready for testing

* eliminate extra context parameter for pin/unpin

* update train_sampling

* fix linting

* fix typo

* multi-gpu uva sampling case

* disable new format materialization for pinned graphs

* update python doc for pin_memory_

* fix unit test

* UVA sampling for link prediction

* dispatch most csr ops

* update graphsage example to combine uva sampling and UnifiedTensor

* update graphsage example to combine uva sampling and UnifiedTensor

* update graphsage example to combine uva sampling and UnifiedTensor

* update doc

* update examples

* change unitgraph and heterograph's PinMemory to in-place

* update examples for multi-gpu uva sampling

* update doc

* fix linting

* fix cpu build

* fix is_pinned for DistGraph

* fix is_pinned for DistGraph

* update graphsage unsupervised example

* update doc for gpu sampling

* update some check for sampling device switching

* fix linting

* adapt for new dataloader

* fix linting

* fix

* fix some name issue

* adjust device check

* add unit test for uva sampling & fix some zero_copy bug

* fix linting

* update num_threads in graphsage examples
Co-authored-by: Quan (Andy) Gan <coin2028@hotmail.com>
Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
parent fa343873
......@@ -260,7 +260,7 @@ void NDArray::CopyFromTo(DLTensor* from,
// Use the context that is *not* a cpu context to get the correct device
// api manager.
DGLContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx;
DGLContext ctx = GetDevice(from->ctx).device_type != kDLCPU ? from->ctx : to->ctx;
DeviceAPI::Get(ctx)->CopyDataFromTo(
from->data, static_cast<size_t>(from->byte_offset),
......@@ -489,9 +489,10 @@ int DGLArrayToDLPack(DGLArrayHandle from, DLManagedTensor** out,
API_BEGIN();
auto* nd_container = reinterpret_cast<NDArray::Container*>(from);
DLTensor* nd = &(nd_container->dl_tensor);
if (alignment != 0 && !is_aligned(nd->data, alignment)) {
if ((alignment != 0 && !is_aligned(nd->data, alignment))
|| (nd->ctx.device_type == kDLCPUPinned)) {
std::vector<int64_t> shape_vec(nd->shape, nd->shape + nd->ndim);
NDArray copy_ndarray = NDArray::Empty(shape_vec, nd->dtype, nd->ctx);
NDArray copy_ndarray = NDArray::Empty(shape_vec, nd->dtype, GetDevice(nd->ctx));
copy_ndarray.CopyFrom(nd);
*out = copy_ndarray.ToDLPack();
} else {
......
......@@ -99,8 +99,12 @@ def _check_device(data):
assert data.device == F.ctx()
@pytest.mark.parametrize('sampler_name', ['full', 'neighbor', 'neighbor2'])
def test_node_dataloader(sampler_name):
@pytest.mark.parametrize('pin_graph', [True, False])
def test_node_dataloader(sampler_name, pin_graph):
g1 = dgl.graph(([0, 0, 0, 1, 1], [1, 2, 3, 3, 4]))
if F.ctx() != F.cpu() and pin_graph:
g1.create_formats_()
g1.pin_memory_()
g1.ndata['feat'] = F.copy_to(F.randn((5, 8)), F.cpu())
g1.ndata['label'] = F.copy_to(F.randn((g1.num_nodes(),)), F.cpu())
......@@ -141,14 +145,20 @@ def test_node_dataloader(sampler_name):
_check_device(output_nodes)
_check_device(blocks)
if g1.is_pinned():
g1.unpin_memory_()
@pytest.mark.parametrize('sampler_name', ['full', 'neighbor'])
@pytest.mark.parametrize('neg_sampler', [
dgl.dataloading.negative_sampler.Uniform(2),
dgl.dataloading.negative_sampler.GlobalUniform(15, False, 3),
dgl.dataloading.negative_sampler.GlobalUniform(15, True, 3)])
def test_edge_dataloader(sampler_name, neg_sampler):
@pytest.mark.parametrize('pin_graph', [True, False])
def test_edge_dataloader(sampler_name, neg_sampler, pin_graph):
g1 = dgl.graph(([0, 0, 0, 1, 1], [1, 2, 3, 3, 4]))
if F.ctx() != F.cpu() and pin_graph:
g1.create_formats_()
g1.pin_memory_()
g1.ndata['feat'] = F.copy_to(F.randn((5, 8)), F.cpu())
sampler = {
......@@ -209,6 +219,9 @@ def test_edge_dataloader(sampler_name, neg_sampler):
_check_device(neg_pair_graph)
_check_device(blocks)
if g1.is_pinned():
g1.unpin_memory_()
if __name__ == '__main__':
test_graph_dataloader()
test_cluster_gcn(0)
......@@ -219,4 +232,5 @@ if __name__ == '__main__':
dgl.dataloading.negative_sampler.Uniform(2),
dgl.dataloading.negative_sampler.GlobalUniform(2, False),
dgl.dataloading.negative_sampler.GlobalUniform(2, True)]:
test_edge_dataloader(sampler, neg_sampler)
for pin_graph in [True, False]:
test_edge_dataloader(sampler, neg_sampler, pin_graph)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment