Unverified commit aad3bd04, authored by Xin Yao and committed by GitHub

[Bugfix] Fix empty tensors being treated as pinned (#5005)

* fix empty tensor being treated as pinned

* avoid calling cudaHostGetDevicePointer on nullptr

* update empty array

* add a comment
parent e28f0781
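Taken together, the hunks below do two things: DeviceAPI::PinData() now reports whether anything was actually pinned, and every pinned-pointer lookup goes through a new cuda::GetDevicePointer() helper so cudaHostGetDevicePointer() is never handed a nullptr from an empty tensor. The following is a minimal, self-contained sketch of that pattern, not the DGL sources: CUDA_CHECK stands in for DGL's CUDA_CALL macro, and the free functions are simplified stand-ins for the member functions changed in this commit.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

#define CUDA_CHECK(expr)                                              \
  do {                                                                \
    cudaError_t _e = (expr);                                          \
    if (_e != cudaSuccess) {                                          \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                   cudaGetErrorString(_e), __FILE__, __LINE__);       \
      std::exit(EXIT_FAILURE);                                        \
    }                                                                 \
  } while (0)

// Refuses to register empty buffers and reports whether pinning happened,
// mirroring the new bool-returning PinData contract.
bool PinData(void* ptr, size_t nbytes) {
  if (ptr == nullptr || nbytes == 0) return false;
  CUDA_CHECK(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
  return true;
}

// Looks up the device-visible alias of a host buffer only when it is actually
// pinned; unpinned (e.g. empty) buffers are returned unchanged, so
// cudaHostGetDevicePointer() is never called on a nullptr.
void* GetDevicePointer(void* host_ptr, bool is_pinned) {
  void* dptr = host_ptr;
  if (is_pinned) {
    CUDA_CHECK(cudaHostGetDevicePointer(&dptr, host_ptr, 0));
  }
  return dptr;
}

int main() {
  std::vector<float> buf(1 << 20, 0.f);
  const bool pinned = PinData(buf.data(), buf.size() * sizeof(float));
  const bool empty_pinned = PinData(nullptr, 0);  // empty tensor: stays false
  void* dptr = GetDevicePointer(buf.data(), pinned);
  std::printf("pinned=%d empty_pinned=%d device alias=%p\n",
              pinned, empty_pinned, dptr);
  if (pinned) CUDA_CHECK(cudaHostUnregister(buf.data()));
  return 0;
}

Recording the boolean, as NDArray::PinContainer() now does with pinned_by_dgl_, keeps a later unpin from calling cudaHostUnregister() on memory that was never registered.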
......@@ -309,7 +309,9 @@
});
#define CHECK_VALID_CONTEXT(VAR1, VAR2) \
CHECK(((VAR1)->ctx == (VAR2)->ctx) || (VAR1).IsPinned()) \
CHECK( \
((VAR1)->ctx == (VAR2)->ctx) || (VAR1).IsPinned() || \
((VAR1).NumElements() == 0)) /* Let empty arrays pass */ \
<< "Expected " << (#VAR2) << "(" << (VAR2)->ctx << ")" \
<< " to have the same device " \
<< "context as " << (#VAR1) << "(" << (VAR1)->ctx << "). " \
......
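Letting zero-element arrays through the device-context check is safe because there is no buffer to dereference, so a context mismatch cannot produce an invalid access. A hypothetical, stand-alone illustration of the relaxed predicate (the names below are illustrative, not DGL's):

struct ArrayView {          // hypothetical stand-in for an NDArray handle
  int device_id;            // which device owns the buffer
  long long num_elements;   // 0 for empty tensors
  bool is_pinned;           // host memory registered for UVA access
};

// Mirrors the relaxed CHECK_VALID_CONTEXT condition: same device, or pinned
// (accessible from any device via UVA), or empty (nothing will be read).
inline bool ContextOk(const ArrayView& a, const ArrayView& b) {
  return a.device_id == b.device_id || a.is_pinned || a.num_elements == 0;
}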
......@@ -149,8 +149,9 @@ class DeviceAPI {
*
* @param ptr The host memory pointer to be pinned.
* @param nbytes The size to be pinned.
* @return false when pinning an empty tensor; true otherwise.
*/
DGL_DLL virtual void PinData(void* ptr, size_t nbytes);
DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);
/**
* @brief Unpin host memory using cudaHostUnregister().
......
......@@ -16,12 +16,6 @@ namespace impl {
template <DGLDeviceType XPU, typename DType, typename IdType>
NDArray IndexSelect(NDArray array, IdArray index) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const DType* array_data = array.Ptr<DType>();
if (array.IsPinned()) {
CUDA_CALL(cudaHostGetDevicePointer(&array_data, array.Ptr<DType>(), 0));
}
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0];
int64_t num_feat = 1;
......@@ -33,9 +27,13 @@ NDArray IndexSelect(NDArray array, IdArray index) {
// use index->ctx for pinned array
NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx);
if (len == 0) return ret;
if (len == 0 || arr_len * num_feat == 0) return ret;
DType* ret_data = static_cast<DType*>(ret->data);
const DType* array_data = static_cast<DType*>(cuda::GetDevicePointer(array));
const IdType* idx_data = static_cast<IdType*>(index->data);
cudaStream_t stream = runtime::getCurrentCUDAStream();
if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
......
......@@ -43,19 +43,13 @@ NDArray CSRGetData(
BUG_IF_FAIL(DGLDataTypeTraits<DType>::dtype == rows->dtype)
<< "DType does not match row's dtype.";
const IdType* indptr_data = csr.indptr.Ptr<IdType>();
const IdType* indices_data = csr.indices.Ptr<IdType>();
const IdType* data_data = CSRHasData(csr) ? csr.data.Ptr<IdType>() : nullptr;
if (csr.is_pinned) {
CUDA_CALL(
cudaHostGetDevicePointer(&indptr_data, csr.indptr.Ptr<IdType>(), 0));
CUDA_CALL(
cudaHostGetDevicePointer(&indices_data, csr.indices.Ptr<IdType>(), 0));
if (CSRHasData(csr)) {
CUDA_CALL(
cudaHostGetDevicePointer(&data_data, csr.data.Ptr<IdType>(), 0));
}
}
const IdType* indptr_data =
static_cast<IdType*>(cuda::GetDevicePointer(csr.indptr));
const IdType* indices_data =
static_cast<IdType*>(cuda::GetDevicePointer(csr.indices));
const IdType* data_data =
CSRHasData(csr) ? static_cast<IdType*>(cuda::GetDevicePointer(csr.data))
: nullptr;
// TODO(minjie): use binary search for sorted csr
CUDA_KERNEL_CALL(
......
......@@ -13,10 +13,11 @@
#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./dgl_cub.cuh"
using namespace dgl::aten::cuda;
#include "./utils.h"
namespace dgl {
using namespace cuda;
using namespace aten::cuda;
namespace aten {
namespace impl {
......@@ -248,16 +249,11 @@ COOMatrix _CSRRowWiseSamplingUniform(
IdType* const out_cols = static_cast<IdType*>(picked_col->data);
IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
const IdType* in_ptr = mat.indptr.Ptr<IdType>();
const IdType* in_cols = mat.indices.Ptr<IdType>();
const IdType* data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
if (mat.is_pinned) {
CUDA_CALL(cudaHostGetDevicePointer(&in_ptr, mat.indptr.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(&in_cols, mat.indices.Ptr<IdType>(), 0));
if (CSRHasData(mat)) {
CUDA_CALL(cudaHostGetDevicePointer(&data, mat.data.Ptr<IdType>(), 0));
}
}
const IdType* in_ptr = static_cast<IdType*>(GetDevicePointer(mat.indptr));
const IdType* in_cols = static_cast<IdType*>(GetDevicePointer(mat.indices));
const IdType* data = CSRHasData(mat)
? static_cast<IdType*>(GetDevicePointer(mat.data))
: nullptr;
// compute degree
IdType* out_deg = static_cast<IdType*>(
......
......@@ -21,9 +21,9 @@
static_assert(
CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
using namespace dgl::aten::cuda;
namespace dgl {
using namespace cuda;
using namespace aten::cuda;
namespace aten {
namespace impl {
......@@ -496,18 +496,12 @@ COOMatrix _CSRRowWiseSampling(
IdType* const out_cols = static_cast<IdType*>(picked_col->data);
IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
const IdType* in_ptr = mat.indptr.Ptr<IdType>();
const IdType* in_cols = mat.indices.Ptr<IdType>();
const IdType* data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
const FloatType* prob_data = prob.Ptr<FloatType>();
if (mat.is_pinned) {
CUDA_CALL(cudaHostGetDevicePointer(&in_ptr, mat.indptr.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(&in_cols, mat.indices.Ptr<IdType>(), 0));
if (CSRHasData(mat)) {
CUDA_CALL(cudaHostGetDevicePointer(&data, mat.data.Ptr<IdType>(), 0));
}
CUDA_CALL(cudaHostGetDevicePointer(&prob_data, prob.Ptr<FloatType>(), 0));
}
const IdType* in_ptr = static_cast<IdType*>(GetDevicePointer(mat.indptr));
const IdType* in_cols = static_cast<IdType*>(GetDevicePointer(mat.indices));
const IdType* data = CSRHasData(mat)
? static_cast<IdType*>(GetDevicePointer(mat.data))
: nullptr;
const FloatType* prob_data = static_cast<FloatType*>(GetDevicePointer(prob));
// compute degree
// out_deg: the size of each row in the sampled matrix
......
......@@ -19,6 +19,7 @@
namespace dgl {
using runtime::NDArray;
using namespace cuda;
namespace aten {
namespace impl {
......@@ -61,14 +62,10 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const int nt = dgl::cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
const IdType* indptr_data = csr.indptr.Ptr<IdType>();
const IdType* indices_data = csr.indices.Ptr<IdType>();
if (csr.is_pinned) {
CUDA_CALL(
cudaHostGetDevicePointer(&indptr_data, csr.indptr.Ptr<IdType>(), 0));
CUDA_CALL(
cudaHostGetDevicePointer(&indices_data, csr.indices.Ptr<IdType>(), 0));
}
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
const IdType* indices_data =
static_cast<IdType*>(GetDevicePointer(csr.indices));
// TODO(minjie): use binary search for sorted csr
CUDA_KERNEL_CALL(
dgl::cuda::_LinearSearchKernel, nb, nt, 0, stream, indptr_data,
......@@ -155,11 +152,8 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto len = rows->shape[0];
const IdType* vid_data = rows.Ptr<IdType>();
const IdType* indptr_data = csr.indptr.Ptr<IdType>();
if (csr.is_pinned) {
CUDA_CALL(
cudaHostGetDevicePointer(&indptr_data, csr.indptr.Ptr<IdType>(), 0));
}
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
const int nt = dgl::cuda::FindNumThreads(len);
......@@ -267,19 +261,13 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
// Copy indices.
IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, rows->ctx);
const IdType* indptr_data = csr.indptr.Ptr<IdType>();
const IdType* indices_data = csr.indices.Ptr<IdType>();
const IdType* data_data = CSRHasData(csr) ? csr.data.Ptr<IdType>() : nullptr;
if (csr.is_pinned) {
CUDA_CALL(
cudaHostGetDevicePointer(&indptr_data, csr.indptr.Ptr<IdType>(), 0));
CUDA_CALL(
cudaHostGetDevicePointer(&indices_data, csr.indices.Ptr<IdType>(), 0));
if (CSRHasData(csr)) {
CUDA_CALL(
cudaHostGetDevicePointer(&data_data, csr.data.Ptr<IdType>(), 0));
}
}
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
const IdType* indices_data =
static_cast<IdType*>(GetDevicePointer(csr.indices));
const IdType* data_data =
CSRHasData(csr) ? static_cast<IdType*>(GetDevicePointer(csr.data))
: nullptr;
CUDA_KERNEL_CALL(
_SegmentCopyKernel, nb, nt, 0, stream, indptr_data, indices_data,
......@@ -381,14 +369,10 @@ std::vector<NDArray> CSRGetDataAndIndices(
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
cudaStream_t stream = runtime::getCurrentCUDAStream();
const IdType* indptr_data = csr.indptr.Ptr<IdType>();
const IdType* indices_data = csr.indices.Ptr<IdType>();
if (csr.is_pinned) {
CUDA_CALL(
cudaHostGetDevicePointer(&indptr_data, csr.indptr.Ptr<IdType>(), 0));
CUDA_CALL(
cudaHostGetDevicePointer(&indices_data, csr.indices.Ptr<IdType>(), 0));
}
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
const IdType* indices_data =
static_cast<IdType*>(GetDevicePointer(csr.indices));
// Generate a 0-1 mask for matched (row, col) positions.
IdArray mask = Full(0, nnz, nbits, ctx);
......@@ -618,14 +602,10 @@ CSRMatrix CSRSliceMatrix(
hashmap.Insert(key[i]);
});
const IdType* indptr_data = csr.indptr.Ptr<IdType>();
const IdType* indices_data = csr.indices.Ptr<IdType>();
if (csr.is_pinned) {
CUDA_CALL(
cudaHostGetDevicePointer(&indptr_data, csr.indptr.Ptr<IdType>(), 0));
CUDA_CALL(
cudaHostGetDevicePointer(&indices_data, csr.indices.Ptr<IdType>(), 0));
}
const IdType* indptr_data =
static_cast<IdType*>(GetDevicePointer(csr.indptr));
const IdType* indices_data =
static_cast<IdType*>(GetDevicePointer(csr.indices));
// Execute SegmentMaskColKernel
const int64_t num_rows = csr.num_rows;
......
......@@ -264,6 +264,14 @@ void MaskSelect(
device->FreeWorkspace(ctx, workspace);
}
inline void* GetDevicePointer(runtime::NDArray array) {
void* ptr = array->data;
if (array.IsPinned()) {
CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0));
}
return ptr;
}
} // namespace cuda
} // namespace dgl
......
......@@ -25,8 +25,7 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
std::vector<int64_t> shape{len};
CHECK(array.IsPinned());
const DType* array_data = nullptr;
CUDA_CALL(cudaHostGetDevicePointer(&array_data, array.Ptr<DType>(), 0));
const DType* array_data = static_cast<DType*>(cuda::GetDevicePointer(array));
CHECK_EQ(index->ctx.device_type, kDGLCUDA);
for (int d = 1; d < array->ndim; ++d) {
......@@ -35,7 +34,7 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
}
NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx);
if (len == 0) return ret;
if (len == 0 || arr_len * num_feat == 0) return ret;
DType* ret_data = static_cast<DType*>(ret->data);
if (num_feat == 1) {
......@@ -85,8 +84,7 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
std::vector<int64_t> shape{len};
CHECK(dest.IsPinned());
DType* dest_data = nullptr;
CUDA_CALL(cudaHostGetDevicePointer(&dest_data, dest.Ptr<DType>(), 0));
DType* dest_data = static_cast<DType*>(cuda::GetDevicePointer(dest));
CHECK_EQ(index->ctx.device_type, kDGLCUDA);
CHECK_EQ(source->ctx.device_type, kDGLCUDA);
......
......@@ -121,8 +121,9 @@ void DeviceAPI::SyncStreamFromTo(
LOG(FATAL) << "Device does not support stream api.";
}
void DeviceAPI::PinData(void* ptr, size_t nbytes) {
bool DeviceAPI::PinData(void* ptr, size_t nbytes) {
LOG(FATAL) << "Device does not support cudaHostRegister api.";
return false;
}
void DeviceAPI::UnpinData(void* ptr) {
......
......@@ -211,10 +211,11 @@ class CUDADeviceAPI final : public DeviceAPI {
* The pinned memory can be seen by all CUDA contexts,
* not just the one that performed the allocation
*/
void PinData(void* ptr, size_t nbytes) {
bool PinData(void* ptr, size_t nbytes) override {
// prevent users from pinning empty tensors or graphs
if (ptr == nullptr || nbytes == 0) return;
if (ptr == nullptr || nbytes == 0) return false;
CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
return true;
}
void UnpinData(void* ptr) {
......
......@@ -211,8 +211,8 @@ void NDArray::PinContainer(NDArray::Container* ptr) {
auto* tensor = &(ptr->dl_tensor);
CHECK_EQ(tensor->ctx.device_type, kDGLCPU)
<< "Only NDArray on CPU can be pinned";
DeviceAPI::Get(kDGLCUDA)->PinData(tensor->data, GetDataSize(*tensor));
ptr->pinned_by_dgl_ = true;
ptr->pinned_by_dgl_ =
DeviceAPI::Get(kDGLCUDA)->PinData(tensor->data, GetDataSize(*tensor));
}
void NDArray::UnpinContainer(NDArray::Container* ptr) {
......
import unittest
import backend as F
import dgl
import networkx as nx
import numpy as np
import pytest
import scipy.sparse as ssp
from test_utils import parametrize_idtype
import dgl
D = 5
......@@ -130,6 +130,32 @@ def create_test_heterograph(idtype):
return g
def create_test_heterograph2(idtype):
"""test heterograph from the docstring, with an empty relation"""
# 3 users, 2 games, 2 developers
# metagraph:
# ('user', 'follows', 'user'),
# ('user', 'plays', 'game'),
# ('user', 'wishes', 'game'),
# ('developer', 'develops', 'game')
g = dgl.heterograph(
{
("user", "follows", "user"): ([0, 1], [1, 2]),
("user", "plays", "game"): ([0, 1, 2, 1], [0, 0, 1, 1]),
("user", "wishes", "game"): ([0, 2], [1, 0]),
("developer", "develops", "game"): ([], []),
},
idtype=idtype,
device=F.ctx(),
)
for etype in g.etypes:
g.edges[etype].data["weight"] = F.randn((g.num_edges(etype),))
assert g.idtype == idtype
assert g.device == F.ctx()
return g
@unittest.skipIf(
dgl.backend.backend_name == "mxnet",
reason="MXNet doesn't support bool tensor",
......@@ -788,14 +814,14 @@ def test_subframes(parent_idx_device, child_device):
@unittest.skipIf(
F._default_context_str != "gpu", reason="UVA only available on GPU"
)
@pytest.mark.parametrize("device", [F.cpu(), F.cuda()])
@unittest.skipIf(
dgl.backend.backend_name != "pytorch",
reason="UVA only supported for PyTorch",
)
@pytest.mark.parametrize("device", [F.cpu(), F.cuda()])
@parametrize_idtype
def test_uva_subgraph(idtype, device):
g = create_test_heterograph(idtype)
g = create_test_heterograph2(idtype)
g = g.to(F.cpu())
g.create_formats_()
g.pin_memory_()
......@@ -805,16 +831,13 @@ def test_uva_subgraph(idtype, device):
assert g.edge_subgraph(edge_indices).device == device
assert g.in_subgraph(indices).device == device
assert g.out_subgraph(indices).device == device
if dgl.backend.backend_name != "tensorflow":
# (BarclayII) Most of Tensorflow functions somehow do not preserve device: a CPU tensor
# becomes a GPU tensor after operations such as concat(), unique() or even sin().
# Not sure what should be the best fix.
assert g.khop_in_subgraph(indices, 1)[0].device == device
assert g.khop_out_subgraph(indices, 1)[0].device == device
assert g.khop_in_subgraph(indices, 1)[0].device == device
assert g.khop_out_subgraph(indices, 1)[0].device == device
assert g.sample_neighbors(indices, 1).device == device
g.unpin_memory_()
if __name__ == "__main__":
test_edge_subgraph()
# test_uva_subgraph(F.int64, F.cpu())
test_uva_subgraph(F.int64, F.cpu())
test_uva_subgraph(F.int64, F.cuda())