Unverified Commit 0114f4fd authored by Minjie Wang, committed by GitHub

Merge branch 'master' into fix_readme

parents 14ab462f 40b44a43
......@@ -208,6 +208,10 @@ Methods for getting or changing the device on which the graph is hosted.
DGLGraph.to
DGLGraph.device
DGLGraph.cpu
DGLGraph.pin_memory_
DGLGraph.unpin_memory_
DGLGraph.is_pinned
Misc
----
......
......@@ -129,6 +129,37 @@ struct COOMatrix {
aten::IsNullArray(data) ? data : data.CopyTo(ctx, stream),
row_sorted, col_sorted);
}
/*!
* \brief Pin the row, col and data (if not Null) of the matrix.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* kDLGPU: invalid, will throw an error.
* The context check is deferred to pinning the NDArray.
*/
inline void PinMemory_() {
row.PinMemory_();
col.PinMemory_();
if (!aten::IsNullArray(data)) {
data.PinMemory_();
}
}
/*!
* \brief Unpin the row, col and data (if not Null) of the matrix.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* others: directly return.
* The context check is deferred to unpinning the NDArray.
*/
inline void UnpinMemory_() {
row.UnpinMemory_();
col.UnpinMemory_();
if (!aten::IsNullArray(data)) {
data.UnpinMemory_();
}
}
};
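The same member-wise delegation appears in CSRMatrix below (indptr, indices and the optional data array). As a hedged illustration of the pattern, here is a minimal standalone sketch; Array and COOLike are stand-ins for NDArray and COOMatrix, with a plain flag in place of the real context re-tagging:

#include <vector>

struct Array {
  std::vector<int> buf;
  bool pinned = false;
  bool IsNull() const { return buf.empty(); }
  void PinMemory_()   { pinned = true;  }  // real code: cudaHostRegister
  void UnpinMemory_() { pinned = false; }  // real code: cudaHostUnregister
};

struct COOLike {
  Array row, col, data;   // data is optional and may be a null array
  void PinMemory_() {     // pin every constituent array in place
    row.PinMemory_();
    col.PinMemory_();
    if (!data.IsNull()) data.PinMemory_();
  }
  void UnpinMemory_() {   // unpin member-wise in the same way
    row.UnpinMemory_();
    col.UnpinMemory_();
    if (!data.IsNull()) data.UnpinMemory_();
  }
};

int main() {
  COOLike coo;
  coo.row.buf = {0, 1};
  coo.col.buf = {1, 2};
  coo.PinMemory_();    // data is null, so only row and col are pinned
  coo.UnpinMemory_();
}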
///////////////////////// COO routines //////////////////////////
......
......@@ -122,6 +122,37 @@ struct CSRMatrix {
aten::IsNullArray(data) ? data : data.CopyTo(ctx, stream),
sorted);
}
/*!
* \brief Pin the indptr, indices and data (if not Null) of the matrix.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* kDLGPU: invalid, will throw an error.
* The context check is deferred to pinning the NDArray.
*/
inline void PinMemory_() {
indptr.PinMemory_();
indices.PinMemory_();
if (!aten::IsNullArray(data)) {
data.PinMemory_();
}
}
/*!
* \brief Unpin the indptr, indices and data (if not Null) of the matrix.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* others: directly return.
* The context check is deferred to unpinning the NDArray.
*/
inline void UnpinMemory_() {
indptr.UnpinMemory_();
indices.UnpinMemory_();
if (!aten::IsNullArray(data)) {
data.UnpinMemory_();
}
}
};
///////////////////////// CSR routines //////////////////////////
......
......@@ -37,10 +37,13 @@
* // Now XPU is a placeholder for array->ctx.device_type
* DeviceSpecificImplementation<XPU>(...);
* });
*
* We treat pinned memory as normal host memory if we don't want
* to enable CUDA UVA access for this operator.
*/
#ifdef DGL_USE_CUDA
#define ATEN_XPU_SWITCH_CUDA(val, XPU, op, ...) do { \
if ((val) == kDLCPU) { \
if ((val) == kDLCPU || (val) == kDLCPUPinned) { \
constexpr auto XPU = kDLCPU; \
{__VA_ARGS__} \
} else if ((val) == kDLGPU) { \
......
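The macro change above is the dispatch crux: unless an operator opts in to CUDA UVA access, pinned memory is treated as ordinary host memory and takes the CPU code path. A minimal self-contained sketch of the idea, simplified from the DGL macro (the tag values and names here are illustrative):

#include <iostream>

// Illustrative device tags mirroring DLPack's kDLCPU/kDLGPU/kDLCPUPinned.
enum DeviceType { kDLCPU = 1, kDLGPU = 2, kDLCPUPinned = 3 };

// Simplified dispatch: kDLCPUPinned is routed to the CPU branch, exactly as
// in the ATEN_XPU_SWITCH_CUDA change above.
#define XPU_SWITCH(val, XPU, ...) do {              \
    if ((val) == kDLCPU || (val) == kDLCPUPinned) { \
      constexpr DeviceType XPU = kDLCPU;            \
      { __VA_ARGS__ }                               \
    } else if ((val) == kDLGPU) {                   \
      constexpr DeviceType XPU = kDLGPU;            \
      { __VA_ARGS__ }                               \
    }                                               \
  } while (0)

int main() {
  DeviceType ctx = kDLCPUPinned;
  XPU_SWITCH(ctx, XPU, {
    std::cout << "dispatched with XPU = " << XPU << "\n";  // prints 1 (kDLCPU)
  });
}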
......@@ -110,6 +110,11 @@ class BaseHeteroGraph : public runtime::Object {
*/
virtual DLContext Context() const = 0;
/*!
* \brief Check if this graph is pinned.
*/
virtual bool IsPinned() const = 0;
/*!
* \brief Get the number of integer bits used to store node/edge ids (32 or 64).
*/
......
......@@ -148,19 +148,17 @@ class DeviceAPI {
/*!
* \brief Pin host memory using cudaHostRegister().
*
* \param ctx The context of pinning and mapping.
* \param ptr The host memory pointer to be pinned.
* \param nbytes The size to be pinned.
*/
DGL_DLL virtual void PinData(DGLContext ctx, void* ptr, size_t nbytes);
DGL_DLL virtual void PinData(void* ptr, size_t nbytes);
/*!
* \brief Unpin host memory ussing cudaHostUnregister().
* \brief Unpin host memory using cudaHostUnregister().
*
* \param ctx The context to unmap and unpin.
* \param ptr The host memory pointer to be unpinned.
*/
DGL_DLL virtual void UnpinData(DGLContext ctx, void* ptr);
DGL_DLL virtual void UnpinData(void* ptr);
/*!
* \brief Allocate temporary workspace for backend execution.
......@@ -190,12 +188,21 @@ class DeviceAPI {
DGL_DLL virtual void FreeWorkspace(DGLContext ctx, void* ptr);
/*!
* \brief Get device API base don context.
* \brief Get device API based on context.
* \param ctx The context.
* \param allow_missing Whether to allow a missing device API.
* \return The corresponding device API.
*/
DGL_DLL static DeviceAPI* Get(DGLContext ctx, bool allow_missing = false);
/*!
* \brief Get device API based on context.
* \param dev_type The device type.
* \param allow_missing Whether to allow a missing device API.
* \return The corresponding device API.
*/
DGL_DLL static DeviceAPI* Get(DLDeviceType dev_type, bool allow_missing = false);
};
/*! \brief The device type bigger than this is RPC device */
......
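Presumably the motivation for the new Get(DLDeviceType) overload: pinning operates on host-resident pointers, so the buffer's own context is CPU while the work must be done by the CUDA device API, and a lookup by bare device type expresses that directly. A minimal sketch of the pattern (illustrative types, not the real DeviceAPIManager registry):

#include <cstddef>

enum DLDeviceType { kDLCPU = 1, kDLGPU = 2 };  // illustrative tags

struct DeviceAPI {
  virtual ~DeviceAPI() = default;
  virtual void PinData(void* ptr, std::size_t nbytes) = 0;
  static DeviceAPI* Get(DLDeviceType dev_type);  // lookup by type only
};

struct CUDADeviceAPI final : DeviceAPI {
  void PinData(void* /*ptr*/, std::size_t /*nbytes*/) override {
    // real code: CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
  }
};

DeviceAPI* DeviceAPI::Get(DLDeviceType dev_type) {
  static CUDADeviceAPI cuda;                     // real code: a per-type table
  return dev_type == kDLGPU ? &cuda : nullptr;
}

int main() {
  int host_buf[256] = {};
  // The buffer lives on the CPU, but the GPU runtime performs the pinning.
  DeviceAPI::Get(kDLGPU)->PinData(host_buf, sizeof(host_buf));
}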
......@@ -171,6 +171,27 @@ class NDArray {
* \brief Return a new array with a copy of the content.
*/
inline NDArray Clone(const DGLStreamHandle &stream = nullptr) const;
/*!
* \brief In-place method to pin the current array by calling PinData
* on the underlying DLTensor.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* kDLGPU: invalid, will throw an error.
*/
inline void PinMemory_();
/*!
* \brief In-place method to unpin the current array by calling UnpinData
* on the underlying DLTensor.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* others: directly return.
*/
inline void UnpinMemory_();
/*!
* \brief Check if the array is pinned.
*/
inline bool IsPinned() const;
/*!
* \brief Load NDArray from stream
* \param stream The input data stream
......@@ -272,6 +293,27 @@ class NDArray {
DGL_DLL static void CopyFromTo(
DLTensor* from, DLTensor* to, DGLStreamHandle stream = nullptr);
/*!
* \brief Function to pin the data of a DLTensor.
* \param tensor The array to be pinned.
* \note Data of the given array will be pinned inplace.
* Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* kDLGPU: invalid, will throw an error.
*/
DGL_DLL static void PinData(DLTensor* tensor);
/*!
* \brief Function to unpin the data of a DLTensor.
* \param tensor The array to be unpinned.
* \note Data of the given array will be unpinned inplace.
* Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* others: directly return.
*/
DGL_DLL static void UnpinData(DLTensor* tensor);
// internal namespace
struct Internal;
private:
......@@ -431,6 +473,21 @@ inline NDArray NDArray::Clone(const DGLStreamHandle &stream) const {
return this->CopyTo(dptr->ctx, stream);
}
inline void NDArray::PinMemory_() {
CHECK(data_ != nullptr);
PinData(&(data_->dl_tensor));
}
inline void NDArray::UnpinMemory_() {
CHECK(data_ != nullptr);
UnpinData(&(data_->dl_tensor));
}
inline bool NDArray::IsPinned() const {
CHECK(data_ != nullptr);
return data_->dl_tensor.ctx.device_type == kDLCPUPinned;
}
inline int NDArray::use_count() const {
if (data_ == nullptr) return 0;
return data_->ref_counter_.load(std::memory_order_relaxed);
......
......@@ -5458,6 +5458,74 @@ class DGLHeteroGraph(object):
"""
return self.to(F.cpu())
def pin_memory_(self):
"""Pin the graph structure to the page-locked memory.
This is an **inplace** method. The graph structure must be on CPU to be pinned.
If the graph struture is already pinned, the function directly returns it.
Materialization of new sparse formats for pinned graphs is not allowed.
To avoid implicit formats materialization during training,
you should create all the needed formats before pinnning.
But cloning and materialization is fine. See the examples below.
Returns
-------
DGLGraph
The pinned graph.
Examples
--------
The following example uses the PyTorch backend.
>>> import dgl
>>> import torch
>>> g = dgl.graph((torch.tensor([1, 0]), torch.tensor([1, 2])))
>>> g.pin_memory_()
Materialization of new sparse formats is not allowed for pinned graphs.
>>> g.create_formats_() # This would raise an error! You should do this before pinning.
Cloning and materializing new formats is allowed. The returned graph is **not** pinned.
>>> g1 = g.formats(['csc'])
>>> assert not g1.is_pinned()
"""
if self._graph.is_pinned():
return self
if F.device_type(self.device) != 'cpu':
raise DGLError("The graph structure must be on CPU to be pinned.")
self._graph.pin_memory_()
return self
def unpin_memory_(self):
"""Unpin the graph structure from the page-locked memory.
This is an **inplace** method.If the graph struture is not pinned,
e.g., on CPU or GPU, the function directly returns it.
Returns
-------
DGLGraph
The unpinned graph.
"""
if not self._graph.is_pinned():
return self
self._graph.unpin_memory_()
return self
def is_pinned(self):
"""Check if the graph structure is pinned to the page-locked memory.
Returns
-------
bool
True if the graph structure is pinned.
"""
return self._graph.is_pinned()
def clone(self):
"""Return a heterograph object that is a clone of current graph.
......
......@@ -234,6 +234,44 @@ class HeteroGraphIndex(ObjectBase):
"""
return _CAPI_DGLHeteroCopyTo(self, ctx.device_type, ctx.device_id)
def pin_memory_(self):
"""Pin this graph to the page-locked memory.
NOTE: This is an inplace method.
The graph structure must be on CPU to be pinned.
If the graph struture is already pinned, the function directly returns it.
Returns
-------
HeteroGraphIndex
The pinned graph index.
"""
return _CAPI_DGLHeteroPinMemory_(self)
def unpin_memory_(self):
"""Unpin this graph from the page-locked memory.
NOTE: this is an inplace method.
If the graph struture is not pinned, e.g., on CPU or GPU,
the function directly returns it.
Returns
-------
HeteroGraphIndex
The unpinned graph index.
"""
return _CAPI_DGLHeteroUnpinMemory_(self)
def is_pinned(self):
"""Check if this graph is pinned to the page-locked memory.
Returns
-------
bool
True if the graph is pinned.
"""
return bool(_CAPI_DGLHeteroIsPinned(self))
def shared_memory(self, name, ntypes=None, etypes=None, formats=('coo', 'csr', 'csc')):
"""Return a copy of this graph in shared memory
......
......@@ -24,7 +24,7 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
int64_t num_feat = 1;
std::vector<int64_t> shape{len};
CHECK_EQ(array->ctx.device_type, kDLCPU);
CHECK_EQ(array->ctx.device_type, kDLCPUPinned);
CHECK_EQ(index->ctx.device_type, kDLGPU);
for (int d = 1; d < array->ndim; ++d) {
......
......@@ -15,10 +15,10 @@ namespace aten {
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
#ifdef DGL_USE_CUDA
CHECK_EQ(array->ctx.device_type, kDLCPU) << "Only the CPU device type input "
<< "array supported";
CHECK_EQ(index->ctx.device_type, kDLGPU) << "Only the GPU device type input "
<< "index supported";
CHECK_EQ(array->ctx.device_type, kDLCPUPinned)
<< "Only the CPUPinned device type input array is supported";
CHECK_EQ(index->ctx.device_type, kDLGPU)
<< "Only the GPU device type input index is supported";
CHECK_GE(array->ndim, 1) << "Only support array with at least 1 dimension";
CHECK_EQ(index->ndim, 1) << "Index array must be a 1D array.";
......
......@@ -267,6 +267,16 @@ HeteroGraphPtr HeteroGraph::CopyTo(HeteroGraphPtr g, const DLContext &ctx,
hgindex->num_verts_per_type_));
}
void HeteroGraph::PinMemory_() {
for (auto g : relation_graphs_)
g->PinMemory_();
}
void HeteroGraph::UnpinMemory_() {
for (auto g : relation_graphs_)
g->UnpinMemory_();
}
std::string HeteroGraph::SharedMemName() const {
return shared_mem_ ? shared_mem_->GetName() : "";
}
......
......@@ -58,6 +58,10 @@ class HeteroGraph : public BaseHeteroGraph {
return relation_graphs_[0]->Context();
}
bool IsPinned() const override {
return relation_graphs_[0]->IsPinned();
}
uint8_t NumBits() const override {
return relation_graphs_[0]->NumBits();
}
......@@ -228,6 +232,25 @@ class HeteroGraph : public BaseHeteroGraph {
static HeteroGraphPtr CopyTo(HeteroGraphPtr g, const DLContext &ctx,
const DGLStreamHandle &stream = nullptr);
/*!
* \brief Pin all relation graphs of the current graph.
* \note The graph will be pinned inplace. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* kDLGPU: invalid, will throw an error.
* The context check is deferred to pinning the NDArray.
*/
void PinMemory_();
/*!
* \brief Unpin all relation graphs of the current graph.
* \note The graph will be unpinned inplace. Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* others: directly return.
* The context check is deferred to unpinning the NDArray.
*/
void UnpinMemory_();
/*! \brief Copy the data to shared memory.
*
* Also save names of node types and edge types of the HeteroGraph object to shared memory
......
......@@ -173,7 +173,19 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroDataType")
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroContext")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
*rv = hg->Context();
// The Python side only recognizes the CPU and GPU device types.
// Use is_pinned() to check whether the object is
// in page-locked memory.
if (hg->Context().device_type == kDLCPUPinned)
*rv = DLContext{kDLCPU, 0};
else
*rv = hg->Context();
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsPinned")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
*rv = hg->IsPinned();
});
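A self-contained sketch of the translation performed above: pinned memory is still host memory, so it is reported to Python as a CPU context, and whether it is page-locked is answered by the separate is_pinned() query (the types below are illustrative):

#include <cassert>

enum DLDeviceType { kDLCPU = 1, kDLGPU = 2, kDLCPUPinned = 3 };
struct DLContext { DLDeviceType device_type; int device_id; };

DLContext ExportedContext(DLContext ctx) {
  // collapse the pinned tag to plain CPU for the two-valued Python model
  if (ctx.device_type == kDLCPUPinned) return DLContext{kDLCPU, 0};
  return ctx;
}

int main() {
  DLContext pinned{kDLCPUPinned, 0};
  assert(ExportedContext(pinned).device_type == kDLCPU);
}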
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroNumBits")
......@@ -473,6 +485,22 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCopyTo")
*rv = HeteroGraphRef(hg_new);
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPinMemory_")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
auto hgindex = std::dynamic_pointer_cast<HeteroGraph>(hg.sptr());
hgindex->PinMemory_();
*rv = hg;
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroUnpinMemory_")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
auto hgindex = std::dynamic_pointer_cast<HeteroGraph>(hg.sptr());
hgindex->UnpinMemory_();
*rv = hg;
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCopyToSharedMem")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
......
......@@ -133,6 +133,10 @@ class UnitGraph::COO : public BaseHeteroGraph {
return adj_.row->ctx;
}
bool IsPinned() const override {
return adj_.row.IsPinned();
}
uint8_t NumBits() const override {
return adj_.row->dtype.bits;
}
......@@ -156,6 +160,17 @@ class UnitGraph::COO : public BaseHeteroGraph {
return COO(meta_graph_, adj_.CopyTo(ctx, stream));
}
/*! \brief Pin the adj_ (COOMatrix) of the COO graph. */
void PinMemory_() {
adj_.PinMemory_();
}
/*! \brief Unpin the adj_ (COOMatrix) of the COO graph. */
void UnpinMemory_() {
adj_.UnpinMemory_();
}
bool IsMultigraph() const override {
return aten::COOHasDuplicate(adj_);
}
......@@ -520,6 +535,10 @@ class UnitGraph::CSR : public BaseHeteroGraph {
return adj_.indices->ctx;
}
bool IsPinned() const override {
return adj_.indices.IsPinned();
}
uint8_t NumBits() const override {
return adj_.indices->dtype.bits;
}
......@@ -547,6 +566,16 @@ class UnitGraph::CSR : public BaseHeteroGraph {
}
}
/*! \brief Pin the adj_ (CSRMatrix) of the CSR graph. */
void PinMemory_() {
adj_.PinMemory_();
}
/*! \brief Unpin the adj_ (CSRMatrix) of the CSR graph. */
void UnpinMemory_() {
adj_.UnpinMemory_();
}
bool IsMultigraph() const override {
return aten::CSRHasDuplicate(adj_);
}
......@@ -829,6 +858,10 @@ DLContext UnitGraph::Context() const {
return GetAny()->Context();
}
bool UnitGraph::IsPinned() const {
return GetAny()->IsPinned();
}
uint8_t UnitGraph::NumBits() const {
return GetAny()->NumBits();
}
......@@ -1263,6 +1296,24 @@ HeteroGraphPtr UnitGraph::CopyTo(HeteroGraphPtr g, const DLContext &ctx,
}
}
void UnitGraph::PinMemory_() {
if (this->in_csr_->defined())
this->in_csr_->PinMemory_();
if (this->out_csr_->defined())
this->out_csr_->PinMemory_();
if (this->coo_->defined())
this->coo_->PinMemory_();
}
void UnitGraph::UnpinMemory_() {
if (this->in_csr_->defined())
this->in_csr_->UnpinMemory_();
if (this->out_csr_->defined())
this->out_csr_->UnpinMemory_();
if (this->coo_->defined())
this->coo_->UnpinMemory_();
}
void UnitGraph::InvalidateCSR() {
this->out_csr_ = CSRPtr(new CSR());
}
......@@ -1334,6 +1385,10 @@ UnitGraph::CSRPtr UnitGraph::GetInCSR(bool inplace) const {
// Prefers converting from COO since it is parallelized.
// TODO(BarclayII): need benchmarking.
if (!in_csr_->defined()) {
// in-place materialization of new formats is not allowed for pinned graphs
if (inplace && IsPinned())
LOG(FATAL) << "Cannot create new formats for pinned graphs, " <<
"please create the CSC format before pinning.";
if (coo_->defined()) {
const auto& newadj = aten::COOToCSR(
aten::COOTranspose(coo_->adj()));
......@@ -1365,6 +1420,10 @@ UnitGraph::CSRPtr UnitGraph::GetOutCSR(bool inplace) const {
// Prefers converting from COO since it is parallelized.
// TODO(BarclayII): need benchmarking.
if (!out_csr_->defined()) {
// in-place materialization of new formats is not allowed for pinned graphs
if (inplace && IsPinned())
LOG(FATAL) << "Cannot create new formats for pinned graphs, " <<
"please create the CSR format before pinning.";
if (coo_->defined()) {
const auto& newadj = aten::COOToCSR(coo_->adj());
......@@ -1393,6 +1452,10 @@ UnitGraph::COOPtr UnitGraph::GetCOO(bool inplace) const {
CodeToStr(formats_) << ", cannot create COO matrix.";
COOPtr ret = coo_;
if (!coo_->defined()) {
// in-place materialization of new formats is not allowed for pinned graphs
if (inplace && IsPinned())
LOG(FATAL) << "Cannot create new formats for pinned graphs, " <<
"please create the COO format before pinning.";
if (in_csr_->defined()) {
const auto& newadj = aten::COOTranspose(aten::CSRToCOO(in_csr_->adj(), true));
......
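All three lazy getters (GetInCSR, GetOutCSR, GetCOO) apply the same guard. A hedged sketch of the rule, reduced to a single format flag: materializing a new format in place would allocate fresh, unpinned arrays inside a pinned graph, so the inplace path is rejected, while a non-inplace request can materialize into a copy that comes back unpinned.

#include <stdexcept>

struct UnitGraphLike {   // illustrative stand-in, not the DGL class
  bool csr_defined = false;
  bool pinned = false;
  void GetCSR(bool inplace) {
    if (!csr_defined) {
      if (inplace && pinned)
        throw std::runtime_error(
            "cannot create new formats for pinned graphs; "
            "create them before pinning");
      csr_defined = true;  // materialize from another existing format
    }
  }
};

int main() {
  UnitGraphLike g;
  g.GetCSR(/*inplace=*/true);  // fine: g is not pinned yet
}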
......@@ -83,6 +83,8 @@ class UnitGraph : public BaseHeteroGraph {
DLContext Context() const override;
bool IsPinned() const override;
uint8_t NumBits() const override;
bool IsMultigraph() const override;
......@@ -208,6 +210,25 @@ class UnitGraph : public BaseHeteroGraph {
static HeteroGraphPtr CopyTo(HeteroGraphPtr g, const DLContext &ctx,
const DGLStreamHandle &stream = nullptr);
/*!
* \brief Pin the in_csr_, out_csr_ and coo_ of the current graph.
* \note The graph will be pinned inplace. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* kDLGPU: invalid, will throw an error.
* The context check is deferred to pinning the NDArray.
*/
void PinMemory_();
/*!
* \brief Unpin the in_csr_, out_csr_ and coo_ of the current graph.
* \note The graph will be unpinned inplace. Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* others: directly return.
* The context check is deferred to unpinning the NDArray.
*/
void UnpinMemory_();
/*!
* \brief Create in-edge CSR format of the unit graph.
* \param inplace if true and the in-edge CSR format does not exist, the created
......
......@@ -99,6 +99,10 @@ DeviceAPI* DeviceAPI::Get(DGLContext ctx, bool allow_missing) {
static_cast<int>(ctx.device_type), allow_missing);
}
DeviceAPI* DeviceAPI::Get(DLDeviceType dev_type, bool allow_missing) {
return DeviceAPIManager::Get(static_cast<int>(dev_type), allow_missing);
}
void* DeviceAPI::AllocWorkspace(DGLContext ctx,
size_t size,
DGLType type_hint) {
......@@ -124,11 +128,11 @@ void DeviceAPI::SyncStreamFromTo(DGLContext ctx,
LOG(FATAL) << "Device does not support stream api.";
}
void DeviceAPI::PinData(DGLContext ctx, void* ptr, size_t nbytes) {
void DeviceAPI::PinData(void* ptr, size_t nbytes) {
LOG(FATAL) << "Device does not support cudaHostRegister api.";
}
void DeviceAPI::UnpinData(DGLContext ctx, void* ptr) {
void DeviceAPI::UnpinData(void* ptr) {
LOG(FATAL) << "Device does not support cudaHostUnregister api.";
}
} // namespace runtime
......
......@@ -174,13 +174,16 @@ class CUDADeviceAPI final : public DeviceAPI {
return static_cast<DGLStreamHandle>(CUDAThreadEntry::ThreadLocal()->stream);
}
void PinData(DGLContext ctx, void* ptr, size_t nbytes) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
/*! NOTE: cudaHostRegister can be called from an arbitrary GPU device,
* so we don't need to specify a ctx.
* The pinned memory can be seen by all CUDA contexts,
* not just the one that performed the allocation.
*/
void PinData(void* ptr, size_t nbytes) {
CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
}
void UnpinData(DGLContext ctx, void* ptr) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
void UnpinData(void* ptr) {
CUDA_CALL(cudaHostUnregister(ptr));
}
......
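A standalone sketch of the CUDA runtime calls behind PinData and UnpinData (requires the CUDA toolkit; link against cudart). Neither call needs cudaSetDevice: the registration is visible to every CUDA context, which is exactly why the ctx parameter could be dropped.

#include <cuda_runtime.h>
#include <cstddef>
#include <cstdlib>

int main() {
  const std::size_t nbytes = 1 << 20;
  void* buf = std::malloc(nbytes);
  // page-lock an ordinary heap allocation so the GPU can access it directly
  cudaError_t err = cudaHostRegister(buf, nbytes, cudaHostRegisterDefault);
  if (err != cudaSuccess) return 1;
  // ... buf is now pinned and zero-copy accessible from any GPU ...
  err = cudaHostUnregister(buf);  // must precede free()
  std::free(buf);
  return err == cudaSuccess ? 0 : 1;
}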
......@@ -63,6 +63,10 @@ struct NDArray::Internal {
} else if (ptr->mem) {
ptr->mem = nullptr;
} else if (ptr->dl_tensor.data != nullptr) {
// if the array is still pinned before freeing, unpin it.
if (ptr->dl_tensor.ctx.device_type == kDLCPUPinned) {
UnpinData(&(ptr->dl_tensor));
}
dgl::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx)->FreeDataSpace(
ptr->dl_tensor.ctx, ptr->dl_tensor.data);
}
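A minimal RAII sketch of the ordering the deleter enforces above: the registration must be released with the GPU runtime before the host allocation is freed, otherwise the driver retains a mapping to memory that no longer exists (PinnedBuffer is illustrative; real code calls cudaHostRegister and cudaHostUnregister).

#include <cstddef>
#include <cstdlib>

struct PinnedBuffer {
  void* data = nullptr;
  bool pinned = false;
  explicit PinnedBuffer(std::size_t nbytes) : data(std::malloc(nbytes)) {}
  void Pin()   { pinned = true;  }  // real code: cudaHostRegister(data, ...)
  void Unpin() { pinned = false; }  // real code: cudaHostUnregister(data)
  ~PinnedBuffer() {
    if (pinned) Unpin();            // unpin first ...
    std::free(data);                // ... then release the allocation
  }
};

int main() {
  PinnedBuffer buf(1 << 20);
  buf.Pin();
}  // destructor unpins, then frees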
......@@ -202,6 +206,19 @@ NDArray NDArray::EmptyShared(const std::string &name,
return ret;
}
inline DLContext GetDevice(DLContext ctx) {
switch (ctx.device_type) {
case kDLCPU:
case kDLGPU:
return ctx;
default:
// fall back to CPU, e.g., for kDLCPUPinned: pinned arrays are
// allocated as ordinary CPU memory and registered afterwards
return DLContext{kDLCPU, 0};
}
}
NDArray NDArray::Empty(std::vector<int64_t> shape,
DLDataType dtype,
DLContext ctx) {
......@@ -209,7 +226,7 @@ NDArray NDArray::Empty(std::vector<int64_t> shape,
if (td->IsAvailable())
return td->Empty(shape, dtype, ctx);
NDArray ret = Internal::Create(shape, dtype, ctx);
NDArray ret = Internal::Create(shape, dtype, GetDevice(ctx));
// setup memory content
size_t size = GetDataSize(ret.data_->dl_tensor);
size_t alignment = GetDataAlignment(ret.data_->dl_tensor);
......@@ -251,6 +268,22 @@ void NDArray::CopyFromTo(DLTensor* from,
from_size, from->ctx, to->ctx, from->dtype, stream);
}
void NDArray::PinData(DLTensor* tensor) {
// Only need to call PinData once, since the pinned memory can be seen
// by all CUDA contexts, not just the one that performed the allocation.
if (tensor->ctx.device_type == kDLCPUPinned) return;
CHECK_EQ(tensor->ctx.device_type, kDLCPU)
<< "Only NDArray on CPU can be pinned";
DeviceAPI::Get(kDLGPU)->PinData(tensor->data, GetDataSize(*tensor));
tensor->ctx = DLContext{kDLCPUPinned, 0};
}
void NDArray::UnpinData(DLTensor* tensor) {
if (tensor->ctx.device_type != kDLCPUPinned) return;
DeviceAPI::Get(kDLGPU)->UnpinData(tensor->data);
tensor->ctx = DLContext{kDLCPU, 0};
}
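The pin/unpin state machine above, condensed into a standalone sketch: pinnedness is recorded by re-tagging the tensor's context, repeated pins and unpins are no-ops, and pinning non-CPU memory is an error (TensorLike is an illustrative stand-in for DLTensor):

#include <stdexcept>

enum DLDeviceType { kDLCPU = 1, kDLGPU = 2, kDLCPUPinned = 3 };
struct TensorLike { DLDeviceType device_type = kDLCPU; };

void PinData(TensorLike* t) {
  if (t->device_type == kDLCPUPinned) return;  // already pinned: no-op
  if (t->device_type != kDLCPU)
    throw std::invalid_argument("only CPU tensors can be pinned");
  // real code: DeviceAPI::Get(kDLGPU)->PinData(data, nbytes);
  t->device_type = kDLCPUPinned;               // record the state in the ctx
}

void UnpinData(TensorLike* t) {
  if (t->device_type != kDLCPUPinned) return;  // not pinned: no-op
  // real code: DeviceAPI::Get(kDLGPU)->UnpinData(data);
  t->device_type = kDLCPU;
}

int main() {
  TensorLike t;
  PinData(&t);
  PinData(&t);    // idempotent
  UnpinData(&t);
  UnpinData(&t);  // idempotent
}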
template<typename T>
NDArray NDArray::FromVector(const std::vector<T>& vec, DLContext ctx) {
const DLDataType dtype = DLDataTypeTraits<T>::dtype;
......@@ -508,16 +541,13 @@ int DGLArrayCopyToBytes(DGLArrayHandle handle,
int DGLArrayPinData(DGLArrayHandle handle,
DLContext ctx) {
API_BEGIN();
CHECK_EQ(ctx.device_type, kDLGPU);
DeviceAPI::Get(ctx)->PinData(ctx, handle->data,
GetDataSize(*handle));
NDArray::PinData(handle);
API_END();
}
int DGLArrayUnpinData(DGLArrayHandle handle,
DLContext ctx) {
API_BEGIN();
CHECK_EQ(ctx.device_type, kDLGPU);
DeviceAPI::Get(ctx)->UnpinData(ctx, handle->data);
NDArray::UnpinData(handle);
API_END();
}
......@@ -952,6 +952,68 @@ def test_to_device2(g, idtype):
assert g1.etypes == g.etypes
assert g1.canonical_etypes == g.canonical_etypes
@unittest.skipIf(F._default_context_str == 'cpu', reason="Need gpu for this test")
@parametrize_dtype
def test_pin_memory_(idtype):
# TODO: rewrite this test case to accept different graphs so we
# can test reverse graph and batched graph
g = create_test_heterograph(idtype)
g.nodes['user'].data['h'] = F.ones((3, 5))
g.nodes['game'].data['i'] = F.ones((2, 5))
g.edges['plays'].data['e'] = F.ones((4, 4))
g = g.to(F.cpu())
assert not g.is_pinned()
if F.is_cuda_available():
# unpin an unpinned CPU graph, directly return
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
# pin a CPU graph
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
assert F.context(g.nodes['user'].data['h']) == F.cpu()
assert F.context(g.nodes['game'].data['i']) == F.cpu()
assert F.context(g.edges['plays'].data['e']) == F.cpu()
for ntype in g.ntypes:
assert F.context(g.batch_num_nodes(ntype)) == F.cpu()
for etype in g.canonical_etypes:
assert F.context(g.batch_num_edges(etype)) == F.cpu()
# not allowed to create new formats for the pinned graph
with pytest.raises(DGLError):
g.create_formats_()
# it's fine to clone with new formats, but new graphs are not pinned
# >>> g.formats()
# {'created': ['coo'], 'not created': ['csr', 'csc']}
assert not g.formats('csc').is_pinned()
assert not g.formats('csr').is_pinned()
# the 'coo' format is already created and thus not cloned
assert g.formats('coo').is_pinned()
# pin a pinned graph, directly return
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
# unpin a pinned graph
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
g1 = g.to(F.cuda())
# unpin an unpinned GPU graph, directly return
g1.unpin_memory_()
assert not g1.is_pinned()
assert g1.device == F.cuda()
# error pinning a GPU graph
with pytest.raises(DGLError):
g1.pin_memory_()
@parametrize_dtype
def test_convert_bound(idtype):
def _test_bipartite_bound(data, card):
......