Unverified commit cded5b80 authored by Xin Yao, committed by GitHub

[Feature] Bump DLPack to v0.7 and decouple DLPack from the core library (#4454)

* rename `DLContext` to `DGLContext`

* rename `kDLGPU` to `kDLCUDA`

* replace DLTensor with DGLArray

* fix linting

* Unify DGLType and DLDataType to DGLDataType

* Fix FFI

* rename DLDeviceType to DGLDeviceType

* decouple dlpack from the core library

* fix bug

* fix lint

* fix merge

* fix build

* address comments

* rename dl_converter to dlpack_convert

* remove redundant comments
parent f1689ad0
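
For downstream code the renames are mechanical: the DGL-owned spellings replace the old DLPack ones, following DLPack v0.7's own kDLGPU -> kDLCUDA rename. A minimal sketch of the correspondence, assuming the new DGL types keep the old struct layouts; the aliases below are illustrative only and are not shipped by this commit:

// Hypothetical aliases for migration notes only; the commit itself is a
// textual rename and does not define these.
using OldDLContext = DGLContext;        // was DLContext {device_type, device_id}
using OldDLDeviceType = DGLDeviceType;  // was DLDeviceType
using OldDLDataType = DGLDataType;      // was DLDataType (and DGL's own DGLType)
// Enum values: kDLCPU -> kDGLCPU, kDLGPU -> kDGLCUDA,
//              kDLInt -> kDGLInt, kDLFloat -> kDGLFloat.
DGLContext cpu_ctx{kDGLCPU, 0};         // a CPU context under the new names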
@@ -65,7 +65,7 @@ bool has_edge_between(const CSRMatrix &csr, dgl_id_t u,
  * as well as whether to terminate.
  */
-template <DLDeviceType XPU, typename IdxType>
+template <DGLDeviceType XPU, typename IdxType>
 std::tuple<dgl_id_t, dgl_id_t, bool> Node2vecRandomWalkStep(
     IdxType *data, dgl_id_t curr, dgl_id_t pre, const double p, const double q,
     int64_t len, const CSRMatrix &csr, bool csr_has_data, const FloatArray &probs,
@@ -146,7 +146,7 @@ std::tuple<dgl_id_t, dgl_id_t, bool> Node2vecRandomWalkStep(
   return std::make_tuple(next_node, eid, terminate(data, next_node, len));
 }
-template <DLDeviceType XPU, typename IdxType>
+template <DGLDeviceType XPU, typename IdxType>
 std::pair<IdArray, IdArray> Node2vecRandomWalk(
     const HeteroGraphPtr g, const IdArray seeds,
     const double p, const double q,
......
@@ -23,7 +23,7 @@ namespace sampling {
 namespace impl {
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::pair<IdArray, IdArray> RandomWalk(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -37,13 +37,13 @@ std::pair<IdArray, IdArray> RandomWalk(
   return MetapathBasedRandomWalk<XPU, IdxType>(hg, seeds, metapath, prob, terminate);
 }
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
     const IdArray src,
     const IdArray dst,
     const int64_t num_samples_per_node,
     const int64_t k) {
-  CHECK(src->ctx.device_type == kDLCPU) << "IdArray needs be on CPU!";
+  CHECK(src->ctx.device_type == kDGLCPU) << "IdArray needs be on CPU!";
   int64_t len = src->shape[0] / num_samples_per_node;
   IdxType* src_data = src.Ptr<IdxType>();
   const IdxType* dst_data = dst.Ptr<IdxType>();
@@ -90,43 +90,43 @@ std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
   device->CopyDataFromTo(static_cast<IdxType*>(res_src_vec.data()), 0,
       res_src.Ptr<IdxType>(), 0,
       sizeof(IdxType) * res_src_vec.size(),
-      DGLContext{kDLCPU, 0}, res_src->ctx,
+      DGLContext{kDGLCPU, 0}, res_src->ctx,
       res_src->dtype);
   device->CopyDataFromTo(static_cast<IdxType*>(res_dst_vec.data()), 0,
       res_dst.Ptr<IdxType>(), 0,
       sizeof(IdxType) * res_dst_vec.size(),
-      DGLContext{kDLCPU, 0}, res_dst->ctx,
+      DGLContext{kDGLCPU, 0}, res_dst->ctx,
       res_dst->dtype);
   device->CopyDataFromTo(static_cast<IdxType*>(res_cnt_vec.data()), 0,
       res_cnt.Ptr<IdxType>(), 0,
      sizeof(IdxType) * res_cnt_vec.size(),
-      DGLContext{kDLCPU, 0}, res_cnt->ctx,
+      DGLContext{kDGLCPU, 0}, res_cnt->ctx,
       res_cnt->dtype);
   return std::make_tuple(res_src, res_dst, res_cnt);
 }
 template
-std::pair<IdArray, IdArray> RandomWalk<kDLCPU, int32_t>(
+std::pair<IdArray, IdArray> RandomWalk<kDGLCPU, int32_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob);
 template
-std::pair<IdArray, IdArray> RandomWalk<kDLCPU, int64_t>(
+std::pair<IdArray, IdArray> RandomWalk<kDGLCPU, int64_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob);
 template
-std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors<kDLCPU, int32_t>(
+std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors<kDGLCPU, int32_t>(
     const IdArray src,
     const IdArray dst,
     const int64_t num_samples_per_node,
     const int64_t k);
 template
-std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors<kDLCPU, int64_t>(
+std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors<kDGLCPU, int64_t>(
     const IdArray src,
     const IdArray dst,
     const int64_t num_samples_per_node,
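
SelectPinSageNeighbors above stages its results in std::vectors and then copies them into freshly allocated arrays through the runtime DeviceAPI. A condensed, self-contained sketch of that copy pattern under the new names (the buffer and array names are illustrative, not taken from the diff):

// Sketch: copy a host vector into a newly allocated CPU IdArray.
std::vector<int64_t> host_vals{1, 2, 3};
IdArray out = IdArray::Empty(
    {static_cast<int64_t>(host_vals.size())},
    DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
auto* device = dgl::runtime::DeviceAPI::Get(out->ctx);
device->CopyDataFromTo(
    host_vals.data(), 0,              // source buffer, byte offset
    out.Ptr<int64_t>(), 0,            // destination buffer, byte offset
    sizeof(int64_t) * host_vals.size(),
    DGLContext{kDGLCPU, 0},           // source context
    out->ctx,                         // destination context
    out->dtype);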
......
@@ -168,7 +168,7 @@ __global__ void _RandomWalkBiasedKernel(
 }  // namespace
 // random walk for uniform choice
-template<DLDeviceType XPU, typename IdType>
+template<DGLDeviceType XPU, typename IdType>
 std::pair<IdArray, IdArray> RandomWalkUniform(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -205,7 +205,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
   // copy graph metadata pointers to GPU
   device->CopyDataFromTo(h_graphs.data(), 0, d_graphs, 0,
       (num_etypes) * sizeof(GraphKernelData<IdType>),
-      DGLContext{kDLCPU, 0},
+      DGLContext{kDGLCPU, 0},
       ctx,
       hg->GetCSRMatrix(0).indptr->dtype);
   // copy metapath to GPU
@@ -218,7 +218,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
   dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
   ATEN_FLOAT_TYPE_SWITCH(restart_prob->dtype, FloatType, "random walk GPU kernel", {
-    CHECK(restart_prob->ctx.device_type == kDLGPU) << "restart prob should be in GPU.";
+    CHECK(restart_prob->ctx.device_type == kDGLCUDA) << "restart prob should be in GPU.";
     CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
     const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
     const int64_t restart_prob_size = restart_prob->shape[0];
@@ -246,7 +246,7 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
  * \brief Random walk for biased choice. We use inverse transform sampling to
  * choose the next step.
  */
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
 std::pair<IdArray, IdArray> RandomWalkBiased(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -321,7 +321,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
       device->AllocWorkspace(ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
   device->CopyDataFromTo(h_graphs.data(), 0, d_graphs, 0,
       (num_etypes) * sizeof(GraphKernelData<IdType>),
-      DGLContext{kDLCPU, 0},
+      DGLContext{kDGLCPU, 0},
       ctx,
       hg->GetCSRMatrix(0).indptr->dtype);
   // copy probs pointers to GPU
@@ -329,7 +329,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
       device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *)));
   device->CopyDataFromTo(probs.get(), 0, probs_dev, 0,
       (num_etypes) * sizeof(FloatType *),
-      DGLContext{kDLCPU, 0},
+      DGLContext{kDGLCPU, 0},
       ctx,
       prob[0]->dtype);
   // copy probs_sum pointers to GPU
@@ -337,7 +337,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
       device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *)));
   device->CopyDataFromTo(prob_sums.get(), 0, prob_sums_dev, 0,
       (num_etypes) * sizeof(FloatType *),
-      DGLContext{kDLCPU, 0},
+      DGLContext{kDGLCPU, 0},
       ctx,
       prob[0]->dtype);
   // copy metapath to GPU
@@ -349,7 +349,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
   dim3 block(256);
   dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
-  CHECK(restart_prob->ctx.device_type == kDLGPU) << "restart prob should be in GPU.";
+  CHECK(restart_prob->ctx.device_type == kDGLCUDA) << "restart prob should be in GPU.";
   CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
   const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
   const int64_t restart_prob_size = restart_prob->shape[0];
@@ -376,7 +376,7 @@ std::pair<IdArray, IdArray> RandomWalkBiased(
   return std::make_pair(traces, eids);
 }
-template<DLDeviceType XPU, typename IdType>
+template<DGLDeviceType XPU, typename IdType>
 std::pair<IdArray, IdArray> RandomWalk(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -392,7 +392,7 @@ std::pair<IdArray, IdArray> RandomWalk(
   }
   auto restart_prob = NDArray::Empty(
-      {0}, DLDataType{kDLFloat, 32, 1}, DGLContext{XPU, 0});
+      {0}, DGLDataType{kDGLFloat, 32, 1}, DGLContext{XPU, 0});
   if (!isUniform) {
     std::pair<IdArray, IdArray> ret;
     ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
@@ -404,7 +404,7 @@ std::pair<IdArray, IdArray> RandomWalk(
   }
 }
-template<DLDeviceType XPU, typename IdType>
+template<DGLDeviceType XPU, typename IdType>
 std::pair<IdArray, IdArray> RandomWalkWithRestart(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -422,7 +422,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
   auto device_ctx = seeds->ctx;
   auto restart_prob_array = NDArray::Empty(
-      {1}, DLDataType{kDLFloat, 64, 1}, device_ctx);
+      {1}, DGLDataType{kDGLFloat, 64, 1}, device_ctx);
   auto device = dgl::runtime::DeviceAPI::Get(device_ctx);
   // use cuda stream from local thread
@@ -430,7 +430,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
   device->CopyDataFromTo(
       &restart_prob, 0, restart_prob_array.Ptr<double>(), 0,
       sizeof(double),
-      DGLContext{kDLCPU, 0}, device_ctx,
+      DGLContext{kDGLCPU, 0}, device_ctx,
       restart_prob_array->dtype);
   device->StreamSync(device_ctx, stream);
@@ -446,7 +446,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
   }
 }
-template<DLDeviceType XPU, typename IdType>
+template<DGLDeviceType XPU, typename IdType>
 std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -473,13 +473,13 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
   }
 }
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
     const IdArray src,
     const IdArray dst,
     const int64_t num_samples_per_node,
     const int64_t k) {
-  CHECK(src->ctx.device_type == kDLGPU) <<
+  CHECK(src->ctx.device_type == kDGLCUDA) <<
     "IdArray needs be on GPU!";
   const IdxType* src_data = src.Ptr<IdxType>();
   const IdxType* dst_data = dst.Ptr<IdxType>();
@@ -495,27 +495,27 @@ std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
 }
 template
-std::pair<IdArray, IdArray> RandomWalk<kDLGPU, int32_t>(
+std::pair<IdArray, IdArray> RandomWalk<kDGLCUDA, int32_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob);
 template
-std::pair<IdArray, IdArray> RandomWalk<kDLGPU, int64_t>(
+std::pair<IdArray, IdArray> RandomWalk<kDGLCUDA, int64_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob);
 template
-std::pair<IdArray, IdArray> RandomWalkWithRestart<kDLGPU, int32_t>(
+std::pair<IdArray, IdArray> RandomWalkWithRestart<kDGLCUDA, int32_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob,
     double restart_prob);
 template
-std::pair<IdArray, IdArray> RandomWalkWithRestart<kDLGPU, int64_t>(
+std::pair<IdArray, IdArray> RandomWalkWithRestart<kDGLCUDA, int64_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
@@ -523,14 +523,14 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart<kDLGPU, int64_t>(
     double restart_prob);
 template
-std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDLGPU, int32_t>(
+std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDGLCUDA, int32_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob,
     FloatArray restart_prob);
 template
-std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDLGPU, int64_t>(
+std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDGLCUDA, int64_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
@@ -538,13 +538,13 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDLGPU, int64_t>(
     FloatArray restart_prob);
 template
-std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors<kDLGPU, int32_t>(
+std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors<kDGLCUDA, int32_t>(
     const IdArray src,
     const IdArray dst,
     const int64_t num_samples_per_node,
     const int64_t k);
 template
-std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors<kDLGPU, int64_t>(
+std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors<kDGLCUDA, int64_t>(
     const IdArray src,
     const IdArray dst,
     const int64_t num_samples_per_node,
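
RandomWalkBiased above documents its choice rule as inverse transform sampling; the kernel consumes the per-etype prefix sums (the probs_sum pointers copied to the GPU). Conceptually the per-step selection reduces to the following host-side sketch (not the kernel itself):

#include <algorithm>
#include <vector>

// Given prefix sums s[i] = w0 + ... + wi over non-negative neighbor weights
// and a uniform draw u in [0, s.back()), return the first i with s[i] > u.
int SampleByInverseTransform(const std::vector<double>& prefix_sum, double u) {
  auto it = std::upper_bound(prefix_sum.begin(), prefix_sum.end(), u);
  return static_cast<int>(it - prefix_sum.begin());
}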
......
@@ -22,7 +22,7 @@ namespace sampling {
 namespace impl {
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::pair<IdArray, IdArray> RandomWalkWithRestart(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -37,21 +37,21 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
 }
 template
-std::pair<IdArray, IdArray> RandomWalkWithRestart<kDLCPU, int32_t>(
+std::pair<IdArray, IdArray> RandomWalkWithRestart<kDGLCPU, int32_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob,
     double restart_prob);
 template
-std::pair<IdArray, IdArray> RandomWalkWithRestart<kDLCPU, int64_t>(
+std::pair<IdArray, IdArray> RandomWalkWithRestart<kDGLCPU, int64_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob,
     double restart_prob);
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -73,14 +73,14 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
 }
 template
-std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDLCPU, int32_t>(
+std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDGLCPU, int32_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
     const std::vector<FloatArray> &prob,
     FloatArray restart_prob);
 template
-std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDLCPU, int64_t>(
+std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart<kDGLCPU, int64_t>(
     const HeteroGraphPtr hg,
     const IdArray seeds,
     const TypeArray metapath,
......
@@ -36,7 +36,7 @@ void CheckRandomWalkInputs(
   // CHECK_SAME_CONTEXT(seeds, metapath);
   if (hg->IsPinned()) {
-    CHECK_EQ(seeds->ctx.device_type, kDLGPU) << "Expected seeds (" << seeds->ctx << ")" \
+    CHECK_EQ(seeds->ctx.device_type, kDGLCUDA) << "Expected seeds (" << seeds->ctx << ")" \
       << " to be on the GPU when the graph is pinned.";
   } else if (hg->Context() != seeds->ctx) {
     LOG(FATAL) << "Expected seeds (" << seeds->ctx << ")" << " to have the same " \
......
@@ -35,7 +35,7 @@ namespace {
  * \return A 2D array of shape (len(seeds), max_num_steps + 1) with node IDs.
  * \note The graph itself should be bounded in the closure of \c step.
  */
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::pair<IdArray, IdArray> GenericRandomWalk(
     const IdArray seeds,
     int64_t max_num_steps,
......
@@ -38,7 +38,7 @@ using StepFunc = std::function<
  * \brief Get the node types traversed by the metapath.
  * \return A 1D array of shape (len(metapath) + 1,) with node type IDs.
  */
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 TypeArray GetNodeTypesFromMetapath(
     const HeteroGraphPtr hg,
     const TypeArray metapath);
@@ -58,7 +58,7 @@ TypeArray GetNodeTypesFromMetapath(
  * \note This function should be called together with GetNodeTypesFromMetapath to
  * determine the node type of each node in the random walk traces.
  */
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::pair<IdArray, IdArray> RandomWalk(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -81,7 +81,7 @@ std::pair<IdArray, IdArray> RandomWalk(
  * \note This function should be called together with GetNodeTypesFromMetapath to
  * determine the node type of each node in the random walk traces.
  */
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::pair<IdArray, IdArray> RandomWalkWithRestart(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -107,7 +107,7 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
  * \note This function should be called together with GetNodeTypesFromMetapath to
  * determine the node type of each node in the random walk traces.
  */
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
     const HeteroGraphPtr hg,
     const IdArray seeds,
@@ -115,7 +115,7 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
     const std::vector<FloatArray> &prob,
     FloatArray restart_prob);
-template<DLDeviceType XPU, typename IdxType>
+template<DGLDeviceType XPU, typename IdxType>
 std::tuple<IdArray, IdArray, IdArray> SelectPinSageNeighbors(
     const IdArray src,
     const IdArray dst,
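
As the header notes, a metapath of L edge types determines L + 1 node types. A conceptual sketch of the GetNodeTypesFromMetapath contract (not the actual implementation; assumes a non-empty metapath): the source type of the first edge type, followed by the destination type of each edge type in order.

// Conceptual sketch of the contract only.
std::vector<dgl_type_t> NodeTypesOfMetapath(
    const HeteroGraphPtr& hg, const std::vector<dgl_type_t>& metapath) {
  std::vector<dgl_type_t> ntypes;
  // FindEdge(etype) returns the (src vtype, dst vtype) pair of an edge type.
  ntypes.push_back(hg->meta_graph()->FindEdge(metapath.front()).first);
  for (dgl_type_t etype : metapath)
    ntypes.push_back(hg->meta_graph()->FindEdge(etype).second);
  return ntypes;  // length == len(metapath) + 1
}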
......
 /*!
- * Copyright (c) 2020 by Contributors
+ * Copyright (c) 2020-2022 by Contributors
  * \file graph/serailize/zerocopy_serializer.cc
  * \brief serializer implementation.
  */
@@ -13,38 +13,9 @@ namespace dgl {
 using dgl::runtime::NDArray;
-struct RawDataTensorCtx {
-  std::vector<int64_t> shape;
-  std::vector<int64_t> stride;
-  DLManagedTensor tensor;
-};
-void RawDataTensoDLPackDeleter(DLManagedTensor* tensor) {
-  auto ctx = static_cast<RawDataTensorCtx*>(tensor->manager_ctx);
-  delete[] ctx->tensor.dl_tensor.data;
-  delete ctx;
-}
-NDArray CreateNDArrayFromRawData(std::vector<int64_t> shape, DLDataType dtype,
-    DLContext ctx, void* raw) {
-  auto dlm_tensor_ctx = new RawDataTensorCtx();
-  DLManagedTensor* dlm_tensor = &dlm_tensor_ctx->tensor;
-  dlm_tensor_ctx->shape = shape;
-  dlm_tensor->manager_ctx = dlm_tensor_ctx;
-  dlm_tensor->dl_tensor.shape = dmlc::BeginPtr(dlm_tensor_ctx->shape);
-  dlm_tensor->dl_tensor.ctx = ctx;
-  dlm_tensor->dl_tensor.ndim = static_cast<int>(shape.size());
-  dlm_tensor->dl_tensor.dtype = dtype;
-  dlm_tensor_ctx->stride.resize(dlm_tensor->dl_tensor.ndim, 1);
-  for (int i = dlm_tensor->dl_tensor.ndim - 2; i >= 0; --i) {
-    dlm_tensor_ctx->stride[i] =
-        dlm_tensor_ctx->shape[i + 1] * dlm_tensor_ctx->stride[i + 1];
-  }
-  dlm_tensor->dl_tensor.strides = dmlc::BeginPtr(dlm_tensor_ctx->stride);
-  dlm_tensor->dl_tensor.data = raw;
-  dlm_tensor->deleter = RawDataTensoDLPackDeleter;
-  return NDArray::FromDLPack(dlm_tensor);
+NDArray CreateNDArrayFromRawData(std::vector<int64_t> shape, DGLDataType dtype,
+    DGLContext ctx, void* raw) {
+  return NDArray::CreateFromRaw(shape, dtype, ctx, raw, true);
 }
 void StreamWithBuffer::PushNDArray(const NDArray& tensor) {
@@ -89,18 +60,18 @@ void StreamWithBuffer::PushNDArray(const NDArray& tensor) {
 NDArray StreamWithBuffer::PopNDArray() {
 #ifndef _WIN32
   int ndim;
-  DLDataType dtype;
+  DGLDataType dtype;
-  CHECK(this->Read(&ndim)) << "Invalid DLTensor file format";
-  CHECK(this->Read(&dtype)) << "Invalid DLTensor file format";
+  CHECK(this->Read(&ndim)) << "Invalid DGLArray file format";
+  CHECK(this->Read(&dtype)) << "Invalid DGLArray file format";
   std::vector<int64_t> shape(ndim);
   if (ndim != 0) {
-    CHECK(this->ReadArray(&shape[0], ndim)) << "Invalid DLTensor file format";
+    CHECK(this->ReadArray(&shape[0], ndim)) << "Invalid DGLArray file format";
   }
-  DLContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
+  DGLContext cpu_ctx;
+  cpu_ctx.device_type = kDGLCPU;
   cpu_ctx.device_id = 0;
   bool is_shared_mem;
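
The replacement collapses the hand-rolled DLManagedTensor plumbing into a single NDArray::CreateFromRaw call; the trailing true plausibly tells the array to take ownership of raw, mirroring the deleted deleter's delete[]. For reference, the stride computation the removed code performed (compact row-major, counted in elements) was:

// Row-major strides as the deleted RawDataTensorCtx code computed them:
// innermost stride is 1, each outer stride is the product of inner extents.
std::vector<int64_t> stride(shape.size(), 1);
for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i)
  stride[i] = shape[i + 1] * stride[i + 1];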
......
@@ -33,7 +33,7 @@ namespace dgl {
 template <>
 NDArray SharedMemManager::CopyToSharedMem<NDArray>(const NDArray &data,
     std::string name) {
-  DLContext ctx = {kDLCPU, 0};
+  DGLContext ctx = {kDGLCPU, 0};
   std::vector<int64_t> shape(data->shape, data->shape + data->ndim);
   strm_->Write(data->ndim);
   strm_->Write(data->dtype);
@@ -83,15 +83,15 @@ template <>
 bool SharedMemManager::CreateFromSharedMem<NDArray>(NDArray *nd,
     std::string name) {
   int ndim;
-  DLContext ctx = {kDLCPU, 0};
-  DLDataType dtype;
+  DGLContext ctx = {kDGLCPU, 0};
+  DGLDataType dtype;
-  CHECK(this->Read(&ndim)) << "Invalid DLTensor file format";
-  CHECK(this->Read(&dtype)) << "Invalid DLTensor file format";
+  CHECK(this->Read(&ndim)) << "Invalid DGLArray file format";
+  CHECK(this->Read(&dtype)) << "Invalid DGLArray file format";
   std::vector<int64_t> shape(ndim);
   if (ndim != 0) {
-    CHECK(this->ReadArray(&shape[0], ndim)) << "Invalid DLTensor file format";
+    CHECK(this->ReadArray(&shape[0], ndim)) << "Invalid DGLArray file format";
   }
   bool is_null;
   this->Read(&is_null);
......
@@ -13,7 +13,7 @@ HeteroSubgraph InEdgeGraphRelabelNodes(
   CHECK_EQ(vids.size(), graph->NumVertexTypes())
       << "Invalid input: the input list size must be the same as the number of vertex types.";
   std::vector<IdArray> eids(graph->NumEdgeTypes());
-  DLContext ctx = aten::GetContextOf(vids);
+  DGLContext ctx = aten::GetContextOf(vids);
   for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
     auto pair = graph->meta_graph()->FindEdge(etype);
     const dgl_type_t dst_vtype = pair.second;
@@ -34,7 +34,7 @@ HeteroSubgraph InEdgeGraphNoRelabelNodes(
       << "Invalid input: the input list size must be the same as the number of vertex types.";
   std::vector<HeteroGraphPtr> subrels(graph->NumEdgeTypes());
   std::vector<IdArray> induced_edges(graph->NumEdgeTypes());
-  DLContext ctx = aten::GetContextOf(vids);
+  DGLContext ctx = aten::GetContextOf(vids);
   for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
     auto pair = graph->meta_graph()->FindEdge(etype);
     const dgl_type_t src_vtype = pair.first;
@@ -79,7 +79,7 @@ HeteroSubgraph OutEdgeGraphRelabelNodes(
   CHECK_EQ(vids.size(), graph->NumVertexTypes())
       << "Invalid input: the input list size must be the same as the number of vertex types.";
   std::vector<IdArray> eids(graph->NumEdgeTypes());
-  DLContext ctx = aten::GetContextOf(vids);
+  DGLContext ctx = aten::GetContextOf(vids);
   for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
     auto pair = graph->meta_graph()->FindEdge(etype);
     const dgl_type_t src_vtype = pair.first;
@@ -100,7 +100,7 @@ HeteroSubgraph OutEdgeGraphNoRelabelNodes(
       << "Invalid input: the input list size must be the same as the number of vertex types.";
   std::vector<HeteroGraphPtr> subrels(graph->NumEdgeTypes());
   std::vector<IdArray> induced_edges(graph->NumEdgeTypes());
-  DLContext ctx = aten::GetContextOf(vids);
+  DGLContext ctx = aten::GetContextOf(vids);
   for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
     auto pair = graph->meta_graph()->FindEdge(etype);
     const dgl_type_t src_vtype = pair.first;
......
@@ -140,7 +140,7 @@ CompactGraphsCPU(
 template<>
 std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
-CompactGraphs<kDLCPU, int32_t>(
+CompactGraphs<kDGLCPU, int32_t>(
     const std::vector<HeteroGraphPtr> &graphs,
     const std::vector<IdArray> &always_preserve) {
   return CompactGraphsCPU<int32_t>(graphs, always_preserve);
@@ -148,7 +148,7 @@ CompactGraphs<kDLCPU, int32_t>(
 template<>
 std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
-CompactGraphs<kDLCPU, int64_t>(
+CompactGraphs<kDGLCPU, int64_t>(
     const std::vector<HeteroGraphPtr> &graphs,
     const std::vector<IdArray> &always_preserve) {
   return CompactGraphsCPU<int64_t>(graphs, always_preserve);
......
@@ -41,7 +41,7 @@ namespace transform {
  *
  * @return The vector of compacted graphs and the vector of induced nodes.
  */
-template<DLDeviceType XPU, typename IdType>
+template<DGLDeviceType XPU, typename IdType>
 std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
 CompactGraphs(
     const std::vector<HeteroGraphPtr> &graphs,
......
@@ -304,7 +304,7 @@ void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
 }
 }  // namespace impl
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void KNN(const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm) {
@@ -319,7 +319,7 @@ void KNN(const NDArray& data_points, const IdArray& data_offsets,
   }
 }
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void NNDescent(const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta) {
@@ -567,36 +567,36 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
   device->FreeWorkspace(ctx, flags);
 }
-template void KNN<kDLCPU, float, int32_t>(
+template void KNN<kDGLCPU, float, int32_t>(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
    const int k, IdArray result, const std::string& algorithm);
-template void KNN<kDLCPU, float, int64_t>(
+template void KNN<kDGLCPU, float, int64_t>(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm);
-template void KNN<kDLCPU, double, int32_t>(
+template void KNN<kDGLCPU, double, int32_t>(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm);
-template void KNN<kDLCPU, double, int64_t>(
+template void KNN<kDGLCPU, double, int64_t>(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm);
-template void NNDescent<kDLCPU, float, int32_t>(
+template void NNDescent<kDGLCPU, float, int32_t>(
    const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
-template void NNDescent<kDLCPU, float, int64_t>(
+template void NNDescent<kDGLCPU, float, int64_t>(
     const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
-template void NNDescent<kDLCPU, double, int32_t>(
+template void NNDescent<kDGLCPU, double, int32_t>(
     const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
-template void NNDescent<kDLCPU, double, int64_t>(
+template void NNDescent<kDGLCPU, double, int64_t>(
     const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
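
The template void KNN<...> / NNDescent<...> lines above are explicit instantiations: the XPU parameter is a device enum resolved at link time, so each translation unit emits the combinations it owns. A minimal sketch of the pattern; the Frobnicate name is invented for illustration:

// shared header: declaration only
template <DGLDeviceType XPU, typename FloatType>
void Frobnicate(const NDArray& points);

// CPU translation unit: definition plus the CPU instantiations
template <DGLDeviceType XPU, typename FloatType>
void Frobnicate(const NDArray& points) { /* CPU implementation */ }
template void Frobnicate<kDGLCPU, float>(const NDArray& points);
template void Frobnicate<kDGLCPU, double>(const NDArray& points);
// The .cu file would provide Frobnicate<kDGLCUDA, ...> the same way.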
......
@@ -70,7 +70,7 @@ void BuildNodeMaps(
   for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
     const IdArray& nodes = input_nodes[ntype];
     if (nodes->shape[0] > 0) {
-      CHECK_EQ(nodes->ctx.device_type, kDLGPU);
+      CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
       node_maps->LhsHashTable(ntype).FillWithDuplicates(
           nodes.Ptr<IdType>(),
           nodes->shape[0],
@@ -92,7 +92,7 @@ CompactGraphsGPU(
   auto device = runtime::DeviceAPI::Get(ctx);
   cudaStream_t stream = runtime::getCurrentCUDAStream();
-  CHECK_EQ(ctx.device_type, kDLGPU);
+  CHECK_EQ(ctx.device_type, kDGLCUDA);
   // Step 1: Collect the nodes that has connections for each type.
   const uint64_t num_ntypes = graphs[0]->NumVertexTypes();
@@ -206,8 +206,8 @@ CompactGraphsGPU(
       num_induced_nodes.data(), 0,
       sizeof(*num_induced_nodes.data())*num_ntypes,
       ctx,
-      DGLContext{kDLCPU, 0},
-      DGLType{kDLInt, 64, 1});
+      DGLContext{kDGLCPU, 0},
+      DGLDataType{kDGLInt, 64, 1});
   device->StreamSync(ctx, stream);
   // wait for the node counts to finish transferring
@@ -255,7 +255,7 @@ CompactGraphsGPU(
 template<>
 std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
-CompactGraphs<kDLGPU, int32_t>(
+CompactGraphs<kDGLCUDA, int32_t>(
     const std::vector<HeteroGraphPtr> &graphs,
     const std::vector<IdArray> &always_preserve) {
   return CompactGraphsGPU<int32_t>(graphs, always_preserve);
@@ -263,7 +263,7 @@ CompactGraphs<kDLGPU, int32_t>(
 template<>
 std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
-CompactGraphs<kDLGPU, int64_t>(
+CompactGraphs<kDGLCUDA, int64_t>(
     const std::vector<HeteroGraphPtr> &graphs,
     const std::vector<IdArray> &always_preserve) {
   return CompactGraphsGPU<int64_t>(graphs, always_preserve);
......
@@ -250,9 +250,9 @@ MapEdges(
           node_map.RhsHashTable(dst_type).DeviceHandle());
     } else {
       new_lhs.emplace_back(
-          aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
+          aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx));
       new_rhs.emplace_back(
-          aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
+          aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx));
     }
   }
......
@@ -82,7 +82,7 @@ class DeviceNodeMapMaker {
     for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
       const IdArray& nodes = lhs_nodes[ntype];
       if (nodes->shape[0] > 0) {
-        CHECK_EQ(nodes->ctx.device_type, kDLGPU);
+        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
         node_maps->LhsHashTable(ntype).FillWithDuplicates(
             nodes.Ptr<IdType>(),
             nodes->shape[0],
@@ -127,7 +127,7 @@ class DeviceNodeMapMaker {
     for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
       const IdArray& nodes = lhs_nodes[ntype];
       if (nodes->shape[0] > 0) {
-        CHECK_EQ(nodes->ctx.device_type, kDLGPU);
+        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
         node_maps->LhsHashTable(ntype).FillWithUnique(
             nodes.Ptr<IdType>(),
             nodes->shape[0],
@@ -154,7 +154,7 @@ class DeviceNodeMapMaker {
 // Since partial specialization is not allowed for functions, use this as an
-// intermediate for ToBlock where XPU = kDLGPU.
+// intermediate for ToBlock where XPU = kDGLCUDA.
 template<typename IdType>
 std::tuple<HeteroGraphPtr, std::vector<IdArray>>
 ToBlockGPU(
@@ -170,7 +170,7 @@ ToBlockGPU(
   auto device = runtime::DeviceAPI::Get(ctx);
   cudaStream_t stream = runtime::getCurrentCUDAStream();
-  CHECK_EQ(ctx.device_type, kDLGPU);
+  CHECK_EQ(ctx.device_type, kDGLCUDA);
   for (const auto& nodes : rhs_nodes) {
     CHECK_EQ(ctx.device_type, nodes->ctx.device_type);
   }
@@ -296,8 +296,8 @@ ToBlockGPU(
       num_nodes_per_type.data(), 0,
      sizeof(*num_nodes_per_type.data())*num_ntypes,
       ctx,
-      DGLContext{kDLCPU, 0},
-      DGLType{kDLInt, 64, 1});
+      DGLContext{kDGLCPU, 0},
+      DGLDataType{kDGLInt, 64, 1});
   device->StreamSync(ctx, stream);
   // wait for the node counts to finish transferring
@@ -321,7 +321,7 @@ ToBlockGPU(
       induced_edges.push_back(edge_arrays[etype].id);
     } else {
       induced_edges.push_back(
-          aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
+          aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx));
     }
   }
@@ -358,8 +358,8 @@ ToBlockGPU(
       // No rhs nodes are given for this edge type. Create an empty graph.
       rel_graphs.push_back(CreateFromCOO(
           2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0],
-          aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx),
-          aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx)));
+          aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx),
+          aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx)));
     } else {
       rel_graphs.push_back(CreateFromCOO(
           2,
@@ -383,7 +383,7 @@ ToBlockGPU(
 // functions are the same.
 // Using template<> fails to export the symbols.
 std::tuple<HeteroGraphPtr, std::vector<IdArray>>
-// ToBlock<kDLGPU, int32_t>
+// ToBlock<kDGLCUDA, int32_t>
 ToBlockGPU32(
     HeteroGraphPtr graph,
     const std::vector<IdArray> &rhs_nodes,
@@ -393,7 +393,7 @@ ToBlockGPU32(
 }
 std::tuple<HeteroGraphPtr, std::vector<IdArray>>
-// ToBlock<kDLGPU, int64_t>
+// ToBlock<kDGLCUDA, int64_t>
 ToBlockGPU64(
     HeteroGraphPtr graph,
     const std::vector<IdArray> &rhs_nodes,
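
CompactGraphsGPU and ToBlockGPU above end their counting phases the same way: a small device buffer is read back to the host with CopyDataFromTo (source context ctx on the GPU, destination DGLContext{kDGLCPU, 0}) and StreamSync blocks until the transfer lands. A condensed sketch of that readback; d_counts, num_ntypes, ctx, stream and device stand in for the corresponding names in the surrounding functions:

// Sketch: read num_ntypes 64-bit counts from device memory back to the host.
std::vector<int64_t> h_counts(num_ntypes);
device->CopyDataFromTo(
    d_counts, 0,                   // source: device buffer, byte offset
    h_counts.data(), 0,            // destination: host buffer
    sizeof(int64_t) * num_ntypes,
    ctx,                           // source context (the GPU)
    DGLContext{kDGLCPU, 0},        // destination context (the CPU)
    DGLDataType{kDGLInt, 64, 1});
device->StreamSync(ctx, stream);   // counts are valid only after the sync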
......
@@ -528,11 +528,11 @@ void BruteForceKNNSharedCuda(const NDArray& data_points, const IdArray& data_off
   int64_t num_blocks = 0, final_elem = 0, copyoffset = (batch_size - 1) * sizeof(IdType);
   device->CopyDataFromTo(
       num_block_prefixsum, copyoffset, &num_blocks, 0,
-      sizeof(IdType), ctx, DLContext{kDLCPU, 0},
+      sizeof(IdType), ctx, DGLContext{kDGLCPU, 0},
       query_offsets->dtype);
   device->CopyDataFromTo(
       num_block_per_segment, copyoffset, &final_elem, 0,
-      sizeof(IdType), ctx, DLContext{kDLCPU, 0},
+      sizeof(IdType), ctx, DGLContext{kDGLCPU, 0},
       query_offsets->dtype);
   num_blocks += final_elem;
   device->FreeWorkspace(ctx, num_block_per_segment);
@@ -815,7 +815,7 @@ __global__ void UpdateNeighborsKernel(const FloatType* points, const IdType* off
 }  // namespace impl
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void KNN(const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm) {
@@ -830,7 +830,7 @@ void KNN(const NDArray& data_points, const IdArray& data_offsets,
   }
 }
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void NNDescent(const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta) {
@@ -905,7 +905,7 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
       stream));
   device->CopyDataFromTo(
       total_num_updates_d, 0, &total_num_updates, 0,
-      sizeof(IdType), ctx, DLContext{kDLCPU, 0},
+      sizeof(IdType), ctx, DGLContext{kDGLCPU, 0},
      offsets->dtype);
   if (total_num_updates <= static_cast<IdType>(delta * k * num_nodes)) {
@@ -922,36 +922,36 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
   device->FreeWorkspace(ctx, sum_temp_storage);
 }
-template void KNN<kDLGPU, float, int32_t>(
+template void KNN<kDGLCUDA, float, int32_t>(
    const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm);
-template void KNN<kDLGPU, float, int64_t>(
+template void KNN<kDGLCUDA, float, int64_t>(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm);
-template void KNN<kDLGPU, double, int32_t>(
+template void KNN<kDGLCUDA, double, int32_t>(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm);
-template void KNN<kDLGPU, double, int64_t>(
+template void KNN<kDGLCUDA, double, int64_t>(
     const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm);
-template void NNDescent<kDLGPU, float, int32_t>(
+template void NNDescent<kDGLCUDA, float, int32_t>(
     const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
-template void NNDescent<kDLGPU, float, int64_t>(
+template void NNDescent<kDGLCUDA, float, int64_t>(
     const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
-template void NNDescent<kDLGPU, double, int32_t>(
+template void NNDescent<kDGLCUDA, double, int32_t>(
     const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
-template void NNDescent<kDLGPU, double, int64_t>(
+template void NNDescent<kDGLCUDA, double, int64_t>(
     const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
......
@@ -27,7 +27,7 @@ namespace transform {
  * relation between \a query_points and \a data_points.
  * \param algorithm algorithm used to compute the k-nearest neighbors.
  */
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void KNN(const NDArray& data_points, const IdArray& data_offsets,
     const NDArray& query_points, const IdArray& query_offsets,
     const int k, IdArray result, const std::string& algorithm);
@@ -44,7 +44,7 @@ void KNN(const NDArray& data_points, const IdArray& data_offsets,
  * \param num_candidates The maximum number of candidates to be considered during one iteration.
  * \param delta A value controls the early abort.
  */
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
 void NNDescent(const NDArray& points, const IdArray& offsets,
     IdArray result, const int k, const int num_iters,
     const int num_candidates, const double delta);
......
@@ -164,10 +164,10 @@ HaloHeteroSubgraph GetSubgraphWithHalo(std::shared_ptr<HeteroGraph> hg,
   }
   num_edges = edge_src.size();
-  IdArray new_src = IdArray::Empty({num_edges}, DLDataType{kDLInt, 64, 1},
-      DLContext{kDLCPU, 0});
-  IdArray new_dst = IdArray::Empty({num_edges}, DLDataType{kDLInt, 64, 1},
-      DLContext{kDLCPU, 0});
+  IdArray new_src = IdArray::Empty({num_edges}, DGLDataType{kDGLInt, 64, 1},
+      DGLContext{kDGLCPU, 0});
+  IdArray new_dst = IdArray::Empty({num_edges}, DGLDataType{kDGLInt, 64, 1},
+      DGLContext{kDGLCPU, 0});
   dgl_id_t *new_src_data = static_cast<dgl_id_t *>(new_src->data);
   dgl_id_t *new_dst_data = static_cast<dgl_id_t *>(new_dst->data);
   for (size_t i = 0; i < edge_src.size(); i++) {
......
@@ -41,7 +41,7 @@ namespace transform {
 namespace {
 // Since partial specialization is not allowed for functions, use this as an
-// intermediate for ToBlock where XPU = kDLCPU.
+// intermediate for ToBlock where XPU = kDGLCPU.
 template<typename IdType>
 std::tuple<HeteroGraphPtr, std::vector<IdArray>>
 ToBlockCPU(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
@@ -143,7 +143,7 @@ ToBlockCPU(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
 template<>
 std::tuple<HeteroGraphPtr, std::vector<IdArray>>
-ToBlock<kDLCPU, int32_t>(HeteroGraphPtr graph,
+ToBlock<kDGLCPU, int32_t>(HeteroGraphPtr graph,
     const std::vector<IdArray> &rhs_nodes,
     bool include_rhs_in_lhs,
     std::vector<IdArray>* const lhs_nodes) {
@@ -152,7 +152,7 @@ ToBlock<kDLCPU, int32_t>(HeteroGraphPtr graph,
 template<>
 std::tuple<HeteroGraphPtr, std::vector<IdArray>>
-ToBlock<kDLCPU, int64_t>(HeteroGraphPtr graph,
+ToBlock<kDGLCPU, int64_t>(HeteroGraphPtr graph,
     const std::vector<IdArray> &rhs_nodes,
     bool include_rhs_in_lhs,
     std::vector<IdArray>* const lhs_nodes) {
@@ -172,7 +172,7 @@ ToBlockGPU64(HeteroGraphPtr, const std::vector<IdArray>&, bool, std::vector<IdAr
 template<>
 std::tuple<HeteroGraphPtr, std::vector<IdArray>>
-ToBlock<kDLGPU, int32_t>(HeteroGraphPtr graph,
+ToBlock<kDGLCUDA, int32_t>(HeteroGraphPtr graph,
     const std::vector<IdArray> &rhs_nodes,
     bool include_rhs_in_lhs,
     std::vector<IdArray>* const lhs_nodes) {
@@ -181,7 +181,7 @@ ToBlock<kDLGPU, int32_t>(HeteroGraphPtr graph,
 template<>
 std::tuple<HeteroGraphPtr, std::vector<IdArray>>
-ToBlock<kDLGPU, int64_t>(HeteroGraphPtr graph,
+ToBlock<kDGLCUDA, int64_t>(HeteroGraphPtr graph,
     const std::vector<IdArray> &rhs_nodes,
     bool include_rhs_in_lhs,
     std::vector<IdArray>* const lhs_nodes) {
......