Unverified commit cded5b80, authored by Xin Yao and committed by GitHub

[Feature] Bump DLPack to v0.7 and decouple DLPack from the core library (#4454)

* rename `DLContext` to `DGLContext`

* rename `kDLGPU` to `kDLCUDA`

* replace DLTensor with DGLArray

* fix linting

* Unify DGLType and DLDataType to DGLDataType

* Fix FFI

* rename DLDeviceType to DGLDeviceType

* decouple dlpack from the core library

* fix bug

* fix lint

* fix merge

* fix build

* address comments

* rename dl_converter to dlpack_convert

* remove redundant comments
parent f1689ad0
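At a glance, the commit swaps DLPack's names for DGL-owned aliases throughout the core, so `<dlpack/dlpack.h>` is only needed at the FFI boundary. A minimal sketch of the mapping, using only names that appear in the diff below (the header is assumed to be the one this commit swaps in for the DLPack include):

```cpp
#include <dgl/runtime/c_runtime_api.h>  // assumed home of the DGL aliases

void ExampleAliases() {
  DGLContext gpu = {kDGLCUDA, 0};        // was DLContext{kDLGPU, 0}
  DGLContext cpu = {kDGLCPU, 0};         // was DLContext{kDLCPU, 0}
  DGLDataType f32 = {kDGLFloat, 32, 1};  // was DLDataType{kDLFloat, 32, 1}
  (void)gpu; (void)cpu; (void)f32;       // silence unused-variable warnings
}
```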
@@ -240,7 +240,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
///////////////////////////// CSR sampling //////////////////////////
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
IdArray rows,
const int64_t num_picks,
@@ -311,7 +311,7 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
-DGLContext{kDLCPU, 0},
+DGLContext{kDGLCPU, 0},
mat.indptr->dtype);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
@@ -369,9 +369,9 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
picked_col, picked_idx);
}
-template COOMatrix CSRRowWiseSamplingUniform<kDLGPU, int32_t>(
+template COOMatrix CSRRowWiseSamplingUniform<kDGLCUDA, int32_t>(
CSRMatrix, IdArray, int64_t, bool);
-template COOMatrix CSRRowWiseSamplingUniform<kDLGPU, int64_t>(
+template COOMatrix CSRRowWiseSamplingUniform<kDGLCUDA, int64_t>(
CSRMatrix, IdArray, int64_t, bool);
} // namespace impl
......
@@ -416,7 +416,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
* @param replace Is replacement sampling?
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
-template <DLDeviceType XPU, typename IdType, typename FloatType>
+template <DGLDeviceType XPU, typename IdType, typename FloatType>
COOMatrix CSRRowWiseSampling(CSRMatrix mat,
IdArray rows,
int64_t num_picks,
@@ -492,7 +492,7 @@ COOMatrix CSRRowWiseSampling(CSRMatrix mat,
device->CopyDataFromTo(temp_ptr, num_rows * sizeof(temp_len), &temp_len, 0,
sizeof(temp_len),
ctx,
-DGLContext{kDLCPU, 0},
+DGLContext{kDGLCPU, 0},
mat.indptr->dtype);
device->StreamSync(ctx, stream);
@@ -523,7 +523,7 @@ COOMatrix CSRRowWiseSampling(CSRMatrix mat,
device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
-DGLContext{kDLCPU, 0},
+DGLContext{kDGLCPU, 0},
mat.indptr->dtype);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
@@ -651,13 +651,13 @@ COOMatrix CSRRowWiseSampling(CSRMatrix mat,
picked_col, picked_idx);
}
-template COOMatrix CSRRowWiseSampling<kDLGPU, int32_t, float>(
+template COOMatrix CSRRowWiseSampling<kDGLCUDA, int32_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
-template COOMatrix CSRRowWiseSampling<kDLGPU, int64_t, float>(
+template COOMatrix CSRRowWiseSampling<kDGLCUDA, int64_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
-template COOMatrix CSRRowWiseSampling<kDLGPU, int32_t, double>(
+template COOMatrix CSRRowWiseSampling<kDGLCUDA, int32_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
-template COOMatrix CSRRowWiseSampling<kDLGPU, int64_t, double>(
+template COOMatrix CSRRowWiseSampling<kDGLCUDA, int64_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
} // namespace impl
......
@@ -54,52 +54,52 @@ void SDDMMCoo(const std::string& op,
}
-template void SDDMMCsr<kDLGPU, int32_t, 16>(
+template void SDDMMCsr<kDGLCUDA, int32_t, 16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCsr<kDLGPU, int64_t, 16>(
+template void SDDMMCsr<kDGLCUDA, int64_t, 16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCsr<kDLGPU, int32_t, 32>(
+template void SDDMMCsr<kDGLCUDA, int32_t, 32>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCsr<kDLGPU, int64_t, 32>(
+template void SDDMMCsr<kDGLCUDA, int64_t, 32>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCsr<kDLGPU, int32_t, 64>(
+template void SDDMMCsr<kDGLCUDA, int32_t, 64>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCsr<kDLGPU, int64_t, 64>(
+template void SDDMMCsr<kDGLCUDA, int64_t, 64>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCoo<kDLGPU, int32_t, 16>(
+template void SDDMMCoo<kDGLCUDA, int32_t, 16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCoo<kDLGPU, int64_t, 16>(
+template void SDDMMCoo<kDGLCUDA, int64_t, 16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCoo<kDLGPU, int32_t, 32>(
+template void SDDMMCoo<kDGLCUDA, int32_t, 32>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCoo<kDLGPU, int64_t, 32>(
+template void SDDMMCoo<kDGLCUDA, int64_t, 32>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCoo<kDLGPU, int32_t, 64>(
+template void SDDMMCoo<kDGLCUDA, int32_t, 64>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
-template void SDDMMCoo<kDLGPU, int64_t, 64>(
+template void SDDMMCoo<kDGLCUDA, int64_t, 64>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
......
@@ -42,42 +42,42 @@ void SDDMMCooHetero(const std::string& op,
}
-template void SDDMMCooHetero<kDLGPU, int32_t, 16>(
+template void SDDMMCooHetero<kDGLCUDA, int32_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDLGPU, int64_t, 16>(
+template void SDDMMCooHetero<kDGLCUDA, int64_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDLGPU, int32_t, 32>(
+template void SDDMMCooHetero<kDGLCUDA, int32_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDLGPU, int64_t, 32>(
+template void SDDMMCooHetero<kDGLCUDA, int64_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDLGPU, int32_t, 64>(
+template void SDDMMCooHetero<kDGLCUDA, int32_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCooHetero<kDLGPU, int64_t, 64>(
+template void SDDMMCooHetero<kDGLCUDA, int64_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
......
@@ -41,42 +41,42 @@ void SDDMMCsrHetero(const std::string& op,
});
}
-template void SDDMMCsrHetero<kDLGPU, int32_t, 16>(
+template void SDDMMCsrHetero<kDGLCUDA, int32_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int64_t, 16>(
+template void SDDMMCsrHetero<kDGLCUDA, int64_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int32_t, 32>(
+template void SDDMMCsrHetero<kDGLCUDA, int32_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int64_t, 32>(
+template void SDDMMCsrHetero<kDGLCUDA, int64_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int32_t, 64>(
+template void SDDMMCsrHetero<kDGLCUDA, int32_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int64_t, 64>(
+template void SDDMMCsrHetero<kDGLCUDA, int64_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
......
@@ -73,113 +73,113 @@ void BackwardSegmentCmp(NDArray feat,
}
-template void SegmentReduce<kDLGPU, int32_t, 16>(
+template void SegmentReduce<kDGLCUDA, int32_t, 16>(
const std::string& op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
-template void SegmentReduce<kDLGPU, int64_t, 16>(
+template void SegmentReduce<kDGLCUDA, int64_t, 16>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
-template void SegmentReduce<kDLGPU, int32_t, 32>(
+template void SegmentReduce<kDGLCUDA, int32_t, 32>(
const std::string& op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
-template void SegmentReduce<kDLGPU, int64_t, 32>(
+template void SegmentReduce<kDGLCUDA, int64_t, 32>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
-template void SegmentReduce<kDLGPU, int32_t, 64>(
+template void SegmentReduce<kDGLCUDA, int32_t, 64>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
-template void SegmentReduce<kDLGPU, int64_t, 64>(
+template void SegmentReduce<kDGLCUDA, int64_t, 64>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
-template void ScatterAdd<kDLGPU, int32_t, 16>(
+template void ScatterAdd<kDGLCUDA, int32_t, 16>(
NDArray feat,
NDArray idx,
NDArray out);
-template void ScatterAdd<kDLGPU, int64_t, 16>(
+template void ScatterAdd<kDGLCUDA, int64_t, 16>(
NDArray feat,
NDArray idx,
NDArray out);
-template void ScatterAdd<kDLGPU, int32_t, 32>(
+template void ScatterAdd<kDGLCUDA, int32_t, 32>(
NDArray feat,
NDArray idx,
NDArray out);
-template void ScatterAdd<kDLGPU, int64_t, 32>(
+template void ScatterAdd<kDGLCUDA, int64_t, 32>(
NDArray feat,
NDArray idx,
NDArray out);
-template void ScatterAdd<kDLGPU, int32_t, 64>(
+template void ScatterAdd<kDGLCUDA, int32_t, 64>(
NDArray feat,
NDArray idx,
NDArray out);
-template void ScatterAdd<kDLGPU, int64_t, 64>(
+template void ScatterAdd<kDGLCUDA, int64_t, 64>(
NDArray feat,
NDArray idx,
NDArray out);
-template void UpdateGradMinMax_hetero<kDLGPU, int32_t, 16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, 16>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDLGPU, int64_t, 16>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, 16>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDLGPU, int32_t, 32>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, 32>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDLGPU, int64_t, 32>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, 32>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDLGPU, int32_t, 64>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int32_t, 64>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void UpdateGradMinMax_hetero<kDLGPU, int64_t, 64>(
+template void UpdateGradMinMax_hetero<kDGLCUDA, int64_t, 64>(
const HeteroGraphPtr& g, const std::string& op,
const std::vector<NDArray>& feat, const std::vector<NDArray>& idx,
const std::vector<NDArray>& idx_etype, std::vector<NDArray>* out);
-template void BackwardSegmentCmp<kDLGPU, int32_t, 16>(
+template void BackwardSegmentCmp<kDGLCUDA, int32_t, 16>(
NDArray feat,
NDArray arg,
NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int64_t, 16>(
+template void BackwardSegmentCmp<kDGLCUDA, int64_t, 16>(
NDArray feat,
NDArray arg,
NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int32_t, 32>(
+template void BackwardSegmentCmp<kDGLCUDA, int32_t, 32>(
NDArray feat,
NDArray arg,
NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int64_t, 32>(
+template void BackwardSegmentCmp<kDGLCUDA, int64_t, 32>(
NDArray feat,
NDArray arg,
NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int32_t, 64>(
+template void BackwardSegmentCmp<kDGLCUDA, int32_t, 64>(
NDArray feat,
NDArray arg,
NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int64_t, 64>(
+template void BackwardSegmentCmp<kDGLCUDA, int64_t, 64>(
NDArray feat,
NDArray arg,
NDArray out);
......
@@ -71,7 +71,7 @@ __global__ void _COOGetRowNNZKernel(
}
}
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto& ctx = coo.row->ctx;
@@ -84,12 +84,12 @@ int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
nb, nt, 0, stream,
coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
row, nnz);
-rst = rst.CopyTo(DLContext{kDLCPU, 0});
+rst = rst.CopyTo(DGLContext{kDGLCPU, 0});
return *rst.Ptr<IdType>();
}
-template int64_t COOGetRowNNZ<kDLGPU, int32_t>(COOMatrix, int64_t);
-template int64_t COOGetRowNNZ<kDLGPU, int64_t>(COOMatrix, int64_t);
+template int64_t COOGetRowNNZ<kDGLCUDA, int32_t>(COOMatrix, int64_t);
+template int64_t COOGetRowNNZ<kDGLCUDA, int64_t>(COOMatrix, int64_t);
template <typename IdType>
__global__ void _COOGetAllRowNNZKernel(
@@ -104,7 +104,7 @@ __global__ void _COOGetAllRowNNZKernel(
}
}
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto& ctx = coo.row->ctx;
@@ -112,7 +112,7 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
IdType num_rows = coo.num_rows;
IdType num_queries = rows->shape[0];
if (num_queries == 1) {
-auto rows_cpu = rows.CopyTo(DLContext{kDLCPU, 0});
+auto rows_cpu = rows.CopyTo(DGLContext{kDGLCPU, 0});
int64_t row = *rows_cpu.Ptr<IdType>();
IdType nt = 1024;
IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
@@ -136,8 +136,8 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
}
}
-template NDArray COOGetRowNNZ<kDLGPU, int32_t>(COOMatrix, NDArray);
-template NDArray COOGetRowNNZ<kDLGPU, int64_t>(COOMatrix, NDArray);
+template NDArray COOGetRowNNZ<kDGLCUDA, int32_t>(COOMatrix, NDArray);
+template NDArray COOGetRowNNZ<kDGLCUDA, int64_t>(COOMatrix, NDArray);
} // namespace impl
} // namespace aten
......
@@ -21,7 +21,7 @@ namespace impl {
///////////////////////////// CSRIsNonZero /////////////////////////////
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto& ctx = csr.indptr->ctx;
@@ -38,14 +38,14 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
rows.Ptr<IdType>(), cols.Ptr<IdType>(),
1, 1, 1,
static_cast<IdType*>(nullptr), static_cast<IdType>(-1), out.Ptr<IdType>());
-out = out.CopyTo(DLContext{kDLCPU, 0});
+out = out.CopyTo(DGLContext{kDGLCPU, 0});
return *out.Ptr<IdType>() != -1;
}
-template bool CSRIsNonZero<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
-template bool CSRIsNonZero<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
+template bool CSRIsNonZero<kDGLCUDA, int32_t>(CSRMatrix, int64_t, int64_t);
+template bool CSRIsNonZero<kDGLCUDA, int64_t>(CSRMatrix, int64_t, int64_t);
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
@@ -69,8 +69,8 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
return rst != -1;
}
-template NDArray CSRIsNonZero<kDLGPU, int32_t>(CSRMatrix, NDArray, NDArray);
-template NDArray CSRIsNonZero<kDLGPU, int64_t>(CSRMatrix, NDArray, NDArray);
+template NDArray CSRIsNonZero<kDGLCUDA, int32_t>(CSRMatrix, NDArray, NDArray);
+template NDArray CSRIsNonZero<kDGLCUDA, int64_t>(CSRMatrix, NDArray, NDArray);
///////////////////////////// CSRHasDuplicate /////////////////////////////
@@ -95,7 +95,7 @@ __global__ void _SegmentHasNoDuplicate(
}
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
bool CSRHasDuplicate(CSRMatrix csr) {
if (!csr.sorted)
csr = CSRSort(csr);
@@ -116,20 +116,20 @@ bool CSRHasDuplicate(CSRMatrix csr) {
return !ret;
}
-template bool CSRHasDuplicate<kDLGPU, int32_t>(CSRMatrix csr);
-template bool CSRHasDuplicate<kDLGPU, int64_t>(CSRMatrix csr);
+template bool CSRHasDuplicate<kDGLCUDA, int32_t>(CSRMatrix csr);
+template bool CSRHasDuplicate<kDGLCUDA, int64_t>(CSRMatrix csr);
///////////////////////////// CSRGetRowNNZ /////////////////////////////
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
int64_t CSRGetRowNNZ(CSRMatrix csr, int64_t row) {
const IdType cur = aten::IndexSelect<IdType>(csr.indptr, row);
const IdType next = aten::IndexSelect<IdType>(csr.indptr, row + 1);
return next - cur;
}
-template int64_t CSRGetRowNNZ<kDLGPU, int32_t>(CSRMatrix, int64_t);
-template int64_t CSRGetRowNNZ<kDLGPU, int64_t>(CSRMatrix, int64_t);
+template int64_t CSRGetRowNNZ<kDGLCUDA, int32_t>(CSRMatrix, int64_t);
+template int64_t CSRGetRowNNZ<kDGLCUDA, int64_t>(CSRMatrix, int64_t);
template <typename IdType>
__global__ void _CSRGetRowNNZKernel(
@@ -146,7 +146,7 @@ __global__ void _CSRGetRowNNZKernel(
}
}
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto len = rows->shape[0];
@@ -162,24 +162,24 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
return rst;
}
-template NDArray CSRGetRowNNZ<kDLGPU, int32_t>(CSRMatrix, NDArray);
-template NDArray CSRGetRowNNZ<kDLGPU, int64_t>(CSRMatrix, NDArray);
+template NDArray CSRGetRowNNZ<kDGLCUDA, int32_t>(CSRMatrix, NDArray);
+template NDArray CSRGetRowNNZ<kDGLCUDA, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRGetRowColumnIndices /////////////////////////////
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) {
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const int64_t offset = aten::IndexSelect<IdType>(csr.indptr, row) * sizeof(IdType);
return csr.indices.CreateView({len}, csr.indices->dtype, offset);
}
-template NDArray CSRGetRowColumnIndices<kDLGPU, int32_t>(CSRMatrix, int64_t);
-template NDArray CSRGetRowColumnIndices<kDLGPU, int64_t>(CSRMatrix, int64_t);
+template NDArray CSRGetRowColumnIndices<kDGLCUDA, int32_t>(CSRMatrix, int64_t);
+template NDArray CSRGetRowColumnIndices<kDGLCUDA, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRGetRowData /////////////////////////////
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const int64_t offset = aten::IndexSelect<IdType>(csr.indptr, row) * sizeof(IdType);
@@ -189,12 +189,12 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
return aten::Range(offset, offset + len, csr.indptr->dtype.bits, csr.indptr->ctx);
}
-template NDArray CSRGetRowData<kDLGPU, int32_t>(CSRMatrix, int64_t);
-template NDArray CSRGetRowData<kDLGPU, int64_t>(CSRMatrix, int64_t);
+template NDArray CSRGetRowData<kDGLCUDA, int32_t>(CSRMatrix, int64_t);
+template NDArray CSRGetRowData<kDGLCUDA, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRSliceRows /////////////////////////////
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
const int64_t num_rows = end - start;
const IdType st_pos = aten::IndexSelect<IdType>(csr.indptr, start);
@@ -215,8 +215,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
csr.sorted);
}
-template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
-template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
+template CSRMatrix CSRSliceRows<kDGLCUDA, int32_t>(CSRMatrix, int64_t, int64_t);
+template CSRMatrix CSRSliceRows<kDGLCUDA, int64_t>(CSRMatrix, int64_t, int64_t);
/*!
* \brief Copy data segment to output buffers
@@ -243,7 +243,7 @@ __global__ void _SegmentCopyKernel(
}
}
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const int64_t len = rows->shape[0];
@@ -272,8 +272,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
csr.sorted);
}
-template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix , NDArray);
-template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix , NDArray);
+template CSRMatrix CSRSliceRows<kDGLCUDA, int32_t>(CSRMatrix , NDArray);
+template CSRMatrix CSRSliceRows<kDGLCUDA, int64_t>(CSRMatrix , NDArray);
///////////////////////////// CSRGetDataAndIndices /////////////////////////////
@@ -345,7 +345,7 @@ __global__ void _SortedSearchKernel(
}
}
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
@@ -392,9 +392,9 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
return {ret_row, ret_col, ret_data};
}
-template std::vector<NDArray> CSRGetDataAndIndices<kDLGPU, int32_t>(
+template std::vector<NDArray> CSRGetDataAndIndices<kDGLCUDA, int32_t>(
CSRMatrix csr, NDArray rows, NDArray cols);
-template std::vector<NDArray> CSRGetDataAndIndices<kDLGPU, int64_t>(
+template std::vector<NDArray> CSRGetDataAndIndices<kDGLCUDA, int64_t>(
CSRMatrix csr, NDArray rows, NDArray cols);
///////////////////////////// CSRSliceMatrix /////////////////////////////
@@ -422,7 +422,7 @@ __global__ void _SegmentMaskColKernel(
}
}
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto& ctx = rows->ctx;
@@ -501,9 +501,9 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
ret_col, ret_data);
}
-template CSRMatrix CSRSliceMatrix<kDLGPU, int32_t>(
+template CSRMatrix CSRSliceMatrix<kDGLCUDA, int32_t>(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols);
-template CSRMatrix CSRSliceMatrix<kDLGPU, int64_t>(
+template CSRMatrix CSRSliceMatrix<kDGLCUDA, int64_t>(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols);
} // namespace impl
......
@@ -147,53 +147,53 @@ void SpMMCoo(const std::string& op, const std::string& reduce,
}
}
-template void SpMMCsr<kDLGPU, int32_t, 16>(
+template void SpMMCsr<kDGLCUDA, int32_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int64_t, 16>(
+template void SpMMCsr<kDGLCUDA, int64_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int32_t, 32>(
+template void SpMMCsr<kDGLCUDA, int32_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int64_t, 32>(
+template void SpMMCsr<kDGLCUDA, int64_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int32_t, 64>(
+template void SpMMCsr<kDGLCUDA, int32_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int64_t, 64>(
+template void SpMMCsr<kDGLCUDA, int64_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int32_t, 16>(
+template void SpMMCoo<kDGLCUDA, int32_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int64_t, 16>(
+template void SpMMCoo<kDGLCUDA, int64_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int32_t, 32>(
+template void SpMMCoo<kDGLCUDA, int32_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int64_t, 32>(
+template void SpMMCoo<kDGLCUDA, int64_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int32_t, 64>(
+template void SpMMCoo<kDGLCUDA, int32_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int64_t, 64>(
+template void SpMMCoo<kDGLCUDA, int64_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
......
@@ -203,7 +203,7 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
/*! Cusparse implementation of SpMM on Csr format. */
template <typename DType, typename IdType>
void CusparseCsrmm2(
-const DLContext& ctx,
+const DGLContext& ctx,
const CSRMatrix& csr,
const DType* B_data, const DType* A_data,
DType* C_data,
@@ -303,7 +303,7 @@ void CusparseCsrmm2(
/*! Cusparse implementation of SpMM on Csr format. */
template <typename DType, typename IdType>
void CusparseCsrmm2Hetero(
-const DLContext& ctx,
+const DGLContext& ctx,
const CSRMatrix& csr,
const DType* B_data, const DType* A_data,
DType* C_data,
......
@@ -199,37 +199,37 @@ void SpMMCsrHetero(const std::string& op, const std::string& reduce,
});
}
-template void SpMMCsrHetero<kDLGPU, int32_t, 16>(
+template void SpMMCsrHetero<kDGLCUDA, int32_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const std::vector<CSRMatrix>& csr,
const std::vector<NDArray>& ufeat, const std::vector<NDArray>& efeat,
std::vector<NDArray>* out, std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& out_ntids);
-template void SpMMCsrHetero<kDLGPU, int64_t, 16>(
+template void SpMMCsrHetero<kDGLCUDA, int64_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const std::vector<CSRMatrix>& csr,
const std::vector<NDArray>& ufeat, const std::vector<NDArray>& efeat,
std::vector<NDArray>* out, std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& out_ntids);
-template void SpMMCsrHetero<kDLGPU, int32_t, 32>(
+template void SpMMCsrHetero<kDGLCUDA, int32_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const std::vector<CSRMatrix>& csr,
const std::vector<NDArray>& ufeat, const std::vector<NDArray>& efeat,
std::vector<NDArray>* out, std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& out_ntids);
-template void SpMMCsrHetero<kDLGPU, int64_t, 32>(
+template void SpMMCsrHetero<kDGLCUDA, int64_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const std::vector<CSRMatrix>& csr,
const std::vector<NDArray>& ufeat, const std::vector<NDArray>& efeat,
std::vector<NDArray>* out, std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& out_ntids);
-template void SpMMCsrHetero<kDLGPU, int32_t, 64>(
+template void SpMMCsrHetero<kDGLCUDA, int32_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const std::vector<CSRMatrix>& csr,
const std::vector<NDArray>& ufeat, const std::vector<NDArray>& efeat,
std::vector<NDArray>* out, std::vector<std::vector<NDArray>>* out_aux,
const std::vector<dgl_type_t>& ufeat_ntids, const std::vector<dgl_type_t>& out_ntids);
-template void SpMMCsrHetero<kDLGPU, int64_t, 64>(
+template void SpMMCsrHetero<kDGLCUDA, int64_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const std::vector<CSRMatrix>& csr,
const std::vector<NDArray>& ufeat, const std::vector<NDArray>& efeat,
......
@@ -11,7 +11,7 @@
namespace dgl {
namespace cuda {
-bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx) {
+bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call CUB's reduction
......
@@ -7,9 +7,9 @@
#define DGL_ARRAY_CUDA_UTILS_H_
#include <dmlc/logging.h>
+#include <dgl/runtime/c_runtime_api.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/ndarray.h>
-#include <dlpack/dlpack.h>
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
@@ -115,7 +115,7 @@ __device__ __forceinline__ T _ldg(T* addr) {
* \param ctx Device context.
* \return True if all the flags are true.
*/
-bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx);
+bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx);
/*!
* \brief CUDA Kernel of filling the vector started from ptr of size length
@@ -187,7 +187,7 @@ __global__ void _LinearSearchKernel(
template <typename DType>
inline DType GetCUDAScalar(
runtime::DeviceAPI* device_api,
-DLContext ctx,
+DGLContext ctx,
const DType* cuda_ptr) {
DType result;
device_api->CopyDataFromTo(
@@ -195,8 +195,8 @@ inline DType GetCUDAScalar(
&result, 0,
sizeof(result),
ctx,
-DLContext{kDLCPU, 0},
-DLDataTypeTraits<DType>::dtype);
+DGLContext{kDGLCPU, 0},
+DGLDataTypeTraits<DType>::dtype);
return result;
}
......
@@ -25,7 +25,7 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
std::vector<int64_t> shape{len};
CHECK(array.IsPinned());
-CHECK_EQ(index->ctx.device_type, kDLGPU);
+CHECK_EQ(index->ctx.device_type, kDGLCUDA);
for (int d = 1; d < array->ndim; ++d) {
num_feat *= array->shape[d];
@@ -85,8 +85,8 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
std::vector<int64_t> shape{len};
CHECK(dest.IsPinned());
-CHECK_EQ(index->ctx.device_type, kDLGPU);
-CHECK_EQ(source->ctx.device_type, kDLGPU);
+CHECK_EQ(index->ctx.device_type, kDGLCUDA);
+CHECK_EQ(source->ctx.device_type, kDGLCUDA);
for (int d = 1; d < source->ndim; ++d) {
num_feat *= source->shape[d];
......
@@ -15,7 +15,7 @@ namespace array {
using namespace dgl::runtime;
-template<DLDeviceType XPU, typename IdType>
+template<DGLDeviceType XPU, typename IdType>
FilterRef CreateSetFilter(IdArray set);
DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
@@ -23,10 +23,10 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
IdArray array = args[0];
auto ctx = array->ctx;
// TODO(nv-dlasalle): Implement CPU version.
-if (ctx.device_type == kDLGPU) {
+if (ctx.device_type == kDGLCUDA) {
#ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
-*rv = CreateSetFilter<kDLGPU, IdType>(array);
+*rv = CreateSetFilter<kDGLCUDA, IdType>(array);
});
#else
LOG(FATAL) << "GPU support not compiled.";
......
@@ -8,6 +8,7 @@
#ifdef USE_TVM
#include <featgraph.h>
+#include <dgl/runtime/dlpack_convert.h>
#endif // USE_TVM
#include "kernel_decl.h"
@@ -70,7 +71,7 @@ void SegmentMM(const NDArray A,
}
CHECK_EQ(B->shape[0], seglen_A.NumElements())
<< "segment_mm expects len(seglen_A) == B.shape[0]";
-CHECK_EQ(seglen_A->ctx.device_type, kDLCPU)
+CHECK_EQ(seglen_A->ctx.device_type, kDGLCPU)
<< "segment_mm expects seglen_A to be on CPU.";
CHECK(A->ctx == B->ctx) << "segment_mm expects A and B to be of the same device";
ATEN_XPU_SWITCH_CUDA(A->ctx.device_type, XPU, "SegmentMM", {
@@ -89,7 +90,7 @@ void SegmentMMBackwardB(const NDArray A,
CHECK_EQ(A->ndim, 2) << "segment_mm_backward operator expects a 2D tensor for the first input.";
CHECK_EQ(dC->ndim, 2)
<< "segment_mm_backward operator expects a 2D tensor for the second input.";
-CHECK_EQ(seglen->ctx.device_type, kDLCPU)
+CHECK_EQ(seglen->ctx.device_type, kDGLCPU)
<< "segment_mm expects seglen to be on CPU.";
ATEN_XPU_SWITCH_CUDA(A->ctx.device_type, XPU, "SegmentMMBackwardB", {
ATEN_ID_TYPE_SWITCH(seglen->dtype, IdType, {
@@ -829,8 +830,12 @@ DGL_REGISTER_GLOBAL("sparse._CAPI_FG_SDDMMTreeReduction")
// {lhs, rhs, out},
// {"U_data", "E_data", "V_data"});
COOMatrix coo = graph.sptr()->GetCOOMatrix(0);
-dgl::featgraph::SDDMMTreeReduction(coo.row.ToDLPack(), coo.col.ToDLPack(),
-lhs.ToDLPack(), rhs.ToDLPack(), out.ToDLPack());
+dgl::featgraph::SDDMMTreeReduction(
+DLPackConvert::ToDLPack(coo.row),
+DLPackConvert::ToDLPack(coo.col),
+DLPackConvert::ToDLPack(lhs),
+DLPackConvert::ToDLPack(rhs),
+DLPackConvert::ToDLPack(out));
});
#endif // USE_TVM
......
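The kernel.cc hunk above is the visible face of the decoupling: `NDArray::ToDLPack()` call sites move to the new `DLPackConvert` helper from `dgl/runtime/dlpack_convert.h`. A minimal sketch of the migration, assuming the helper lives in `dgl::runtime` and keeps the conventional `DLManagedTensor*` return type (neither is confirmed by this diff):

```cpp
#include <dgl/runtime/dlpack_convert.h>
#include <dgl/runtime/ndarray.h>

// Hypothetical wrapper; only the DLPackConvert::ToDLPack call is from the diff.
DLManagedTensor* ExportToDLPack(dgl::runtime::NDArray arr) {
  // Before this commit: return arr.ToDLPack();
  return dgl::runtime::DLPackConvert::ToDLPack(arr);
}
```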
@@ -16,7 +16,7 @@ namespace aten {
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
#ifdef DGL_USE_CUDA
CHECK(array.IsPinned()) << "Input array must be in pinned memory.";
-CHECK_EQ(index->ctx.device_type, kDLGPU) << "Index must be on the GPU.";
+CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU.";
CHECK_GE(array->ndim, 1) << "Input array must have at least 1 dimension.";
CHECK_EQ(index->ndim, 1) << "Index must be a 1D array.";
@@ -34,8 +34,8 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
#ifdef DGL_USE_CUDA
CHECK(dest.IsPinned()) << "Destination array must be in pinned memory.";
-CHECK_EQ(index->ctx.device_type, kDLGPU) << "Index must be on the GPU.";
-CHECK_EQ(source->ctx.device_type, kDLGPU) << "Source array must be on the GPU.";
+CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU.";
+CHECK_EQ(source->ctx.device_type, kDGLCUDA) << "Source array must be on the GPU.";
CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source "
"array must have the same dtype.";
CHECK_GE(dest->ndim, 1) << "Destination array must have at least 1 dimension.";
......
@@ -41,8 +41,8 @@ dgl::runtime::NDArray CopyVectorToNDArray(
const std::vector<DType>& vec) {
using dgl::runtime::NDArray;
const int64_t len = vec.size();
-NDArray a = NDArray::Empty({len}, DLDataType{kDLInt, sizeof(IdType) * 8, 1},
-DLContext{kDLCPU, 0});
+NDArray a = NDArray::Empty({len}, DGLDataType{kDGLInt, sizeof(IdType) * 8, 1},
+DGLContext{kDGLCPU, 0});
std::copy(vec.begin(), vec.end(), static_cast<IdType*>(a->data));
return a;
}
......
@@ -50,7 +50,7 @@ template void GroupIndexShuffle<int64_t>(
template <typename IdType>
IdArray RandomPerm(int64_t num_nodes) {
-IdArray perm = aten::NewIdArray(num_nodes, DLContext{kDLCPU, 0}, sizeof(IdType) * 8);
+IdArray perm = aten::NewIdArray(num_nodes, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8);
IdType* perm_data = static_cast<IdType*>(perm->data);
std::iota(perm_data, perm_data + num_nodes, 0);
IndexShuffle(perm_data, num_nodes);
@@ -59,7 +59,7 @@ IdArray RandomPerm(int64_t num_nodes) {
template <typename IdType>
IdArray GroupRandomPerm(const IdType *group_idxs, int64_t num_group_idxs, int64_t num_nodes) {
-IdArray perm = aten::NewIdArray(num_nodes, DLContext{kDLCPU, 0}, sizeof(IdType) * 8);
+IdArray perm = aten::NewIdArray(num_nodes, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8);
IdType* perm_data = static_cast<IdType*>(perm->data);
std::iota(perm_data, perm_data + num_nodes, 0);
GroupIndexShuffle(group_idxs, perm_data, num_group_idxs, num_nodes);
@@ -77,7 +77,7 @@ IdArray GroupRandomPerm(const IdType *group_idxs, int64_t num_group_idxs, int64_
* Finally, we pick the point with the maximum such distance.
* This process will be repeated for ``sample_points`` - 1 times.
*/
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_points,
NDArray dist, IdArray start_idx, IdArray result) {
const FloatType* array_data = static_cast<FloatType*>(array->data);
@@ -135,20 +135,20 @@ void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_poin
ret_start += sample_points;
}
}
-template void FarthestPointSampler<kDLCPU, float, int32_t>(
+template void FarthestPointSampler<kDGLCPU, float, int32_t>(
NDArray array, int64_t batch_size, int64_t sample_points,
NDArray dist, IdArray start_idx, IdArray result);
-template void FarthestPointSampler<kDLCPU, float, int64_t>(
+template void FarthestPointSampler<kDGLCPU, float, int64_t>(
NDArray array, int64_t batch_size, int64_t sample_points,
NDArray dist, IdArray start_idx, IdArray result);
-template void FarthestPointSampler<kDLCPU, double, int32_t>(
+template void FarthestPointSampler<kDGLCPU, double, int32_t>(
NDArray array, int64_t batch_size, int64_t sample_points,
NDArray dist, IdArray start_idx, IdArray result);
-template void FarthestPointSampler<kDLCPU, double, int64_t>(
+template void FarthestPointSampler<kDGLCPU, double, int64_t>(
NDArray array, int64_t batch_size, int64_t sample_points,
NDArray dist, IdArray start_idx, IdArray result);
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching(const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
const int64_t num_nodes = result->shape[0];
const IdType *indptr_data = static_cast<IdType*>(csr.indptr->data);
@@ -181,16 +181,16 @@ void WeightedNeighborMatching(const aten::CSRMatrix &csr, const NDArray weight,
result_data[v_max] = result_data[u];
}
}
-template void WeightedNeighborMatching<kDLCPU, float, int32_t>(
+template void WeightedNeighborMatching<kDGLCPU, float, int32_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
-template void WeightedNeighborMatching<kDLCPU, float, int64_t>(
+template void WeightedNeighborMatching<kDGLCPU, float, int64_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
-template void WeightedNeighborMatching<kDLCPU, double, int32_t>(
+template void WeightedNeighborMatching<kDGLCPU, double, int32_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
-template void WeightedNeighborMatching<kDLCPU, double, int64_t>(
+template void WeightedNeighborMatching<kDGLCPU, double, int64_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
const int64_t num_nodes = result->shape[0];
const IdType *indptr_data = static_cast<IdType*>(csr.indptr->data);
@@ -221,8 +221,8 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
}
}
}
-template void NeighborMatching<kDLCPU, int32_t>(const aten::CSRMatrix &csr, IdArray result);
-template void NeighborMatching<kDLCPU, int64_t>(const aten::CSRMatrix &csr, IdArray result);
+template void NeighborMatching<kDGLCPU, int32_t>(const aten::CSRMatrix &csr, IdArray result);
+template void NeighborMatching<kDGLCPU, int64_t>(const aten::CSRMatrix &csr, IdArray result);
} // namespace impl
} // namespace geometry
......
@@ -150,7 +150,7 @@ bool Colorize(IdType * result_data, int64_t num_nodes, float * const prop) {
* are marked, mark this node with its id. Else match this (BLUE, RED) node
* pair and mark them with the smaller id between them.
*/
-template <DLDeviceType XPU, typename FloatType, typename IdType>
+template <DGLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching(const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto& ctx = result->ctx;
@@ -182,13 +182,13 @@ void WeightedNeighborMatching(const aten::CSRMatrix &csr, const NDArray weight,
}
device->FreeWorkspace(ctx, prop);
}
-template void WeightedNeighborMatching<kDLGPU, float, int32_t>(
+template void WeightedNeighborMatching<kDGLCUDA, float, int32_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
-template void WeightedNeighborMatching<kDLGPU, float, int64_t>(
+template void WeightedNeighborMatching<kDGLCUDA, float, int64_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
-template void WeightedNeighborMatching<kDLGPU, double, int32_t>(
+template void WeightedNeighborMatching<kDGLCUDA, double, int32_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
-template void WeightedNeighborMatching<kDLGPU, double, int64_t>(
+template void WeightedNeighborMatching<kDGLCUDA, double, int64_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
/*! \brief Unweighted neighbor matching procedure (GPU version).
@@ -201,7 +201,7 @@ template void WeightedNeighborMatching<kDLGPU, double, int64_t>(
* 2. Graph is sparse, thus neighborhood of each node is small,
* which is suitable for GPU implementation.
*/
-template <DLDeviceType XPU, typename IdType>
+template <DGLDeviceType XPU, typename IdType>
void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
const int64_t num_edges = csr.indices->shape[0];
const auto& ctx = result->ctx;
@@ -211,7 +211,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
// generate random weights
cudaStream_t stream = runtime::getCurrentCUDAStream();
NDArray weight = NDArray::Empty(
-{num_edges}, DLDataType{kDLFloat, sizeof(float) * 8, 1}, ctx);
+{num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
float *weight_data = static_cast<float*>(weight->data);
uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX);
auto num_threads = cuda::FindNumThreads(num_edges);
@@ -221,8 +221,8 @@ WeightedNeighborMatching<XPU, float, IdType>(csr, weight, result);
WeightedNeighborMatching<XPU, float, IdType>(csr, weight, result);
}
-template void NeighborMatching<kDLGPU, int32_t>(const aten::CSRMatrix &csr, IdArray result);
-template void NeighborMatching<kDLGPU, int64_t>(const aten::CSRMatrix &csr, IdArray result);
+template void NeighborMatching<kDGLCUDA, int32_t>(const aten::CSRMatrix &csr, IdArray result);
+template void NeighborMatching<kDGLCUDA, int64_t>(const aten::CSRMatrix &csr, IdArray result);
} // namespace impl
} // namespace geometry
......