Unverified Commit cded5b80 authored by Xin Yao, committed by GitHub

[Feature] Bump DLPack to v0.7 and decouple DLPack from the core library (#4454)

* rename `DLContext` to `DGLContext`

* rename `kDLGPU` to `kDLCUDA`

* replace DLTensor with DGLArray

* fix linting

* Unify DGLType and DLDataType to DGLDataType

* Fix FFI

* rename DLDeviceType to DGLDeviceType

* decouple dlpack from the core library

* fix bug

* fix lint

* fix merge

* fix build

* address comments

* rename dl_converter to dlpack_convert

* remove redundant comments
parent f1689ad0
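
Taken together, these commits replace the DLPack-derived names used throughout the core library with DGL-specific ones: `DLContext` becomes `DGLContext`, `DLDataType`/`DGLType` become `DGLDataType`, `DLDeviceType` becomes `DGLDeviceType`, and the device constants `kDLCPU`/`kDLGPU` become `kDGLCPU`/`kDGLCUDA`. The following is a minimal sketch of what array construction looks like after the rename, assuming only the public runtime headers touched in this diff; the helper name `MakeInt64Array` is hypothetical and not part of the change.

```cpp
// Hypothetical illustration of the renamed runtime types (not part of the diff).
#include <dgl/runtime/ndarray.h>

using namespace dgl::runtime;

NDArray MakeInt64Array(int64_t n, bool on_gpu) {
  // kDGLCPU / kDGLCUDA replace the old kDLCPU / kDLGPU device-type constants.
  DGLContext ctx{on_gpu ? kDGLCUDA : kDGLCPU, /*device_id=*/0};
  // DGLDataType replaces DLDataType / DGLType: {type code, bits, lanes}.
  DGLDataType dtype{kDGLInt, 64, 1};
  return NDArray::Empty({n}, dtype, ctx);
}
```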
......@@ -44,7 +44,7 @@ namespace transform {
*
* @return The block and the induced edges.
*/
template<DLDeviceType XPU, typename IdType>
template<DGLDeviceType XPU, typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray>* lhs_nodes);
......
......@@ -54,7 +54,7 @@ IdArray MergeMultipleTraversals(
max_len = std::max(max_len, tracelen);
total_len += traces[i].size();
}
IdArray ret = IdArray::Empty({total_len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
IdArray ret = IdArray::Empty({total_len}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
int64_t* ret_data = static_cast<int64_t*>(ret->data);
for (int64_t i = 0; i < max_len; ++i) {
for (size_t j = 0; j < traces.size(); ++j) {
......@@ -78,7 +78,7 @@ IdArray ComputeMergedSections(
const int64_t tracelen = traces[i].size();
max_len = std::max(max_len, tracelen);
}
IdArray ret = IdArray::Empty({max_len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
IdArray ret = IdArray::Empty({max_len}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
int64_t* ret_data = static_cast<int64_t*>(ret->data);
for (int64_t i = 0; i < max_len; ++i) {
int64_t sec_len = 0;
......
......@@ -125,11 +125,11 @@ class UnitGraph::COO : public BaseHeteroGraph {
LOG(FATAL) << "UnitGraph graph is not mutable.";
}
DLDataType DataType() const override {
DGLDataType DataType() const override {
return adj_.row->dtype;
}
DLContext Context() const override {
DGLContext Context() const override {
return adj_.row->ctx;
}
......@@ -153,7 +153,7 @@ class UnitGraph::COO : public BaseHeteroGraph {
return ret;
}
COO CopyTo(const DLContext &ctx) const {
COO CopyTo(const DGLContext &ctx) const {
if (Context() == ctx)
return *this;
return COO(meta_graph_, adj_.CopyTo(ctx));
......@@ -385,7 +385,7 @@ class UnitGraph::COO : public BaseHeteroGraph {
CHECK(aten::IsValidIdArray(dstvids)) << "Invalid vertex id array.";
HeteroSubgraph subg;
const auto& submat = aten::COOSliceMatrix(adj_, srcvids, dstvids);
DLContext ctx = aten::GetContextOf(vids);
DGLContext ctx = aten::GetContextOf(vids);
IdArray sub_eids = aten::Range(0, submat.data->shape[0], NumBits(), ctx);
subg.graph = std::make_shared<COO>(meta_graph(), submat.num_rows, submat.num_cols,
submat.row, submat.col);
......@@ -412,9 +412,9 @@ class UnitGraph::COO : public BaseHeteroGraph {
IdArray new_src = aten::IndexSelect(adj_.row, eids[0]);
IdArray new_dst = aten::IndexSelect(adj_.col, eids[0]);
subg.induced_vertices.emplace_back(
aten::NullArray(DLDataType{kDLInt, NumBits(), 1}, Context()));
aten::NullArray(DGLDataType{kDGLInt, NumBits(), 1}, Context()));
subg.induced_vertices.emplace_back(
aten::NullArray(DLDataType{kDLInt, NumBits(), 1}, Context()));
aten::NullArray(DGLDataType{kDGLInt, NumBits(), 1}, Context()));
subg.graph = std::make_shared<COO>(
meta_graph(), NumVertices(SrcType()), NumVertices(DstType()), new_src, new_dst);
subg.induced_edges = eids;
......@@ -532,11 +532,11 @@ class UnitGraph::CSR : public BaseHeteroGraph {
LOG(FATAL) << "UnitGraph graph is not mutable.";
}
DLDataType DataType() const override {
DGLDataType DataType() const override {
return adj_.indices->dtype;
}
DLContext Context() const override {
DGLContext Context() const override {
return adj_.indices->ctx;
}
......@@ -562,7 +562,7 @@ class UnitGraph::CSR : public BaseHeteroGraph {
}
}
CSR CopyTo(const DLContext &ctx) const {
CSR CopyTo(const DGLContext &ctx) const {
if (Context() == ctx) {
return *this;
} else {
......@@ -810,7 +810,7 @@ class UnitGraph::CSR : public BaseHeteroGraph {
CHECK(aten::IsValidIdArray(dstvids)) << "Invalid vertex id array.";
HeteroSubgraph subg;
const auto& submat = aten::CSRSliceMatrix(adj_, srcvids, dstvids);
DLContext ctx = aten::GetContextOf(vids);
DGLContext ctx = aten::GetContextOf(vids);
IdArray sub_eids = aten::Range(0, submat.data->shape[0], NumBits(), ctx);
subg.graph = std::make_shared<CSR>(meta_graph(), submat.num_rows, submat.num_cols,
submat.indptr, submat.indices, sub_eids);
......@@ -860,11 +860,11 @@ class UnitGraph::CSR : public BaseHeteroGraph {
//
//////////////////////////////////////////////////////////
DLDataType UnitGraph::DataType() const {
DGLDataType UnitGraph::DataType() const {
return GetAny()->DataType();
}
DLContext UnitGraph::Context() const {
DGLContext UnitGraph::Context() const {
return GetAny()->Context();
}
......@@ -1285,7 +1285,7 @@ HeteroGraphPtr UnitGraph::AsNumBits(HeteroGraphPtr g, uint8_t bits) {
}
}
HeteroGraphPtr UnitGraph::CopyTo(HeteroGraphPtr g, const DLContext &ctx) {
HeteroGraphPtr UnitGraph::CopyTo(HeteroGraphPtr g, const DGLContext &ctx) {
if (ctx == g->Context()) {
return g;
} else {
......
......@@ -79,9 +79,9 @@ class UnitGraph : public BaseHeteroGraph {
LOG(FATAL) << "UnitGraph graph is not mutable.";
}
DLDataType DataType() const override;
DGLDataType DataType() const override;
DLContext Context() const override;
DGLContext Context() const override;
bool IsPinned() const override;
......@@ -167,7 +167,7 @@ class UnitGraph : public BaseHeteroGraph {
/*! \brief Create a graph with no edges */
static HeteroGraphPtr Empty(
int64_t num_vtypes, int64_t num_src, int64_t num_dst,
DLDataType dtype, DLContext ctx) {
DGLDataType dtype, DGLContext ctx) {
IdArray row = IdArray::Empty({0}, dtype, ctx);
IdArray col = IdArray::Empty({0}, dtype, ctx);
return CreateFromCOO(num_vtypes, num_src, num_dst, row, col);
......@@ -207,14 +207,14 @@ class UnitGraph : public BaseHeteroGraph {
static HeteroGraphPtr AsNumBits(HeteroGraphPtr g, uint8_t bits);
/*! \brief Copy the data to another context */
static HeteroGraphPtr CopyTo(HeteroGraphPtr g, const DLContext &ctx);
static HeteroGraphPtr CopyTo(HeteroGraphPtr g, const DGLContext &ctx);
/*!
* \brief Pin the in_csr_, out_csr_ and coo_ of the current graph.
* \note The graph will be pinned inplace. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDGLCPU: will be pinned;
* IsPinned: directly return;
* kDLGPU: invalid, will throw an error.
* kDGLCUDA: invalid, will throw an error.
* The context check is deferred to pinning the NDArray.
*/
void PinMemory_() override;
......
......@@ -251,7 +251,7 @@ __global__ void _MapGlobalIndexByRangeKernel(
// Remainder Based Partition Operations
template <DLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
std::pair<IdArray, NDArray>
GeneratePermutationFromRemainder(
int64_t array_size,
......@@ -376,18 +376,18 @@ GeneratePermutationFromRemainder(
template std::pair<IdArray, IdArray>
GeneratePermutationFromRemainder<kDLGPU, int32_t>(
GeneratePermutationFromRemainder<kDGLCUDA, int32_t>(
int64_t array_size,
int num_parts,
IdArray in_idx);
template std::pair<IdArray, IdArray>
GeneratePermutationFromRemainder<kDLGPU, int64_t>(
GeneratePermutationFromRemainder<kDGLCUDA, int64_t>(
int64_t array_size,
int num_parts,
IdArray in_idx);
template <DLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
IdArray MapToLocalFromRemainder(
const int num_parts,
IdArray global_idx) {
......@@ -420,15 +420,15 @@ IdArray MapToLocalFromRemainder(
}
template IdArray
MapToLocalFromRemainder<kDLGPU, int32_t>(
MapToLocalFromRemainder<kDGLCUDA, int32_t>(
int num_parts,
IdArray in_idx);
template IdArray
MapToLocalFromRemainder<kDLGPU, int64_t>(
MapToLocalFromRemainder<kDGLCUDA, int64_t>(
int num_parts,
IdArray in_idx);
template <DLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
IdArray MapToGlobalFromRemainder(
const int num_parts,
IdArray local_idx,
......@@ -468,12 +468,12 @@ IdArray MapToGlobalFromRemainder(
}
template IdArray
MapToGlobalFromRemainder<kDLGPU, int32_t>(
MapToGlobalFromRemainder<kDGLCUDA, int32_t>(
int num_parts,
IdArray in_idx,
int part_id);
template IdArray
MapToGlobalFromRemainder<kDLGPU, int64_t>(
MapToGlobalFromRemainder<kDGLCUDA, int64_t>(
int num_parts,
IdArray in_idx,
int part_id);
......@@ -481,7 +481,7 @@ MapToGlobalFromRemainder<kDLGPU, int64_t>(
// Range Based Partition Operations
template <DLDeviceType XPU, typename IdType, typename RangeType>
template <DGLDeviceType XPU, typename IdType, typename RangeType>
std::pair<IdArray, NDArray>
GeneratePermutationFromRange(
int64_t array_size,
......@@ -598,31 +598,31 @@ GeneratePermutationFromRange(
template std::pair<IdArray, IdArray>
GeneratePermutationFromRange<kDLGPU, int32_t, int32_t>(
GeneratePermutationFromRange<kDGLCUDA, int32_t, int32_t>(
int64_t array_size,
int num_parts,
IdArray range,
IdArray in_idx);
template std::pair<IdArray, IdArray>
GeneratePermutationFromRange<kDLGPU, int64_t, int32_t>(
GeneratePermutationFromRange<kDGLCUDA, int64_t, int32_t>(
int64_t array_size,
int num_parts,
IdArray range,
IdArray in_idx);
template std::pair<IdArray, IdArray>
GeneratePermutationFromRange<kDLGPU, int32_t, int64_t>(
GeneratePermutationFromRange<kDGLCUDA, int32_t, int64_t>(
int64_t array_size,
int num_parts,
IdArray range,
IdArray in_idx);
template std::pair<IdArray, IdArray>
GeneratePermutationFromRange<kDLGPU, int64_t, int64_t>(
GeneratePermutationFromRange<kDGLCUDA, int64_t, int64_t>(
int64_t array_size,
int num_parts,
IdArray range,
IdArray in_idx);
template <DLDeviceType XPU, typename IdType, typename RangeType>
template <DGLDeviceType XPU, typename IdType, typename RangeType>
IdArray MapToLocalFromRange(
const int num_parts,
IdArray range,
......@@ -657,28 +657,28 @@ IdArray MapToLocalFromRange(
}
template IdArray
MapToLocalFromRange<kDLGPU, int32_t, int32_t>(
MapToLocalFromRange<kDGLCUDA, int32_t, int32_t>(
int num_parts,
IdArray range,
IdArray in_idx);
template IdArray
MapToLocalFromRange<kDLGPU, int64_t, int32_t>(
MapToLocalFromRange<kDGLCUDA, int64_t, int32_t>(
int num_parts,
IdArray range,
IdArray in_idx);
template IdArray
MapToLocalFromRange<kDLGPU, int32_t, int64_t>(
MapToLocalFromRange<kDGLCUDA, int32_t, int64_t>(
int num_parts,
IdArray range,
IdArray in_idx);
template IdArray
MapToLocalFromRange<kDLGPU, int64_t, int64_t>(
MapToLocalFromRange<kDGLCUDA, int64_t, int64_t>(
int num_parts,
IdArray range,
IdArray in_idx);
template <DLDeviceType XPU, typename IdType, typename RangeType>
template <DGLDeviceType XPU, typename IdType, typename RangeType>
IdArray MapToGlobalFromRange(
const int num_parts,
IdArray range,
......@@ -720,25 +720,25 @@ IdArray MapToGlobalFromRange(
}
template IdArray
MapToGlobalFromRange<kDLGPU, int32_t, int32_t>(
MapToGlobalFromRange<kDGLCUDA, int32_t, int32_t>(
int num_parts,
IdArray range,
IdArray in_idx,
int part_id);
template IdArray
MapToGlobalFromRange<kDLGPU, int64_t, int32_t>(
MapToGlobalFromRange<kDGLCUDA, int64_t, int32_t>(
int num_parts,
IdArray range,
IdArray in_idx,
int part_id);
template IdArray
MapToGlobalFromRange<kDLGPU, int32_t, int64_t>(
MapToGlobalFromRange<kDGLCUDA, int32_t, int64_t>(
int num_parts,
IdArray range,
IdArray in_idx,
int part_id);
template IdArray
MapToGlobalFromRange<kDLGPU, int64_t, int64_t>(
MapToGlobalFromRange<kDGLCUDA, int64_t, int64_t>(
int num_parts,
IdArray range,
IdArray in_idx,
......
......@@ -46,9 +46,9 @@ class RemainderPartition : public NDArrayPartition {
IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDLGPU) {
if (ctx.device_type == kDGLCUDA) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::GeneratePermutationFromRemainder<kDLGPU, IdType>(
return impl::GeneratePermutationFromRemainder<kDGLCUDA, IdType>(
ArraySize(), NumParts(), in_idx);
});
}
......@@ -64,9 +64,9 @@ class RemainderPartition : public NDArrayPartition {
IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDLGPU) {
if (ctx.device_type == kDGLCUDA) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::MapToLocalFromRemainder<kDLGPU, IdType>(
return impl::MapToLocalFromRemainder<kDGLCUDA, IdType>(
NumParts(), in_idx);
});
}
......@@ -83,9 +83,9 @@ class RemainderPartition : public NDArrayPartition {
const int part_id) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDLGPU) {
if (ctx.device_type == kDGLCUDA) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::MapToGlobalFromRemainder<kDLGPU, IdType>(
return impl::MapToGlobalFromRemainder<kDGLCUDA, IdType>(
NumParts(), in_idx, part_id);
});
}
......@@ -116,9 +116,9 @@ class RangePartition : public NDArrayPartition {
// sizes. We require the input range on the GPU, as if we have multiple
// GPUs, we can't know which is the proper one to copy the array to, but we
// have only one CPU context, and can safely copy the array to that.
range_cpu_(range.CopyTo(DGLContext{kDLCPU, 0})) {
range_cpu_(range.CopyTo(DGLContext{kDGLCPU, 0})) {
auto ctx = range->ctx;
if (ctx.device_type != kDLGPU) {
if (ctx.device_type != kDGLCUDA) {
LOG(FATAL) << "The range for an NDArrayPartition is only supported "
" on GPUs. Transfer the range to the target device before "
"creating the partition.";
......@@ -130,7 +130,7 @@ class RangePartition : public NDArrayPartition {
IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDLGPU) {
if (ctx.device_type == kDGLCUDA) {
if (ctx.device_type != range_->ctx.device_type ||
ctx.device_id != range_->ctx.device_id) {
LOG(FATAL) << "The range for the NDArrayPartition and the input "
......@@ -138,7 +138,7 @@ class RangePartition : public NDArrayPartition {
}
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
return impl::GeneratePermutationFromRange<kDLGPU, IdType, RangeType>(
return impl::GeneratePermutationFromRange<kDGLCUDA, IdType, RangeType>(
ArraySize(), NumParts(), range_, in_idx);
});
});
......@@ -155,10 +155,10 @@ class RangePartition : public NDArrayPartition {
IdArray in_idx) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDLGPU) {
if (ctx.device_type == kDGLCUDA) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
return impl::MapToLocalFromRange<kDLGPU, IdType, RangeType>(
return impl::MapToLocalFromRange<kDGLCUDA, IdType, RangeType>(
NumParts(), range_, in_idx);
});
});
......@@ -176,10 +176,10 @@ class RangePartition : public NDArrayPartition {
const int part_id) const override {
#ifdef DGL_USE_CUDA
auto ctx = in_idx->ctx;
if (ctx.device_type == kDLGPU) {
if (ctx.device_type == kDGLCUDA) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, {
return impl::MapToGlobalFromRange<kDLGPU, IdType, RangeType>(
return impl::MapToGlobalFromRange<kDGLCUDA, IdType, RangeType>(
NumParts(), range_, in_idx, part_id);
});
});
......
......@@ -32,7 +32,7 @@ namespace impl {
* @return The permutation to group the indices by part id, and the number of
* indices in each part.
*/
template <DLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
std::pair<IdArray, IdArray>
GeneratePermutationFromRemainder(
int64_t array_size,
......@@ -51,7 +51,7 @@ GeneratePermutationFromRemainder(
*
* @return The array of local indices.
*/
template <DLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
IdArray MapToLocalFromRemainder(
int num_parts,
IdArray global_idx);
......@@ -69,7 +69,7 @@ IdArray MapToLocalFromRemainder(
*
* @return The array of global indices.
*/
template <DLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
IdArray MapToGlobalFromRemainder(
int num_parts,
IdArray local_idx,
......@@ -95,7 +95,7 @@ IdArray MapToGlobalFromRemainder(
* @return The permutation to group the indices by part id, and the number of
* indices in each part.
*/
template <DLDeviceType XPU, typename IdType, typename RangeType>
template <DGLDeviceType XPU, typename IdType, typename RangeType>
std::pair<IdArray, IdArray>
GeneratePermutationFromRange(
int64_t array_size,
......@@ -118,7 +118,7 @@ GeneratePermutationFromRange(
*
* @return The array of local indices.
*/
template <DLDeviceType XPU, typename IdType, typename RangeType>
template <DGLDeviceType XPU, typename IdType, typename RangeType>
IdArray MapToLocalFromRange(
int num_parts,
IdArray range,
......@@ -140,7 +140,7 @@ IdArray MapToLocalFromRange(
*
* @return The array of global indices.
*/
template <DLDeviceType XPU, typename IdType, typename RangeType>
template <DGLDeviceType XPU, typename IdType, typename RangeType>
IdArray MapToGlobalFromRange(
int num_parts,
IdArray range,
......
......@@ -29,7 +29,7 @@ DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed")
}
});
#ifdef DGL_USE_CUDA
if (DeviceAPI::Get(kDLGPU)->IsAvailable()) {
if (DeviceAPI::Get(kDGLCUDA)->IsAvailable()) {
auto* thr_entry = CUDAThreadEntry::ThreadLocal();
if (!thr_entry->curand_gen) {
CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen, CURAND_RNG_PSEUDO_DEFAULT));
......
......@@ -517,7 +517,7 @@ DGL_REGISTER_GLOBAL("distributed.rpc._CAPI_DGLRPCFastPull")
local_data_shape[0] = ID_size;
NDArray res_tensor = NDArray::Empty(local_data_shape,
local_data->dtype,
DLContext{kDLCPU, 0});
DGLContext{kDGLCPU, 0});
char* return_data = static_cast<char*>(res_tensor->data);
// Copy local data
parallel_for(0, local_ids.size(), [&](size_t b, size_t e) {
......
......@@ -137,7 +137,7 @@ int DGLObjectGetAttr(ObjectHandle handle,
(*tobject)->VisitAttrs(&getter);
*ret_success = getter.found_object_ref || rv.type_code() != kNull;
if (rv.type_code() == kStr ||
rv.type_code() == kDGLType) {
rv.type_code() == kDGLDataType) {
DGLAPIThreadLocalEntry *e = DGLAPIThreadLocalStore::Get();
e->ret_str = rv.operator std::string();
*ret_type_code = kStr;
......
/*!
* Copyright (c) 2016 by Contributors
* Copyright (c) 2016-2022 by Contributors
* \file c_runtime_api.cc
* \brief Runtime API implementation
*/
......@@ -26,17 +26,9 @@ namespace runtime {
*/
inline std::string DeviceName(int type) {
switch (type) {
case kDLCPU: return "cpu";
case kDLGPU: return "gpu";
case kDLOpenCL: return "opencl";
case kDLSDAccel: return "sdaccel";
case kDLAOCL: return "aocl";
case kDLVulkan: return "vulkan";
case kDLMetal: return "metal";
case kDLVPI: return "vpi";
case kDLROCM: return "rocm";
case kOpenGL: return "opengl";
case kExtDev: return "ext_dev";
case kDGLCPU: return "cpu";
case kDGLCUDA: return "cuda";
// add more device here once supported
default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
}
}
......@@ -99,13 +91,13 @@ DeviceAPI* DeviceAPI::Get(DGLContext ctx, bool allow_missing) {
static_cast<int>(ctx.device_type), allow_missing);
}
DeviceAPI* DeviceAPI::Get(DLDeviceType dev_type, bool allow_missing) {
DeviceAPI* DeviceAPI::Get(DGLDeviceType dev_type, bool allow_missing) {
return DeviceAPIManager::Get(static_cast<int>(dev_type), allow_missing);
}
void* DeviceAPI::AllocWorkspace(DGLContext ctx,
size_t size,
DGLType type_hint) {
DGLDataType type_hint) {
return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint);
}
......@@ -213,10 +205,10 @@ void* DGLBackendAllocWorkspace(int device_type,
int dtype_code_hint,
int dtype_bits_hint) {
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
DGLType type_hint;
DGLDataType type_hint;
type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);
type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
type_hint.lanes = 1;
......@@ -230,7 +222,7 @@ int DGLBackendFreeWorkspace(int device_type,
int device_id,
void* ptr) {
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
DeviceAPIManager::Get(ctx)->FreeWorkspace(ctx, ptr);
return 0;
......@@ -265,10 +257,10 @@ int DGLFuncCall(DGLFunctionHandle func,
DGLArgs(args, arg_type_codes, num_args), &rv);
// handle return string.
if (rv.type_code() == kStr ||
rv.type_code() == kDGLType ||
rv.type_code() == kDGLDataType ||
rv.type_code() == kBytes) {
DGLRuntimeEntry* e = DGLAPIRuntimeStore::Get();
if (rv.type_code() != kDGLType) {
if (rv.type_code() != kDGLDataType) {
e->ret_str = *rv.ptr<std::string>();
} else {
e->ret_str = rv.operator std::string();
......@@ -336,7 +328,7 @@ int DGLFuncCreateFromCFunc(DGLPackedCFunc func,
int DGLStreamCreate(int device_type, int device_id, DGLStreamHandle* out) {
API_BEGIN();
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
*out = DeviceAPIManager::Get(ctx)->CreateStream(ctx);
API_END();
......@@ -345,7 +337,7 @@ int DGLStreamCreate(int device_type, int device_id, DGLStreamHandle* out) {
int DGLStreamFree(int device_type, int device_id, DGLStreamHandle stream) {
API_BEGIN();
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
DeviceAPIManager::Get(ctx)->FreeStream(ctx, stream);
API_END();
......@@ -354,7 +346,7 @@ int DGLStreamFree(int device_type, int device_id, DGLStreamHandle stream) {
int DGLSetStream(int device_type, int device_id, DGLStreamHandle stream) {
API_BEGIN();
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
DeviceAPIManager::Get(ctx)->SetStream(ctx, stream);
API_END();
......@@ -363,7 +355,7 @@ int DGLSetStream(int device_type, int device_id, DGLStreamHandle stream) {
int DGLGetStream(int device_type, int device_id, DGLStreamHandle* stream) {
API_BEGIN();
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
*stream = DeviceAPIManager::Get(ctx)->GetStream();
API_END();
......@@ -372,7 +364,7 @@ int DGLGetStream(int device_type, int device_id, DGLStreamHandle* stream) {
int DGLSynchronize(int device_type, int device_id, DGLStreamHandle stream) {
API_BEGIN();
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
DeviceAPIManager::Get(ctx)->StreamSync(ctx, stream);
API_END();
......@@ -384,7 +376,7 @@ int DGLStreamStreamSynchronize(int device_type,
DGLStreamHandle dst) {
API_BEGIN();
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
DeviceAPIManager::Get(ctx)->SyncStreamFromTo(ctx, src, dst);
API_END();
......@@ -408,7 +400,7 @@ int DGLLoadTensorAdapter(const char *path) {
DGL_REGISTER_GLOBAL(dgl::runtime::symbol::dgl_set_device)
.set_body([](DGLArgs args, DGLRetValue *ret) {
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(args[0].operator int());
ctx.device_type = static_cast<DGLDeviceType>(args[0].operator int());
ctx.device_id = args[1];
DeviceAPIManager::Get(ctx)->SetDevice(ctx);
});
......@@ -417,7 +409,7 @@ DGL_REGISTER_GLOBAL(dgl::runtime::symbol::dgl_set_device)
DGL_REGISTER_GLOBAL("_GetDeviceAttr")
.set_body([](DGLArgs args, DGLRetValue *ret) {
DGLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(args[0].operator int());
ctx.device_type = static_cast<DGLDeviceType>(args[0].operator int());
ctx.device_id = args[1];
DeviceAttrKind kind = static_cast<DeviceAttrKind>(args[2].operator int());
......
......@@ -24,7 +24,7 @@ class CPUDeviceAPI final : public DeviceAPI {
void* AllocDataSpace(DGLContext ctx,
size_t nbytes,
size_t alignment,
DGLType type_hint) final {
DGLDataType type_hint) final {
TensorDispatcher* td = TensorDispatcher::Global();
if (td->IsAvailable())
return td->CPUAllocWorkspace(nbytes);
......@@ -62,7 +62,7 @@ class CPUDeviceAPI final : public DeviceAPI {
size_t size,
DGLContext ctx_from,
DGLContext ctx_to,
DGLType type_hint) final {
DGLDataType type_hint) final {
memcpy(static_cast<char*>(to) + to_offset,
static_cast<const char*>(from) + from_offset,
size);
......@@ -73,7 +73,7 @@ class CPUDeviceAPI final : public DeviceAPI {
void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {
}
void* AllocWorkspace(DGLContext ctx, size_t size, DGLType type_hint) final;
void* AllocWorkspace(DGLContext ctx, size_t size, DGLDataType type_hint) final;
void FreeWorkspace(DGLContext ctx, void* data) final;
static const std::shared_ptr<CPUDeviceAPI>& Global() {
......@@ -85,12 +85,12 @@ class CPUDeviceAPI final : public DeviceAPI {
struct CPUWorkspacePool : public WorkspacePool {
CPUWorkspacePool() :
WorkspacePool(kDLCPU, CPUDeviceAPI::Global()) {}
WorkspacePool(kDGLCPU, CPUDeviceAPI::Global()) {}
};
void* CPUDeviceAPI::AllocWorkspace(DGLContext ctx,
size_t size,
DGLType type_hint) {
DGLDataType type_hint) {
TensorDispatcher* td = TensorDispatcher::Global();
if (td->IsAvailable())
return td->CPUAllocWorkspace(size);
......
......@@ -106,7 +106,7 @@ class CUDADeviceAPI final : public DeviceAPI {
void* AllocDataSpace(DGLContext ctx,
size_t nbytes,
size_t alignment,
DGLType type_hint) final {
DGLDataType type_hint) final {
SetDevice(ctx);
// Redirect to PyTorch's allocator when available.
TensorDispatcher* td = TensorDispatcher::Global();
......@@ -136,12 +136,12 @@ class CUDADeviceAPI final : public DeviceAPI {
size_t size,
DGLContext ctx_from,
DGLContext ctx_to,
DGLType type_hint,
DGLDataType type_hint,
DGLStreamHandle stream) {
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
from = static_cast<const char*>(from) + from_offset;
to = static_cast<char*>(to) + to_offset;
if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLGPU) {
if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
if (ctx_from.device_id == ctx_to.device_id) {
GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
......@@ -150,10 +150,10 @@ class CUDADeviceAPI final : public DeviceAPI {
from, ctx_from.device_id,
size, cu_stream));
}
} else if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLCPU) {
} else if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
} else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLGPU) {
} else if (ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) {
CUDA_CALL(cudaSetDevice(ctx_to.device_id));
GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
} else {
......@@ -168,7 +168,7 @@ class CUDADeviceAPI final : public DeviceAPI {
size_t size,
DGLContext ctx_from,
DGLContext ctx_to,
DGLType type_hint) final {
DGLDataType type_hint) final {
auto stream = GetStream();
CopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint, stream);
}
......@@ -269,7 +269,7 @@ class CUDADeviceAPI final : public DeviceAPI {
return result;
}
void* AllocWorkspace(DGLContext ctx, size_t size, DGLType type_hint) final {
void* AllocWorkspace(DGLContext ctx, size_t size, DGLDataType type_hint) final {
SetDevice(ctx);
// Redirect to PyTorch's allocator when available.
TensorDispatcher* td = TensorDispatcher::Global();
......@@ -313,7 +313,7 @@ class CUDADeviceAPI final : public DeviceAPI {
typedef dmlc::ThreadLocalStore<CUDAThreadEntry> CUDAThreadStore;
CUDAThreadEntry::CUDAThreadEntry()
: pool(kDLGPU, CUDADeviceAPI::Global()) {
: pool(kDGLCUDA, CUDADeviceAPI::Global()) {
}
CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
......@@ -328,7 +328,7 @@ cudaStream_t getCurrentCUDAStream() {
return nullptr;
}
DGL_REGISTER_GLOBAL("device_api.gpu")
DGL_REGISTER_GLOBAL("device_api.cuda")
.set_body([](DGLArgs args, DGLRetValue* rv) {
DeviceAPI* ptr = CUDADeviceAPI::Global().get();
*rv = static_cast<void*>(ptr);
......
......@@ -222,8 +222,8 @@ std::pair<IdArray, NDArray> SparsePush(
0,
send_prefix_host.size()*sizeof(*send_prefix.get()),
ctx,
DGLContext{kDLCPU, 0},
DGLType{kDLInt, sizeof(*send_prefix.get())*8, 1});
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, sizeof(*send_prefix.get())*8, 1});
send_prefix.free();
CHECK_EQ(send_prefix_host.back(), num_in) << "Internal Error: "
......@@ -260,8 +260,8 @@ std::pair<IdArray, NDArray> SparsePush(
0,
recv_prefix_host.size()*sizeof(*recv_prefix.get()),
ctx,
DGLContext{kDLCPU, 0},
DGLType{kDLInt, sizeof(*recv_prefix.get())*8, 1});
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, sizeof(*recv_prefix.get())*8, 1});
recv_prefix.free();
// use an event to track when copying is done
......@@ -376,8 +376,8 @@ NDArray SparsePull(
0,
request_prefix_host.size()*sizeof(*request_prefix.get()),
ctx,
DGLContext{kDLCPU, 0},
DGLType{kDLInt, sizeof(*request_prefix.get())*8, 1});
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, sizeof(*request_prefix.get())*8, 1});
request_prefix.free();
CHECK_EQ(request_prefix_host.back(), num_in) << "Internal Error: "
"request_prefix_host.back() = " << request_prefix_host.back() <<
......@@ -411,8 +411,8 @@ NDArray SparsePull(
0,
response_prefix_host.size()*sizeof(*response_prefix.get()),
ctx,
DGLContext{kDLCPU, 0},
DGLType{kDLInt, sizeof(*response_prefix.get())*8, 1});
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, sizeof(*response_prefix.get())*8, 1});
response_prefix.free();
// use an event to track when copying is done
......@@ -617,10 +617,10 @@ void NCCLCommunicator::AllToAllV(
int dev_id;
CUDA_CALL(cudaGetDevice(&dev_id));
DGLContext ctx{kDLGPU, dev_id};
DGLContext ctx{kDGLCUDA, dev_id};
auto device = runtime::DeviceAPI::Get(ctx);
auto dtype = DLDataTypeTraits<DType>::dtype;
auto dtype = DGLDataTypeTraits<DType>::dtype;
// copy using the same stream (local current stream), no need to sync
device->CopyDataFromTo(send, send_prefix[0],
......@@ -679,10 +679,10 @@ void NCCLCommunicator::AllToAll(
#else
int dev_id;
CUDA_CALL(cudaGetDevice(&dev_id));
DGLContext ctx{kDLGPU, dev_id};
DGLContext ctx{kDGLCUDA, dev_id};
auto device = runtime::DeviceAPI::Get(ctx);
auto dtype = DLDataTypeTraits<IdType>::dtype;
auto dtype = DGLDataTypeTraits<IdType>::dtype;
// copy using the same stream (local current stream), no need to sync
device->CopyDataFromTo(send, 0, recv, 0, count, ctx, ctx, dtype);
......
/*!
* Copyright (c) 2022 by Contributors
* \file src/runtime/dlpack_convert.cc
* \brief Conversion between NDArray and DLPack.
*/
#include <dgl/runtime/dlpack_convert.h>
#include <dlpack/dlpack.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/c_runtime_api.h>
#include <dgl/runtime/device_api.h>
#include "runtime_base.h"
// deleter for arrays used by DLPack exporter
extern "C" void NDArrayDLPackDeleter(DLManagedTensor* tensor);
namespace dgl {
namespace runtime {
void NDArrayDLPackDeleter(DLManagedTensor* tensor) {
static_cast<NDArray::Container*>(tensor->manager_ctx)->DecRef();
delete tensor;
}
inline DGLContext ToDGLContext(const DLDevice& device) {
DGLContext ctx;
ctx.device_type = static_cast<DGLDeviceType>(device.device_type);
ctx.device_id = device.device_id;
return ctx;
}
inline DGLDataType ToDGLDataType(const DLDataType& src) {
DGLDataType ret;
ret.code = src.code;
ret.bits = src.bits;
ret.lanes = src.lanes;
return ret;
}
inline DLDevice ToDLDevice(const DGLContext& ctx) {
DLDevice device;
device.device_type = static_cast<DLDeviceType>(ctx.device_type);
device.device_id = ctx.device_id;
return device;
}
inline DLDataType ToDLDataType(const DGLDataType& src) {
DLDataType ret;
ret.code = src.code;
ret.bits = src.bits;
ret.lanes = src.lanes;
return ret;
}
NDArray DLPackConvert::FromDLPack(DLManagedTensor* tensor) {
NDArray::Container* data = new NDArray::Container();
data->deleter = DLPackConvert::DLPackDeleter;
data->manager_ctx = tensor;
data->dl_tensor.data = tensor->dl_tensor.data;
data->dl_tensor.ctx = ToDGLContext(tensor->dl_tensor.device);
data->dl_tensor.ndim = tensor->dl_tensor.ndim;
data->dl_tensor.dtype = ToDGLDataType(tensor->dl_tensor.dtype);
data->dl_tensor.shape = tensor->dl_tensor.shape;
data->dl_tensor.strides = tensor->dl_tensor.strides;
data->dl_tensor.byte_offset = tensor->dl_tensor.byte_offset;
return NDArray(data);
}
void DLPackConvert::DLPackDeleter(NDArray::Container* ptr) {
// if the array is pinned by dgl, unpin it before freeing
if (ptr->pinned_by_dgl_)
NDArray::UnpinContainer(ptr);
DLManagedTensor* tensor = static_cast<DLManagedTensor*>(ptr->manager_ctx);
if (tensor->deleter != nullptr) {
(*tensor->deleter)(tensor);
}
delete ptr;
}
DLManagedTensor* ContainerToDLPack(NDArray::Container* from) {
CHECK(from != nullptr);
DLManagedTensor* ret = new DLManagedTensor();
ret->dl_tensor.data = from->dl_tensor.data;
ret->dl_tensor.device = ToDLDevice(from->dl_tensor.ctx);
ret->dl_tensor.ndim = from->dl_tensor.ndim;
ret->dl_tensor.dtype = ToDLDataType(from->dl_tensor.dtype);
ret->dl_tensor.shape = from->dl_tensor.shape;
ret->dl_tensor.strides = from->dl_tensor.strides;
ret->dl_tensor.byte_offset = from->dl_tensor.byte_offset;
ret->manager_ctx = from;
from->IncRef();
ret->deleter = NDArrayDLPackDeleter;
return ret;
}
DLManagedTensor* DLPackConvert::ToDLPack(const NDArray &from) {
return ContainerToDLPack(from.data_);
}
} // namespace runtime
} // namespace dgl
using namespace dgl::runtime;
void DGLDLManagedTensorCallDeleter(DLManagedTensor* dltensor) {
(*(dltensor->deleter))(dltensor);
}
inline bool IsAligned(const void* ptr, std::uintptr_t alignment) noexcept {
auto iptr = reinterpret_cast<std::uintptr_t>(ptr);
return !(iptr % alignment);
}
int DGLArrayFromDLPack(DLManagedTensor* from,
DGLArrayHandle* out) {
API_BEGIN();
*out = NDArray::Internal::MoveAsDGLArray(DLPackConvert::FromDLPack(from));
API_END();
}
int DGLArrayToDLPack(DGLArrayHandle from, DLManagedTensor** out,
int alignment) {
API_BEGIN();
auto* nd_container = reinterpret_cast<NDArray::Container*>(from);
DGLArray* nd = &(nd_container->dl_tensor);
// If the source DGLArray is not aligned, we should create a new aligned one
if (alignment != 0 && !IsAligned(nd->data, alignment)) {
std::vector<int64_t> shape_vec(nd->shape, nd->shape + nd->ndim);
NDArray copy_ndarray = NDArray::Empty(shape_vec, nd->dtype, nd->ctx);
copy_ndarray.CopyFrom(nd);
*out = DLPackConvert::ToDLPack(copy_ndarray);
} else {
*out = ContainerToDLPack(nd_container);
}
API_END();
}
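
For context, here is a hedged usage sketch of the decoupled conversion layer introduced in dlpack_convert.cc above: round-tripping an NDArray through DLPack via `DLPackConvert::ToDLPack` and `DLPackConvert::FromDLPack`. The function name `RoundTripThroughDLPack` is illustrative only; the sketch relies solely on symbols defined in this file and in ndarray.h.

```cpp
// Illustrative round trip through DLPack (not part of this commit).
#include <dgl/runtime/dlpack_convert.h>
#include <dgl/runtime/ndarray.h>

using namespace dgl::runtime;

void RoundTripThroughDLPack() {
  NDArray a = NDArray::Empty({4}, DGLDataType{kDGLInt, 64, 1},
                             DGLContext{kDGLCPU, 0});
  // Export: the DLManagedTensor holds a reference to a's container
  // (IncRef in ContainerToDLPack) and uses NDArrayDLPackDeleter.
  DLManagedTensor* managed = DLPackConvert::ToDLPack(a);
  // Import: wraps the same memory; when b is released, DLPackDeleter
  // invokes the managed tensor's own deleter, which drops the reference.
  NDArray b = DLPackConvert::FromDLPack(managed);
  // If the tensor were handed to another framework instead, that framework
  // would eventually call the deleter, e.g. via DGLDLManagedTensorCallDeleter.
}
```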
......@@ -17,7 +17,7 @@ namespace runtime {
void FunctionInfo::Save(dmlc::JSONWriter* writer) const {
std::vector<std::string> sarg_types(arg_types.size());
for (size_t i = 0; i < arg_types.size(); ++i) {
sarg_types[i] = DGLType2String(arg_types[i]);
sarg_types[i] = DGLDataType2String(arg_types[i]);
}
writer->BeginObject();
writer->WriteObjectKeyValue("name", name);
......@@ -35,7 +35,7 @@ void FunctionInfo::Load(dmlc::JSONReader* reader) {
helper.ReadAllFields(reader);
arg_types.resize(sarg_types.size());
for (size_t i = 0; i < arg_types.size(); ++i) {
arg_types[i] = String2DGLType(sarg_types[i]);
arg_types[i] = String2DGLDataType(sarg_types[i]);
}
}
......
......@@ -19,7 +19,7 @@ namespace runtime {
/*! \brief function information needed by device */
struct FunctionInfo {
std::string name;
std::vector<DGLType> arg_types;
std::vector<DGLDataType> arg_types;
std::vector<std::string> thread_axis_tags;
void Save(dmlc::JSONWriter *writer) const;
......
......@@ -105,7 +105,7 @@ bool RuntimeEnabled(const std::string& target) {
if (target == "cpu") {
return true;
} else if (target == "cuda" || target == "gpu") {
f_name = "device_api.gpu";
f_name = "device_api.cuda";
} else if (target == "cl" || target == "opencl" || target == "sdaccel") {
f_name = "device_api.opencl";
} else if (target == "gl" || target == "opengl") {
......@@ -121,7 +121,7 @@ bool RuntimeEnabled(const std::string& target) {
} else if (target == "vpi" || target == "verilog") {
f_name = "device_api.vpi";
} else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") {
f_name = "device_api.gpu";
f_name = "device_api.cuda";
} else if (target.length() >= 4 && target.substr(0, 4) == "rocm") {
f_name = "device_api.rocm";
} else if (target.length() >= 4 && target.substr(0, 4) == "llvm") {
......
......@@ -13,28 +13,25 @@
#include <dgl/runtime/tensordispatch.h>
#include "runtime_base.h"
// deleter for arrays used by DLPack exporter
extern "C" void NDArrayDLPackDeleter(DLManagedTensor* tensor);
namespace dgl {
constexpr DLDataType DLDataTypeTraits<int8_t>::dtype;
constexpr DLDataType DLDataTypeTraits<int16_t>::dtype;
constexpr DLDataType DLDataTypeTraits<int32_t>::dtype;
constexpr DLDataType DLDataTypeTraits<int64_t>::dtype;
constexpr DLDataType DLDataTypeTraits<uint32_t>::dtype;
constexpr DLDataType DLDataTypeTraits<uint64_t>::dtype;
constexpr DGLDataType DGLDataTypeTraits<int8_t>::dtype;
constexpr DGLDataType DGLDataTypeTraits<int16_t>::dtype;
constexpr DGLDataType DGLDataTypeTraits<int32_t>::dtype;
constexpr DGLDataType DGLDataTypeTraits<int64_t>::dtype;
constexpr DGLDataType DGLDataTypeTraits<uint32_t>::dtype;
constexpr DGLDataType DGLDataTypeTraits<uint64_t>::dtype;
#ifdef USE_FP16
constexpr DLDataType DLDataTypeTraits<__half>::dtype;
constexpr DGLDataType DGLDataTypeTraits<__half>::dtype;
#endif
constexpr DLDataType DLDataTypeTraits<float>::dtype;
constexpr DLDataType DLDataTypeTraits<double>::dtype;
constexpr DGLDataType DGLDataTypeTraits<float>::dtype;
constexpr DGLDataType DGLDataTypeTraits<double>::dtype;
namespace runtime {
inline void VerifyDataType(DLDataType dtype) {
inline void VerifyDataType(DGLDataType dtype) {
CHECK_GE(dtype.lanes, 1);
if (dtype.code == kDLFloat) {
if (dtype.code == kDGLFloat) {
CHECK_EQ(dtype.bits % 8, 0);
} else {
CHECK_EQ(dtype.bits % 8, 0);
......@@ -42,7 +39,7 @@ inline void VerifyDataType(DLDataType dtype) {
CHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
}
inline size_t GetDataSize(const DLTensor& arr) {
inline size_t GetDataSize(const DGLArray& arr) {
size_t size = 1;
for (dgl_index_t i = 0; i < arr.ndim; ++i) {
size *= arr.shape[i];
......@@ -51,91 +48,61 @@ inline size_t GetDataSize(const DLTensor& arr) {
return size;
}
inline size_t GetDataAlignment(const DLTensor& arr) {
inline size_t GetDataAlignment(const DGLArray& arr) {
size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes;
if (align < kAllocAlignment) return kAllocAlignment;
return align;
}
struct NDArray::Internal {
// Default deleter for the container
static void DefaultDeleter(NDArray::Container* ptr) {
using dgl::runtime::NDArray;
if (ptr->manager_ctx != nullptr) {
static_cast<NDArray::Container*>(ptr->manager_ctx)->DecRef();
} else if (ptr->mem) {
ptr->mem = nullptr;
} else if (ptr->dl_tensor.data != nullptr) {
// if the array is still pinned before freeing, unpin it.
if (ptr->pinned_by_dgl_)
UnpinContainer(ptr);
dgl::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx)->FreeDataSpace(
ptr->dl_tensor.ctx, ptr->dl_tensor.data);
}
delete ptr;
}
// Deleter for NDArray converted from DLPack
// This is used from data which is passed from external DLPack(DLManagedTensor)
// that are not allocated inside of DGL.
// This enables us to create NDArray from memory allocated by other
// frameworks that are DLPack compatible
static void DLPackDeleter(NDArray::Container* ptr) {
// if the array is pinned by dgl, unpin it before freeing
void NDArray::Internal::DefaultDeleter(NDArray::Container* ptr) {
using dgl::runtime::NDArray;
if (ptr->manager_ctx != nullptr) {
static_cast<NDArray::Container*>(ptr->manager_ctx)->DecRef();
} else if (ptr->mem) {
ptr->mem = nullptr;
} else if (ptr->dl_tensor.data != nullptr) {
// if the array is still pinned before freeing, unpin it.
if (ptr->pinned_by_dgl_)
UnpinContainer(ptr);
DLManagedTensor* tensor = static_cast<DLManagedTensor*>(ptr->manager_ctx);
if (tensor->deleter != nullptr) {
(*tensor->deleter)(tensor);
}
delete ptr;
}
// Local create function which allocates tensor metadata
// but does not allocate space for the data.
static NDArray Create(std::vector<int64_t> shape,
DLDataType dtype,
DLContext ctx) {
VerifyDataType(dtype);
// critical zone
NDArray::Container* data = new NDArray::Container();
data->deleter = DefaultDeleter;
NDArray ret(data);
ret.data_ = data;
// RAII now in effect
// setup shape
data->shape_ = std::move(shape);
data->dl_tensor.shape = dmlc::BeginPtr(data->shape_);
data->dl_tensor.ndim = static_cast<int>(data->shape_.size());
// setup stride (this should be optional, but some framework
// does not support NULL stride and thus will crash the program).
data->stride_.resize(data->dl_tensor.ndim, 1);
for (int i = data->dl_tensor.ndim - 2; i >= 0; --i) {
data->stride_[i] = data->shape_[i+1] * data->stride_[i+1];
}
data->dl_tensor.strides = dmlc::BeginPtr(data->stride_);
// setup dtype
data->dl_tensor.dtype = dtype;
// setup ctx
data->dl_tensor.ctx = ctx;
return ret;
}
// Implementation of API function
static DLTensor* MoveAsDLTensor(NDArray arr) {
DLTensor* tensor = reinterpret_cast<DLTensor*>(arr.data_);
CHECK(tensor == const_cast<DLTensor*>(arr.operator->()));
arr.data_ = nullptr;
return tensor;
dgl::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx)->FreeDataSpace(
ptr->dl_tensor.ctx, ptr->dl_tensor.data);
}
// Container to DLManagedTensor
static DLManagedTensor* ToDLPack(NDArray::Container* from) {
CHECK(from != nullptr);
DLManagedTensor* ret = new DLManagedTensor();
ret->dl_tensor = from->dl_tensor;
ret->manager_ctx = from;
from->IncRef();
ret->deleter = NDArrayDLPackDeleter;
return ret;
delete ptr;
}
NDArray NDArray::Internal::Create(std::vector<int64_t> shape,
DGLDataType dtype, DGLContext ctx) {
VerifyDataType(dtype);
// critical zone
NDArray::Container* data = new NDArray::Container();
data->deleter = DefaultDeleter;
NDArray ret(data);
ret.data_ = data;
// RAII now in effect
// setup shape
data->shape_ = std::move(shape);
data->dl_tensor.shape = dmlc::BeginPtr(data->shape_);
data->dl_tensor.ndim = static_cast<int>(data->shape_.size());
// setup stride (this should be optional, but some framework
// does not support NULL stride and thus will crash the program).
data->stride_.resize(data->dl_tensor.ndim, 1);
for (int i = data->dl_tensor.ndim - 2; i >= 0; --i) {
data->stride_[i] = data->shape_[i+1] * data->stride_[i+1];
}
};
data->dl_tensor.strides = dmlc::BeginPtr(data->stride_);
// setup dtype
data->dl_tensor.dtype = dtype;
// setup ctx
data->dl_tensor.ctx = ctx;
return ret;
}
DGLArray* NDArray::Internal::MoveAsDGLArray(NDArray arr) {
DGLArray* tensor = reinterpret_cast<DGLArray*>(arr.data_);
CHECK(tensor == const_cast<DGLArray*>(arr.operator->()));
arr.data_ = nullptr;
return tensor;
}
size_t NDArray::GetSize() const {
return GetDataSize(data_->dl_tensor);
......@@ -170,7 +137,7 @@ bool NDArray::IsContiguous() const {
}
NDArray NDArray::CreateView(std::vector<int64_t> shape,
DLDataType dtype,
DGLDataType dtype,
int64_t offset) {
CHECK(data_ != nullptr);
CHECK(IsContiguous()) << "Can only create view for compact tensor";
......@@ -189,14 +156,10 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape,
return ret;
}
DLManagedTensor* NDArray::ToDLPack() const {
return Internal::ToDLPack(data_);
}
NDArray NDArray::EmptyShared(const std::string &name,
std::vector<int64_t> shape,
DLDataType dtype,
DLContext ctx, bool is_create) {
DGLDataType dtype,
DGLContext ctx, bool is_create) {
NDArray ret = Internal::Create(shape, dtype, ctx);
// setup memory content
size_t size = GetDataSize(ret.data_->dl_tensor);
......@@ -212,8 +175,8 @@ NDArray NDArray::EmptyShared(const std::string &name,
}
NDArray NDArray::Empty(std::vector<int64_t> shape,
DLDataType dtype,
DLContext ctx) {
DGLDataType dtype,
DGLContext ctx) {
NDArray ret = Internal::Create(shape, dtype, ctx);
// setup memory content
size_t size = GetDataSize(ret.data_->dl_tensor);
......@@ -225,30 +188,21 @@ NDArray NDArray::Empty(std::vector<int64_t> shape,
return ret;
}
NDArray NDArray::FromDLPack(DLManagedTensor* tensor) {
NDArray::Container* data = new NDArray::Container();
data->deleter = Internal::DLPackDeleter;
data->manager_ctx = tensor;
data->dl_tensor = tensor->dl_tensor;
return NDArray(data);
}
void NDArray::CopyFromTo(DLTensor* from,
DLTensor* to) {
void NDArray::CopyFromTo(DGLArray* from,
DGLArray* to) {
size_t from_size = GetDataSize(*from);
size_t to_size = GetDataSize(*to);
CHECK_EQ(from_size, to_size)
<< "DGLArrayCopyFromTo: The size must exactly match";
CHECK(from->ctx.device_type == to->ctx.device_type
|| from->ctx.device_type == kDLCPU
|| to->ctx.device_type == kDLCPU)
|| from->ctx.device_type == kDGLCPU
|| to->ctx.device_type == kDGLCPU)
<< "Can not copy across different ctx types directly";
// Use the context that is *not* a cpu context to get the correct device
// api manager.
DGLContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx;
DGLContext ctx = from->ctx.device_type != kDGLCPU ? from->ctx : to->ctx;
// default: local current cuda stream
DeviceAPI::Get(ctx)->CopyDataFromTo(
......@@ -260,9 +214,9 @@ void NDArray::CopyFromTo(DLTensor* from,
void NDArray::PinContainer(NDArray::Container* ptr) {
if (IsContainerPinned(ptr)) return;
auto* tensor = &(ptr->dl_tensor);
CHECK_EQ(tensor->ctx.device_type, kDLCPU)
CHECK_EQ(tensor->ctx.device_type, kDGLCPU)
<< "Only NDArray on CPU can be pinned";
DeviceAPI::Get(kDLGPU)->PinData(tensor->data, GetDataSize(*tensor));
DeviceAPI::Get(kDGLCUDA)->PinData(tensor->data, GetDataSize(*tensor));
ptr->pinned_by_dgl_ = true;
}
......@@ -275,22 +229,22 @@ void NDArray::UnpinContainer(NDArray::Container* ptr) {
// 1. not pinned, do nothing
if (!container_is_pinned) return;
// 2. pinned by DGL, unpin it
DeviceAPI::Get(kDLGPU)->UnpinData(ptr->dl_tensor.data);
DeviceAPI::Get(kDGLCUDA)->UnpinData(ptr->dl_tensor.data);
ptr->pinned_by_dgl_ = false;
}
void NDArray::RecordStream(DGLArray* tensor, DGLStreamHandle stream) {
TensorDispatcher* td = TensorDispatcher::Global();
CHECK(td->IsAvailable()) << "RecordStream only works when TensorAdaptor is available.";
CHECK_EQ(tensor->ctx.device_type, kDLGPU)
CHECK_EQ(tensor->ctx.device_type, kDGLCUDA)
<< "RecordStream only works with GPU tensors.";
td->RecordStream(tensor->data, stream, tensor->ctx.device_id);
}
template<typename T>
NDArray NDArray::FromVector(const std::vector<T>& vec, DLContext ctx) {
const DLDataType dtype = DLDataTypeTraits<T>::dtype;
NDArray NDArray::FromVector(const std::vector<T>& vec, DGLContext ctx) {
const DGLDataType dtype = DGLDataTypeTraits<T>::dtype;
int64_t size = static_cast<int64_t>(vec.size());
NDArray ret = NDArray::Empty({size}, dtype, ctx);
DeviceAPI::Get(ctx)->CopyDataFromTo(
......@@ -299,29 +253,38 @@ NDArray NDArray::FromVector(const std::vector<T>& vec, DLContext ctx) {
static_cast<T*>(ret->data),
0,
size * sizeof(T),
DLContext{kDLCPU, 0},
DGLContext{kDGLCPU, 0},
ctx,
dtype);
return ret;
}
NDArray NDArray::CreateFromRaw(const std::vector<int64_t>& shape,
DGLDataType dtype, DGLContext ctx, void* raw, bool auto_free) {
NDArray ret = Internal::Create(shape, dtype, ctx);
ret.data_->dl_tensor.data = raw;
if (!auto_free)
ret.data_->deleter = nullptr;
return ret;
}
// export specializations
template NDArray NDArray::FromVector<int32_t>(const std::vector<int32_t>&, DLContext);
template NDArray NDArray::FromVector<int64_t>(const std::vector<int64_t>&, DLContext);
template NDArray NDArray::FromVector<uint32_t>(const std::vector<uint32_t>&, DLContext);
template NDArray NDArray::FromVector<uint64_t>(const std::vector<uint64_t>&, DLContext);
template NDArray NDArray::FromVector<float>(const std::vector<float>&, DLContext);
template NDArray NDArray::FromVector<double>(const std::vector<double>&, DLContext);
template NDArray NDArray::FromVector<int32_t>(const std::vector<int32_t>&, DGLContext);
template NDArray NDArray::FromVector<int64_t>(const std::vector<int64_t>&, DGLContext);
template NDArray NDArray::FromVector<uint32_t>(const std::vector<uint32_t>&, DGLContext);
template NDArray NDArray::FromVector<uint64_t>(const std::vector<uint64_t>&, DGLContext);
template NDArray NDArray::FromVector<float>(const std::vector<float>&, DGLContext);
template NDArray NDArray::FromVector<double>(const std::vector<double>&, DGLContext);
template<typename T>
std::vector<T> NDArray::ToVector() const {
const DLDataType dtype = DLDataTypeTraits<T>::dtype;
const DGLDataType dtype = DGLDataTypeTraits<T>::dtype;
CHECK(data_->dl_tensor.ndim == 1) << "ToVector() only supported for 1D arrays";
CHECK(data_->dl_tensor.dtype == dtype) << "dtype mismatch";
int64_t size = data_->dl_tensor.shape[0];
std::vector<T> vec(size);
const DLContext &ctx = data_->dl_tensor.ctx;
const DGLContext &ctx = data_->dl_tensor.ctx;
DeviceAPI::Get(ctx)->CopyDataFromTo(
static_cast<T*>(data_->dl_tensor.data),
0,
......@@ -329,7 +292,7 @@ std::vector<T> NDArray::ToVector() const {
0,
size * sizeof(T),
ctx,
DLContext{kDLCPU, 0},
DGLContext{kDGLCPU, 0},
dtype);
return vec;
}
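
As a side note, a small hedged sketch of the FromVector / ToVector helpers after the rename; the function name `VectorRoundTrip` and the values are illustrative, and only the signatures shown above are assumed.

```cpp
// Illustrative only; uses the FromVector / ToVector helpers defined above.
#include <dgl/runtime/ndarray.h>
#include <vector>

using namespace dgl::runtime;

void VectorRoundTrip() {
  std::vector<int64_t> src = {1, 2, 3};
  // Copies the host vector into a fresh 1-D int64 NDArray on the CPU.
  NDArray arr = NDArray::FromVector(src, DGLContext{kDGLCPU, 0});
  // Copies the data back out; only valid for 1-D arrays of matching dtype.
  std::vector<int64_t> back = arr.ToVector<int64_t>();
}
```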
......@@ -350,10 +313,10 @@ bool NDArray::IsContainerPinned(NDArray::Container* ptr) {
return true;
auto* tensor = &(ptr->dl_tensor);
// Can only be pinned if on CPU...
if (tensor->ctx.device_type != kDLCPU)
if (tensor->ctx.device_type != kDGLCPU)
return false;
// ... and CUDA device API is enabled, and the tensor is indeed in pinned memory.
auto device = DeviceAPI::Get(kDLGPU, true);
auto device = DeviceAPI::Get(kDGLCUDA, true);
return device && device->IsPinned(tensor->data);
}
......@@ -363,7 +326,7 @@ void NDArray::Save(dmlc::Stream* strm) const {
zc_strm->PushNDArray(*this);
return;
}
SaveDLTensor(strm, const_cast<DLTensor*>(operator->()));
SaveDGLArray(strm, const_cast<DGLArray*>(operator->()));
}
bool NDArray::Load(dmlc::Stream* strm) {
......@@ -374,26 +337,26 @@ bool NDArray::Load(dmlc::Stream* strm) {
}
uint64_t header, reserved;
CHECK(strm->Read(&header))
<< "Invalid DLTensor file format";
<< "Invalid DGLArray file format";
CHECK(strm->Read(&reserved))
<< "Invalid DLTensor file format";
<< "Invalid DGLArray file format";
CHECK(header == kDGLNDArrayMagic)
<< "Invalid DLTensor file format";
DLContext ctx;
<< "Invalid DGLArray file format";
DGLContext ctx;
int ndim;
DLDataType dtype;
DGLDataType dtype;
CHECK(strm->Read(&ctx))
<< "Invalid DLTensor file format";
<< "Invalid DGLArray file format";
CHECK(strm->Read(&ndim))
<< "Invalid DLTensor file format";
<< "Invalid DGLArray file format";
CHECK(strm->Read(&dtype))
<< "Invalid DLTensor file format";
CHECK_EQ(ctx.device_type, kDLCPU)
<< "Invalid DLTensor context: can only save as CPU tensor";
<< "Invalid DGLArray file format";
CHECK_EQ(ctx.device_type, kDGLCPU)
<< "Invalid DGLArray context: can only save as CPU tensor";
std::vector<int64_t> shape(ndim);
if (ndim != 0) {
CHECK(strm->ReadArray(&shape[0], ndim))
<< "Invalid DLTensor file format";
<< "Invalid DGLArray file format";
}
NDArray ret = NDArray::Empty(shape, dtype, ctx);
int64_t num_elems = 1;
......@@ -403,14 +366,14 @@ bool NDArray::Load(dmlc::Stream* strm) {
}
int64_t data_byte_size;
CHECK(strm->Read(&data_byte_size))
<< "Invalid DLTensor file format";
<< "Invalid DGLArray file format";
CHECK(data_byte_size == num_elems * elem_bytes)
<< "Invalid DLTensor file format";
<< "Invalid DGLArray file format";
if (data_byte_size != 0) {
// strm->Read will return the total number of elements successfully read.
// Therefore if data_byte_size is zero, the CHECK below would fail.
CHECK(strm->Read(ret->data, data_byte_size))
<< "Invalid DLTensor file format";
<< "Invalid DGLArray file format";
}
if (!DMLC_IO_NO_ENDIAN_SWAP) {
dmlc::ByteSwap(ret->data, elem_bytes, num_elems);
......@@ -425,11 +388,6 @@ bool NDArray::Load(dmlc::Stream* strm) {
using namespace dgl::runtime;
void NDArrayDLPackDeleter(DLManagedTensor* tensor) {
static_cast<NDArray::Container*>(tensor->manager_ctx)->DecRef();
delete tensor;
}
int DGLArrayAlloc(const dgl_index_t* shape,
int ndim,
int dtype_code,
......@@ -439,14 +397,14 @@ int DGLArrayAlloc(const dgl_index_t* shape,
int device_id,
DGLArrayHandle* out) {
API_BEGIN();
DLDataType dtype;
DGLDataType dtype;
dtype.code = static_cast<uint8_t>(dtype_code);
dtype.bits = static_cast<uint8_t>(dtype_bits);
dtype.lanes = static_cast<uint16_t>(dtype_lanes);
DLContext ctx;
ctx.device_type = static_cast<DLDeviceType>(device_type);
DGLContext ctx;
ctx.device_type = static_cast<DGLDeviceType>(device_type);
ctx.device_id = device_id;
*out = NDArray::Internal::MoveAsDLTensor(
*out = NDArray::Internal::MoveAsDGLArray(
NDArray::Empty(std::vector<int64_t>(shape, shape + ndim), dtype, ctx));
API_END();
}
......@@ -460,14 +418,14 @@ int DGLArrayAllocSharedMem(const char *mem_name,
bool is_create,
DGLArrayHandle* out) {
API_BEGIN();
DLDataType dtype;
DGLDataType dtype;
dtype.code = static_cast<uint8_t>(dtype_code);
dtype.bits = static_cast<uint8_t>(dtype_bits);
dtype.lanes = static_cast<uint16_t>(dtype_lanes);
std::vector<int64_t> shape_vec(shape, shape + ndim);
NDArray arr = NDArray::EmptyShared(mem_name, shape_vec, dtype,
DLContext{kDLCPU, 0}, is_create);
*out = NDArray::Internal::MoveAsDLTensor(arr);
DGLContext{kDGLCPU, 0}, is_create);
*out = NDArray::Internal::MoveAsDGLArray(arr);
API_END();
}
......@@ -484,44 +442,12 @@ int DGLArrayCopyFromTo(DGLArrayHandle from,
API_END();
}
int DGLArrayFromDLPack(DLManagedTensor* from,
DGLArrayHandle* out) {
API_BEGIN();
*out = NDArray::Internal::MoveAsDLTensor(NDArray::FromDLPack(from));
API_END();
}
inline bool is_aligned(const void* ptr, std::uintptr_t alignment) noexcept {
auto iptr = reinterpret_cast<std::uintptr_t>(ptr);
return !(iptr % alignment);
}
int DGLArrayToDLPack(DGLArrayHandle from, DLManagedTensor** out,
int alignment) {
API_BEGIN();
auto* nd_container = reinterpret_cast<NDArray::Container*>(from);
DLTensor* nd = &(nd_container->dl_tensor);
if (alignment != 0 && !is_aligned(nd->data, alignment)) {
std::vector<int64_t> shape_vec(nd->shape, nd->shape + nd->ndim);
NDArray copy_ndarray = NDArray::Empty(shape_vec, nd->dtype, nd->ctx);
copy_ndarray.CopyFrom(nd);
*out = copy_ndarray.ToDLPack();
} else {
*out = NDArray::Internal::ToDLPack(nd_container);
}
API_END();
}
void DGLDLManagedTensorCallDeleter(DLManagedTensor* dltensor) {
(*(dltensor->deleter))(dltensor);
}
int DGLArrayCopyFromBytes(DGLArrayHandle handle,
void* data,
size_t nbytes) {
API_BEGIN();
DGLContext cpu_ctx;
cpu_ctx.device_type = kDLCPU;
cpu_ctx.device_type = kDGLCPU;
cpu_ctx.device_id = 0;
size_t arr_size = GetDataSize(*handle);
CHECK_EQ(arr_size, nbytes)
......@@ -538,7 +464,7 @@ int DGLArrayCopyToBytes(DGLArrayHandle handle,
size_t nbytes) {
API_BEGIN();
DGLContext cpu_ctx;
cpu_ctx.device_type = kDLCPU;
cpu_ctx.device_type = kDGLCPU;
cpu_ctx.device_id = 0;
size_t arr_size = GetDataSize(*handle);
CHECK_EQ(arr_size, nbytes)
......@@ -551,7 +477,7 @@ int DGLArrayCopyToBytes(DGLArrayHandle handle,
}
int DGLArrayPinData(DGLArrayHandle handle,
DLContext ctx) {
DGLContext ctx) {
API_BEGIN();
auto* nd_container = reinterpret_cast<NDArray::Container*>(handle);
NDArray::PinContainer(nd_container);
......@@ -559,7 +485,7 @@ int DGLArrayPinData(DGLArrayHandle handle,
}
int DGLArrayUnpinData(DGLArrayHandle handle,
DLContext ctx) {
DGLContext ctx) {
API_BEGIN();
auto* nd_container = reinterpret_cast<NDArray::Container*>(handle);
NDArray::UnpinContainer(nd_container);
......
......@@ -39,7 +39,7 @@ union ArgUnion {
* \return The wrapped packed function.
*/
template<typename F>
inline PackedFunc PackFuncVoidAddr(F f, const std::vector<DGLType>& arg_types);
inline PackedFunc PackFuncVoidAddr(F f, const std::vector<DGLDataType>& arg_types);
/*!
* \brief Create a packed function that from function only packs buffer arguments.
*
......@@ -50,7 +50,7 @@ inline PackedFunc PackFuncVoidAddr(F f, const std::vector<DGLType>& arg_types);
* \return The wrapped packed function.
*/
template<typename F>
inline PackedFunc PackFuncNonBufferArg(F f, const std::vector<DGLType>& arg_types);
inline PackedFunc PackFuncNonBufferArg(F f, const std::vector<DGLDataType>& arg_types);
/*!
* \brief Create a packed function that from function that takes a packed arguments.
*
......@@ -61,13 +61,13 @@ inline PackedFunc PackFuncNonBufferArg(F f, const std::vector<DGLType>& arg_type
* \return The wrapped packed function.
*/
template<typename F>
inline PackedFunc PackFuncPackedArg(F f, const std::vector<DGLType>& arg_types);
inline PackedFunc PackFuncPackedArg(F f, const std::vector<DGLDataType>& arg_types);
/*!
* \brief Extract number of buffer argument from the argument types.
* \param arg_types The argument types.
* \return number of buffer arguments
*/
inline size_t NumBufferArgs(const std::vector<DGLType>& arg_types);
inline size_t NumBufferArgs(const std::vector<DGLDataType>& arg_types);
// implementations details
namespace detail {
......@@ -102,15 +102,15 @@ enum ArgConvertCode {
HANDLE_TO_HANDLE
};
inline ArgConvertCode GetArgConvertCode(DGLType t) {
inline ArgConvertCode GetArgConvertCode(DGLDataType t) {
CHECK_EQ(t.lanes, 1U)
<< "Cannot pass vector type argument to devic function for now";
if (t.code == kDLInt) {
if (t.code == kDGLInt) {
if (t.bits == 64U) return INT64_TO_INT64;
if (t.bits == 32U) return INT64_TO_INT32;
} else if (t.code == kDLUInt) {
} else if (t.code == kDGLUInt) {
if (t.bits == 32U) return INT64_TO_UINT32;
} else if (t.code == kDLFloat) {
} else if (t.code == kDGLFloat) {
if (t.bits == 64U) return FLOAT64_TO_FLOAT64;
if (t.bits == 32U) return FLOAT64_TO_FLOAT32;
} else if (t.code == kHandle) {
......@@ -245,7 +245,7 @@ inline PackedFunc PackFuncPackedArg_(
} // namespace detail
template<typename F>
inline PackedFunc PackFuncVoidAddr(F f, const std::vector<DGLType>& arg_types) {
inline PackedFunc PackFuncVoidAddr(F f, const std::vector<DGLDataType>& arg_types) {
std::vector<detail::ArgConvertCode> codes(arg_types.size());
for (size_t i = 0; i < arg_types.size(); ++i) {
codes[i] = detail::GetArgConvertCode(arg_types[i]);
......@@ -261,7 +261,7 @@ inline PackedFunc PackFuncVoidAddr(F f, const std::vector<DGLType>& arg_types) {
}
}
inline size_t NumBufferArgs(const std::vector<DGLType>& arg_types) {
inline size_t NumBufferArgs(const std::vector<DGLDataType>& arg_types) {
size_t base = arg_types.size();
for (size_t i = 0; i < arg_types.size(); ++i) {
if (arg_types[i].code != kHandle) {
......@@ -276,7 +276,7 @@ inline size_t NumBufferArgs(const std::vector<DGLType>& arg_types) {
}
template<typename F>
inline PackedFunc PackFuncNonBufferArg(F f, const std::vector<DGLType>& arg_types) {
inline PackedFunc PackFuncNonBufferArg(F f, const std::vector<DGLDataType>& arg_types) {
size_t num_buffer = NumBufferArgs(arg_types);
std::vector<detail::ArgConvertCode> codes;
for (size_t i = num_buffer; i < arg_types.size(); ++i) {
......@@ -293,7 +293,7 @@ inline PackedFunc PackFuncNonBufferArg(F f, const std::vector<DGLType>& arg_type
}
template<typename F>
inline PackedFunc PackFuncPackedArg(F f, const std::vector<DGLType>& arg_types) {
inline PackedFunc PackFuncPackedArg(F f, const std::vector<DGLDataType>& arg_types) {
std::vector<detail::ArgConvertCode> codes;
for (size_t i = 0; i < arg_types.size(); ++i) {
codes.push_back(detail::GetArgConvertCode(arg_types[i]));
......