[Refactor] Immutable graph index (#543)

* WIP * header * WIP .cc * WIP * transpose * wip * immutable graph .h and .cc * WIP: nodeflow.cc * compile * remove all tmp dl managed ctx; they caused refcount issue * one simple test * WIP: testing * test_graph * fix graph index * fix bug in sampler; pass pytorch utest * WIP on mxnet * fix lint * fix mxnet unittest w/ unfortunate workaround * fix msvc * fix lint * SliceRows and test_nodeflow * resolve reviews * resolve reviews * try fix win ci * try fix win ci * poke win ci again * poke * lazy multigraph flag; stackoverflow error * revert node subgraph test * lazy object * try fix win build * try fix win build * poke ci * fix build script * fix compile * add a todo * fix reviews * fix compile

[Refactor] Immutable graph index (#543)
* WIP * header * WIP .cc * WIP * transpose * wip * immutable graph .h and .cc * WIP: nodeflow.cc * compile * remove all tmp dl managed ctx; they caused refcount issue * one simple test * WIP: testing * test_graph * fix graph index * fix bug in sampler; pass pytorch utest * WIP on mxnet * fix lint * fix mxnet unittest w/ unfortunate workaround * fix msvc * fix lint * SliceRows and test_nodeflow * resolve reviews * resolve reviews * try fix win ci * try fix win ci * poke win ci again * poke * lazy multigraph flag; stackoverflow error * revert node subgraph test * lazy object * try fix win build * try fix win build * poke ci * fix build script * fix compile * add a todo * fix reviews * fix compile
605b5185 · Minjie Wang · GitHub · b2b8be25 · 605b5185 · 605b5185
Unverified Commit 605b5185 authored May 21, 2019 by Minjie Wang Committed by GitHub May 21, 2019
8 changed files
--- a/src/graph/randomwalk.cc
+++ b/src/graph/randomwalk.cc
@@ -219,7 +219,7 @@ RandomWalkTraces BipartiteSingleSidedRandomWalkWithRestart(
 DGL_REGISTER_GLOBAL("randomwalk._CAPI_DGLRandomWalk")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    GraphHandle ghandle = args[0];
-    const IdArray seeds = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray seeds = args[1];
    const int num_traces = args[2];
    const int num_hops = args[3];
    const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);
@@ -230,7 +230,7 @@ DGL_REGISTER_GLOBAL("randomwalk._CAPI_DGLRandomWalk")
 DGL_REGISTER_GLOBAL("randomwalk._CAPI_DGLRandomWalkWithRestart")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    GraphHandle ghandle = args[0];
-    const IdArray seeds = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray seeds = args[1];
    const double restart_prob = args[2];
    const uint64_t visit_threshold_per_seed = args[3];
    const uint64_t max_visit_counts = args[4];
@@ -245,7 +245,7 @@ DGL_REGISTER_GLOBAL("randomwalk._CAPI_DGLRandomWalkWithRestart")
 DGL_REGISTER_GLOBAL("randomwalk._CAPI_DGLBipartiteSingleSidedRandomWalkWithRestart")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    GraphHandle ghandle = args[0];
-    const IdArray seeds = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray seeds = args[1];
    const double restart_prob = args[2];
    const uint64_t visit_threshold_per_seed = args[3];
    const uint64_t max_visit_counts = args[4];

--- a/src/graph/sampler.cc
+++ b/src/graph/sampler.cc
@@ -248,14 +248,10 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
                           int64_t num_edges, int num_hops, bool is_multigraph) {
  NodeFlow nf;
  uint64_t num_vertices = sub_vers->size();
-  nf.node_mapping = IdArray::Empty({static_cast<int64_t>(num_vertices)},
-                                   DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
-  nf.edge_mapping = IdArray::Empty({static_cast<int64_t>(num_edges)},
-                                   DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
-  nf.layer_offsets = IdArray::Empty({static_cast<int64_t>(num_hops + 1)},
-                                    DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
-  nf.flow_offsets = IdArray::Empty({static_cast<int64_t>(num_hops)},
-                                    DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
+  nf.node_mapping = NewIdArray(num_vertices);
+  nf.edge_mapping = NewIdArray(num_edges);
+  nf.layer_offsets = NewIdArray(num_hops + 1);
+  nf.flow_offsets = NewIdArray(num_hops);

  dgl_id_t *node_map_data = static_cast<dgl_id_t *>(nf.node_mapping->data);
  dgl_id_t *layer_off_data = static_cast<dgl_id_t *>(nf.layer_offsets->data);
@@ -263,11 +259,11 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
  dgl_id_t *edge_map_data = static_cast<dgl_id_t *>(nf.edge_mapping->data);

  // Construct sub_csr_graph
-  auto subg_csr = std::make_shared<ImmutableGraph::CSR>(num_vertices, num_edges);
-  subg_csr->indices.resize(num_edges);
-  subg_csr->edge_ids.resize(num_edges);
-  dgl_id_t* col_list_out = subg_csr->indices.data();
-  int64_t* indptr_out = subg_csr->indptr.data();
+  // TODO(minjie): is nodeflow a multigraph?
+  auto subg_csr = CSRPtr(new CSR(num_vertices, num_edges, is_multigraph));
+  dgl_id_t* indptr_out = static_cast<dgl_id_t*>(subg_csr->indptr()->data);
+  dgl_id_t* col_list_out = static_cast<dgl_id_t*>(subg_csr->indices()->data);
+  dgl_id_t* eid_out = static_cast<dgl_id_t*>(subg_csr->edge_ids()->data);
  size_t collected_nedges = 0;

  // The data from the previous steps:
@@ -303,10 +299,8 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
  // When we expose the sampled graph to a Python user, we say the input nodes
  // are in the first layer and the seed nodes are in the last layer.
  // Thus, when we copy sampled results to a CSR, we need to reverse the order of layers.
-  size_t row_idx = 0;
-  for (size_t i = layer_offsets[num_hops - 1]; i < layer_offsets[num_hops]; i++) {
-    indptr_out[row_idx++] = 0;
-  }
+  std::fill(indptr_out, indptr_out + num_vertices + 1, 0);
+  size_t row_idx = layer_offsets[num_hops] - layer_offsets[num_hops - 1];
  layer_off_data[0] = 0;
  layer_off_data[1] = layer_offsets[num_hops] - layer_offsets[num_hops - 1];
  int out_layer_idx = 1;
@@ -322,51 +316,49 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
      CHECK_EQ(dst_id, neigh_pos->at(i).id);
      size_t pos = neigh_pos->at(i).pos;
      CHECK_LE(pos, neighbor_list.size());
-      size_t num_edges = neigh_pos->at(i).num_edges;
-      if (neighbor_list.empty()) CHECK_EQ(num_edges, 0);
+      const size_t nedges = neigh_pos->at(i).num_edges;
+      if (neighbor_list.empty()) CHECK_EQ(nedges, 0);

      // We need to map the Ids of the neighbors to the subgraph.
      auto neigh_it = neighbor_list.begin() + pos;
-      for (size_t i = 0; i < num_edges; i++) {
+      for (size_t i = 0; i < nedges; i++) {
        dgl_id_t neigh = *(neigh_it + i);
        CHECK(layer_ver_maps[layer_id + 1].find(neigh) != layer_ver_maps[layer_id + 1].end());
        col_list_out[collected_nedges + i] = layer_ver_maps[layer_id + 1][neigh];
      }
      // We can simply copy the edge Ids.
      std::copy_n(edge_list.begin() + pos,
-                  num_edges, edge_map_data + collected_nedges);
-      collected_nedges += num_edges;
-      indptr_out[row_idx+1] = indptr_out[row_idx] + num_edges;
+                  nedges, edge_map_data + collected_nedges);
+      collected_nedges += nedges;
+      indptr_out[row_idx+1] = indptr_out[row_idx] + nedges;
      row_idx++;
    }
    layer_off_data[out_layer_idx + 1] = layer_off_data[out_layer_idx]
        + layer_offsets[layer_id + 1] - layer_offsets[layer_id];
    out_layer_idx++;
  }
-  CHECK(row_idx == num_vertices);
-  CHECK(indptr_out[row_idx] == num_edges);
-  CHECK(out_layer_idx == num_hops);
-  CHECK(layer_off_data[out_layer_idx] == num_vertices);
+  CHECK_EQ(row_idx, num_vertices);
+  CHECK_EQ(indptr_out[row_idx], num_edges);
+  CHECK_EQ(out_layer_idx, num_hops);
+  CHECK_EQ(layer_off_data[out_layer_idx], num_vertices);

  // Copy flow offsets.
  flow_off_data[0] = 0;
  int out_flow_idx = 0;
  for (size_t i = 0; i < layer_offsets.size() - 2; i++) {
-    size_t num_edges = subg_csr->GetDegree(layer_off_data[i + 1], layer_off_data[i + 2]);
+    size_t num_edges = indptr_out[layer_off_data[i + 2]] - indptr_out[layer_off_data[i + 1]];
    flow_off_data[out_flow_idx + 1] = flow_off_data[out_flow_idx] + num_edges;
    out_flow_idx++;
  }
  CHECK(out_flow_idx == num_hops - 1);
  CHECK(flow_off_data[num_hops - 1] == static_cast<uint64_t>(num_edges));

-  for (size_t i = 0; i < subg_csr->edge_ids.size(); i++) {
-    subg_csr->edge_ids[i] = i;
-  }
+  std::iota(eid_out, eid_out + num_edges, 0);

-  if (edge_type == "in") {
-    nf.graph = GraphPtr(new ImmutableGraph(subg_csr, nullptr, is_multigraph));
+  if (edge_type == std::string("in")) {
+    nf.graph = GraphPtr(new ImmutableGraph(subg_csr, nullptr));
  } else {
-    nf.graph = GraphPtr(new ImmutableGraph(nullptr, subg_csr, is_multigraph));
+    nf.graph = GraphPtr(new ImmutableGraph(nullptr, subg_csr));
  }

  return nf;
@@ -382,9 +374,9 @@ NodeFlow SampleSubgraph(const ImmutableGraph *graph,
  unsigned int time_seed = randseed();
  const size_t num_seeds = seeds.size();
  auto orig_csr = edge_type == "in" ? graph->GetInCSR() : graph->GetOutCSR();
-  const dgl_id_t* val_list = orig_csr->edge_ids.data();
-  const dgl_id_t* col_list = orig_csr->indices.data();
-  const int64_t* indptr = orig_csr->indptr.data();
+  const dgl_id_t* val_list = static_cast<dgl_id_t*>(orig_csr->edge_ids()->data);
+  const dgl_id_t* col_list = static_cast<dgl_id_t*>(orig_csr->indices()->data);
+  const dgl_id_t* indptr = static_cast<dgl_id_t*>(orig_csr->indptr()->data);

  std::unordered_set<dgl_id_t> sub_ver_map;  // The vertex Ids in a layer.
  std::vector<std::pair<dgl_id_t, int> > sub_vers;
@@ -535,7 +527,7 @@ NodeFlow SamplerOp::NeighborUniformSample(const ImmutableGraph *graph,
 }

 namespace {
-  void ConstructLayers(const int64_t *indptr,
+  void ConstructLayers(const dgl_id_t *indptr,
                       const dgl_id_t *indices,
                       const std::vector<dgl_id_t>& seed_array,
                       IdArray layer_sizes,
@@ -596,14 +588,14 @@ namespace {
    }
  }

-  void ConstructFlows(const int64_t *indptr,
+  void ConstructFlows(const dgl_id_t *indptr,
                      const dgl_id_t *indices,
                      const dgl_id_t *eids,
                      const std::vector<dgl_id_t> &node_mapping,
                      const std::vector<int64_t> &actl_layer_sizes,
-                      ImmutableGraph::CSR::vector<int64_t> *sub_indptr,
-                      ImmutableGraph::CSR::vector<dgl_id_t> *sub_indices,
-                      ImmutableGraph::CSR::vector<dgl_id_t> *sub_eids,
+                      std::vector<dgl_id_t> *sub_indptr,
+                      std::vector<dgl_id_t> *sub_indices,
+                      std::vector<dgl_id_t> *sub_eids,
                      std::vector<dgl_id_t> *flow_offsets,
                      std::vector<dgl_id_t> *edge_mapping) {
    /*
@@ -626,7 +618,7 @@ namespace {
        auto dst = node_mapping[first + src_size + j];
        typedef std::pair<dgl_id_t, dgl_id_t> id_pair;
        std::vector<id_pair> neighbor_indices;
-        for (int64_t k = indptr[dst]; k < indptr[dst + 1]; ++k) {
+        for (dgl_id_t k = indptr[dst]; k < indptr[dst + 1]; ++k) {
          // TODO(gaiyu): accelerate hash table lookup
          auto ret = source_map.find(indices[k]);
          if (ret != source_map.end()) {
@@ -654,9 +646,9 @@ NodeFlow SamplerOp::LayerUniformSample(const ImmutableGraph *graph,
                                       const std::string &neighbor_type,
                                       IdArray layer_sizes) {
  const auto g_csr = neighbor_type == "in" ? graph->GetInCSR() : graph->GetOutCSR();
-  const int64_t *indptr = g_csr->indptr.data();
-  const dgl_id_t *indices = g_csr->indices.data();
-  const dgl_id_t *eids = g_csr->edge_ids.data();
+  const dgl_id_t *indptr = static_cast<dgl_id_t*>(g_csr->indptr()->data);
+  const dgl_id_t *indices = static_cast<dgl_id_t*>(g_csr->indices()->data);
+  const dgl_id_t *eids = static_cast<dgl_id_t*>(g_csr->edge_ids()->data);

  std::vector<dgl_id_t> layer_offsets;
  std::vector<dgl_id_t> node_mapping;
@@ -671,13 +663,7 @@ NodeFlow SamplerOp::LayerUniformSample(const ImmutableGraph *graph,
                  &actl_layer_sizes,
                  &probabilities);

-  NodeFlow nf;
-
-  int64_t n_nodes = node_mapping.size();
-  // TODO(gaiyu): a better estimate for the expected number of nodes
-  auto sub_csr = std::make_shared<ImmutableGraph::CSR>(n_nodes, n_nodes);
-  sub_csr->indptr.clear();  // TODO(zhengda): Why indptr.resize(num_vertices + 1)?
-
+  std::vector<dgl_id_t> sub_indptr, sub_indices, sub_edge_ids;
  std::vector<dgl_id_t> flow_offsets;
  std::vector<dgl_id_t> edge_mapping;
  ConstructFlows(indptr,
@@ -685,35 +671,31 @@ NodeFlow SamplerOp::LayerUniformSample(const ImmutableGraph *graph,
                 eids,
                 node_mapping,
                 actl_layer_sizes,
-                 &(sub_csr->indptr),
-                 &(sub_csr->indices),
-                 &(sub_csr->edge_ids),
+                 &sub_indptr,
+                 &sub_indices,
+                 &sub_edge_ids,
                 &flow_offsets,
                 &edge_mapping);
+  // sanity check
+  CHECK_GT(sub_indptr.size(), 0);
+  CHECK_EQ(sub_indptr[0], 0);
+  CHECK_EQ(sub_indptr.back(), sub_indices.size());
+  CHECK_EQ(sub_indices.size(), sub_edge_ids.size());

-  if (neighbor_type == "in") {
-    nf.graph = GraphPtr(new ImmutableGraph(sub_csr, nullptr, graph->IsMultigraph()));
+  NodeFlow nf;
+  auto sub_csr = CSRPtr(new CSR(
+        VecToIdArray(sub_indptr), VecToIdArray(sub_indices), VecToIdArray(sub_edge_ids)));
+
+  if (neighbor_type == std::string("in")) {
+    nf.graph = GraphPtr(new ImmutableGraph(sub_csr, nullptr));
  } else {
-    nf.graph = GraphPtr(new ImmutableGraph(nullptr, sub_csr, graph->IsMultigraph()));
-  }
-
-  nf.node_mapping = IdArray::Empty({n_nodes},
-                                   DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
-  nf.edge_mapping = IdArray::Empty({static_cast<int64_t>(edge_mapping.size())},
-                                   DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
-  nf.layer_offsets = IdArray::Empty({static_cast<int64_t>(layer_offsets.size())},
-                                    DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
-  nf.flow_offsets = IdArray::Empty({static_cast<int64_t>(flow_offsets.size())},
-                                   DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
-
-  std::copy(node_mapping.begin(), node_mapping.end(),
-            static_cast<dgl_id_t*>(nf.node_mapping->data));
-  std::copy(edge_mapping.begin(), edge_mapping.end(),
-            static_cast<dgl_id_t*>(nf.edge_mapping->data));
-  std::copy(layer_offsets.begin(), layer_offsets.end(),
-            static_cast<dgl_id_t*>(nf.layer_offsets->data));
-  std::copy(flow_offsets.begin(), flow_offsets.end(),
-            static_cast<dgl_id_t*>(nf.flow_offsets->data));
+    nf.graph = GraphPtr(new ImmutableGraph(nullptr, sub_csr));
+  }
+
+  nf.node_mapping = VecToIdArray(node_mapping);
+  nf.edge_mapping = VecToIdArray(edge_mapping);
+  nf.layer_offsets = VecToIdArray(layer_offsets);
+  nf.flow_offsets = VecToIdArray(flow_offsets);

  return nf;
 }
@@ -722,7 +704,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_UniformSampling")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    // arguments
    const GraphHandle ghdl = args[0];
-    const IdArray seed_nodes = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray seed_nodes = args[1];
    const int64_t batch_start_id = args[2];
    const int64_t batch_size = args[3];
    const int64_t max_num_workers = args[4];
@@ -761,11 +743,11 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    // arguments
    const GraphHandle ghdl = args[0];
-    const IdArray seed_nodes = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray seed_nodes = args[1];
    const int64_t batch_start_id = args[2];
    const int64_t batch_size = args[3];
    const int64_t max_num_workers = args[4];
-    const IdArray layer_sizes = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[5]));
+    const IdArray layer_sizes = args[5];
    const std::string neigh_type = args[6];
    // process args
    const GraphInterface *ptr = static_cast<const GraphInterface *>(ghdl);
@@ -794,6 +776,4 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
    *rv = WrapVectorReturn(nflows);
  });

-
-
 }  // namespace dgl
--- a/src/graph/traversal.cc
+++ b/src/graph/traversal.cc
@@ -133,7 +133,7 @@ DGL_REGISTER_GLOBAL("traversal._CAPI_DGLBFSNodes")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    GraphHandle ghandle = args[0];
    const Graph* gptr = static_cast<Graph*>(ghandle);
-    const IdArray src = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray src = args[1];
    bool reversed = args[2];
    const auto& front = BFSNodesFrontiers(*gptr, src, reversed);
    IdArray node_ids = CopyVectorToNDArray(front.ids);
@@ -164,7 +164,7 @@ DGL_REGISTER_GLOBAL("traversal._CAPI_DGLBFSEdges")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    GraphHandle ghandle = args[0];
    const Graph* gptr = static_cast<Graph*>(ghandle);
-    const IdArray src = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray src = args[1];
    bool reversed = args[2];
    const auto& front = BFSEdgesFrontiers(*gptr, src, reversed);
    IdArray edge_ids = CopyVectorToNDArray(front.ids);
@@ -202,7 +202,7 @@ DGL_REGISTER_GLOBAL("traversal._CAPI_DGLDFSEdges")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    GraphHandle ghandle = args[0];
    const Graph* gptr = static_cast<Graph*>(ghandle);
-    const IdArray source = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray source = args[1];
    const bool reversed = args[2];
    CHECK(IsValidIdArray(source)) << "Invalid source node id array.";
    const int64_t len = source->shape[0];
@@ -221,7 +221,7 @@ DGL_REGISTER_GLOBAL("traversal._CAPI_DGLDFSLabeledEdges")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
    GraphHandle ghandle = args[0];
    const Graph* gptr = static_cast<Graph*>(ghandle);
-    const IdArray source = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    const IdArray source = args[1];
    const bool reversed = args[2];
    const bool has_reverse_edge = args[3];
    const bool has_nontree_edge = args[4];

--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -121,11 +121,23 @@ size_t NDArray::GetSize() const {
  return GetDataSize(data_->dl_tensor);
 }

+bool NDArray::IsContiguous() const {
+  CHECK(data_ != nullptr);
+  if (data_->dl_tensor.strides == nullptr)
+    return true;
+  for (int i = 0; i < data_->dl_tensor.ndim - 1; ++i) {
+    if (data_->dl_tensor.strides[i] !=
+        data_->dl_tensor.shape[i+1] * data_->dl_tensor.strides[i+1])
+      return false;
+  }
+  return data_->dl_tensor.strides[data_->dl_tensor.ndim - 1] == 1;
+}
+
 NDArray NDArray::CreateView(std::vector<int64_t> shape,
-                            DLDataType dtype) {
+                            DLDataType dtype,
+                            int64_t offset) {
  CHECK(data_ != nullptr);
-  CHECK(data_->dl_tensor.strides == nullptr)
-      << "Can only create view for compact tensor";
+  CHECK(IsContiguous()) << "Can only create view for compact tensor";
  NDArray ret = Internal::Create(shape, dtype, data_->dl_tensor.ctx);
  ret.data_->dl_tensor.byte_offset =
      this->data_->dl_tensor.byte_offset;
@@ -136,7 +148,8 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape,
  // increase ref count
  this->data_->IncRef();
  ret.data_->manager_ctx = this->data_;
-  ret.data_->dl_tensor.data = this->data_->dl_tensor.data;
+  ret.data_->dl_tensor.data =
+    static_cast<char*>(this->data_->dl_tensor.data) + offset;
  return ret;
 }


--- a/src/scheduler/scheduler_apis.cc
+++ b/src/scheduler/scheduler_apis.cc
@@ -15,17 +15,17 @@ namespace dgl {

 DGL_REGISTER_GLOBAL("runtime.degree_bucketing._CAPI_DGLDegreeBucketing")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
-    const IdArray msg_ids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[0]));
-    const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
-    const IdArray nids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
+    const IdArray msg_ids = args[0];
+    const IdArray vids = args[1];
+    const IdArray nids = args[2];
    *rv = ConvertNDArrayVectorToPackedFunc(sched::DegreeBucketing(msg_ids, vids, nids));
  });

 DGL_REGISTER_GLOBAL("runtime.degree_bucketing._CAPI_DGLGroupEdgeByNodeDegree")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
-    const IdArray uids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[0]));
-    const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
-    const IdArray eids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
+    const IdArray uids = args[0];
+    const IdArray vids = args[1];
+    const IdArray eids = args[2];
    *rv = ConvertNDArrayVectorToPackedFunc(
            sched::GroupEdgeByNodeDegree(uids, vids, eids));
  });

--- a/tests/compute/test_batched_graph.py
+++ b/tests/compute/test_batched_graph.py
@@ -207,9 +207,9 @@ def test_batch_no_edge():
 if __name__ == '__main__':
    test_batch_unbatch()
    test_batch_unbatch1()
-    test_batch_unbatch2()
-    test_batched_edge_ordering()
-    test_batch_send_then_recv()
-    test_batch_send_and_recv()
-    test_batch_propagate()
-    test_batch_no_edge()
+    #test_batch_unbatch2()
+    #test_batched_edge_ordering()
+    #test_batch_send_then_recv()
+    #test_batch_send_and_recv()
+    #test_batch_propagate()
+    #test_batch_no_edge()
--- a/tests/compute/test_graph.py
+++ b/tests/compute/test_graph.py
@@ -7,7 +7,214 @@ import dgl
 import backend as F
 from dgl import DGLError

-def test_graph_creation():
+# Graph generation: a random graph with 10 nodes
+#  and 20 edges.
+#  - has self-loops (4->4 and 6->6)
+#  - has no multi-edges
+def edge_pair_input(sort=False):
+    if sort:
+        src = [0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 7, 7, 7, 9]
+        dst = [4, 6, 9, 3, 5, 3, 7, 5, 8, 1, 3, 4, 9, 1, 9, 6, 2, 8, 9, 2]
+        return src, dst
+    else:
+        src = [0, 0, 4, 5, 0, 4, 7, 4, 4, 3, 2, 7, 7, 5, 3, 2, 1, 9, 6, 1]
+        dst = [9, 6, 3, 9, 4, 4, 9, 9, 1, 8, 3, 2, 8, 1, 5, 7, 3, 2, 6, 5]
+        return src, dst
+
+def nx_input():
+    g = nx.DiGraph()
+    src, dst = edge_pair_input()
+    for i, e in enumerate(zip(src, dst)):
+        g.add_edge(*e, id=i)
+    return g
+
+def elist_input():
+    src, dst = edge_pair_input()
+    return list(zip(src, dst))
+
+def scipy_coo_input():
+    src, dst = edge_pair_input()
+    return sp.coo_matrix((np.ones((20,)), (src, dst)), shape=(10,10))
+
+def scipy_csr_input():
+    src, dst = edge_pair_input()
+    csr = sp.coo_matrix((np.ones((20,)), (src, dst)), shape=(10,10)).tocsr()
+    csr.sort_indices()
+    # src = [0 0 0 1 1 2 2 3 3 4 4 4 4 5 5 6 7 7 7 9]
+    # dst = [4 6 9 3 5 3 7 5 8 1 3 4 9 1 9 6 2 8 9 2]
+    return csr
+
+def gen_by_mutation():
+    g = dgl.DGLGraph()
+    src, dst = edge_pair_input()
+    g.add_nodes(10)
+    g.add_edges(src, dst)
+    return g
+
+def gen_from_data(data, readonly):
+    g = dgl.DGLGraph(data, readonly=readonly)
+    return g
+
+def test_query():
+    def _test_one(g):
+        assert g.number_of_nodes() == 10
+        assert g.number_of_edges() == 20
+        assert len(g) == 10
+        assert not g.is_multigraph
+
+        for i in range(10):
+            assert g.has_node(i)
+            assert i in g
+        assert not g.has_node(11)
+        assert not g.has_node(-1)
+        assert not -1 in g
+        assert F.allclose(g.has_nodes([-1,0,2,10,11]), F.tensor([0,1,1,0,0]))
+
+        src, dst = edge_pair_input()
+        for u, v in zip(src, dst):
+            assert g.has_edge_between(u, v)
+        assert not g.has_edge_between(0, 0)
+        assert F.allclose(g.has_edges_between([0, 0, 3], [0, 9, 8]), F.tensor([0,1,1]))
+        assert set(F.asnumpy(g.predecessors(9))) == set([0,5,7,4])
+        assert set(F.asnumpy(g.successors(2))) == set([7,3])
+
+        assert g.edge_id(4,4) == 5
+        assert F.allclose(g.edge_ids([4,0], [4,9]), F.tensor([5,0]))
+
+        src, dst = g.find_edges([3, 6, 5])
+        assert F.allclose(src, F.tensor([5, 7, 4]))
+        assert F.allclose(dst, F.tensor([9, 9, 4]))
+
+        src, dst, eid = g.in_edges(9, form='all')
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set([(0,9,0),(5,9,3),(7,9,6),(4,9,7)])
+        src, dst, eid = g.in_edges([9,0,8], form='all')  # test node#0 has no in edges
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set([(0,9,0),(5,9,3),(7,9,6),(4,9,7),(3,8,9),(7,8,12)])
+
+        src, dst, eid = g.out_edges(0, form='all')
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set([(0,9,0),(0,6,1),(0,4,4)])
+        src, dst, eid = g.out_edges([0,4,8], form='all')  # test node#8 has no out edges
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set([(0,9,0),(0,6,1),(0,4,4),(4,3,2),(4,4,5),(4,9,7),(4,1,8)])
+
+        src, dst, eid = g.edges('all', 'eid')
+        t_src, t_dst = edge_pair_input()
+        t_tup = list(zip(t_src, t_dst, list(range(20))))
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set(t_tup)
+        assert list(F.asnumpy(eid)) == list(range(20))
+
+        src, dst, eid = g.edges('all', 'srcdst')
+        t_src, t_dst = edge_pair_input()
+        t_tup = list(zip(t_src, t_dst, list(range(20))))
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set(t_tup)
+        assert list(F.asnumpy(src)) == sorted(list(F.asnumpy(src)))
+
+        assert g.in_degree(0) == 0
+        assert g.in_degree(9) == 4
+        assert F.allclose(g.in_degrees([0, 9]), F.tensor([0, 4]))
+        assert g.out_degree(8) == 0
+        assert g.out_degree(9) == 1
+        assert F.allclose(g.out_degrees([8, 9]), F.tensor([0, 1]))
+
+        assert np.array_equal(F.sparse_to_numpy(g.adjacency_matrix()), scipy_coo_input().toarray().T)
+        assert np.array_equal(F.sparse_to_numpy(g.adjacency_matrix(transpose=True)), scipy_coo_input().toarray())
+
+    def _test(g):
+        # test twice to see whether the cached format works or not
+        _test_one(g)
+        _test_one(g)
+
+    def _test_csr_one(g):
+        assert g.number_of_nodes() == 10
+        assert g.number_of_edges() == 20
+        assert len(g) == 10
+        assert not g.is_multigraph
+
+        for i in range(10):
+            assert g.has_node(i)
+            assert i in g
+        assert not g.has_node(11)
+        assert not g.has_node(-1)
+        assert not -1 in g
+        assert F.allclose(g.has_nodes([-1,0,2,10,11]), F.tensor([0,1,1,0,0]))
+
+        src, dst = edge_pair_input(sort=True)
+        for u, v in zip(src, dst):
+            assert g.has_edge_between(u, v)
+        assert not g.has_edge_between(0, 0)
+        assert F.allclose(g.has_edges_between([0, 0, 3], [0, 9, 8]), F.tensor([0,1,1]))
+        assert set(F.asnumpy(g.predecessors(9))) == set([0,5,7,4])
+        assert set(F.asnumpy(g.successors(2))) == set([7,3])
+
+        # src = [0 0 0 1 1 2 2 3 3 4 4 4 4 5 5 6 7 7 7 9]
+        # dst = [4 6 9 3 5 3 7 5 8 1 3 4 9 1 9 6 2 8 9 2]
+        # eid = [0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]  (last digit only; ids are 0..19)
+        assert g.edge_id(4,4) == 11
+        assert F.allclose(g.edge_ids([4,0], [4,9]), F.tensor([11,2]))
+
+        src, dst = g.find_edges([3, 6, 5])
+        assert F.allclose(src, F.tensor([1, 2, 2]))
+        assert F.allclose(dst, F.tensor([3, 7, 3]))
+
+        src, dst, eid = g.in_edges(9, form='all')
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set([(0,9,2),(5,9,14),(7,9,18),(4,9,12)])
+        src, dst, eid = g.in_edges([9,0,8], form='all')  # test node#0 has no in edges
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set([(0,9,2),(5,9,14),(7,9,18),(4,9,12),(3,8,8),(7,8,17)])
+
+        src, dst, eid = g.out_edges(0, form='all')
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set([(0,9,2),(0,6,1),(0,4,0)])
+        src, dst, eid = g.out_edges([0,4,8], form='all')  # test node#8 has no out edges
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set([(0,9,2),(0,6,1),(0,4,0),(4,3,10),(4,4,11),(4,9,12),(4,1,9)])
+
+        src, dst, eid = g.edges('all', 'eid')
+        t_src, t_dst = edge_pair_input(sort=True)
+        t_tup = list(zip(t_src, t_dst, list(range(20))))
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set(t_tup)
+        assert list(F.asnumpy(eid)) == list(range(20))
+
+        src, dst, eid = g.edges('all', 'srcdst')
+        t_src, t_dst = edge_pair_input(sort=True)
+        t_tup = list(zip(t_src, t_dst, list(range(20))))
+        tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid)))
+        assert set(tup) == set(t_tup)
+        assert list(F.asnumpy(src)) == sorted(list(F.asnumpy(src)))
+
+        assert g.in_degree(0) == 0
+        assert g.in_degree(9) == 4
+        assert F.allclose(g.in_degrees([0, 9]), F.tensor([0, 4]))
+        assert g.out_degree(8) == 0
+        assert g.out_degree(9) == 1
+        assert F.allclose(g.out_degrees([8, 9]), F.tensor([0, 1]))
+
+        assert np.array_equal(F.sparse_to_numpy(g.adjacency_matrix()), scipy_coo_input().toarray().T)
+        assert np.array_equal(F.sparse_to_numpy(g.adjacency_matrix(transpose=True)), scipy_coo_input().toarray())
+
+    def _test_csr(g):
+        # test twice to see whether the cached format works or not
+        _test_csr_one(g)
+        _test_csr_one(g)
+
+    _test(gen_by_mutation())
+    _test(gen_from_data(elist_input(), False))
+    _test(gen_from_data(elist_input(), True))
+    _test(gen_from_data(nx_input(), False))
+    _test(gen_from_data(nx_input(), True))
+    _test(gen_from_data(scipy_coo_input(), False))
+    _test(gen_from_data(scipy_coo_input(), True))
+
+    _test_csr(gen_from_data(scipy_csr_input(), False))
+    _test_csr(gen_from_data(scipy_csr_input(), True))
+
+def test_mutation():
    g = dgl.DGLGraph()
    # test add nodes with data
    g.add_nodes(5)
@@ -31,17 +238,6 @@ def test_graph_creation():
    g.init_edata('h2', (g.number_of_edges(), 3), 'float32')
    assert F.allclose(F.zeros((g.number_of_edges(), 3)), g.edata['h2'])

-def test_create_from_elist():
-    elist = [(2, 1), (1, 0), (2, 0), (3, 0), (0, 2)]
-    g = dgl.DGLGraph(elist)
-    for i, (u, v) in enumerate(elist):
-        assert g.edge_id(u, v) == i
-    # immutable graph
-    # XXX: not enabled for pytorch
-    #g = dgl.DGLGraph(elist, readonly=True)
-    #for i, (u, v) in enumerate(elist):
-    #    assert g.edge_id(u, v) == i
-
 def test_scipy_adjmat():
    g = dgl.DGLGraph()
    g.add_nodes(10)
@@ -66,34 +262,6 @@ def test_scipy_adjmat():
    assert np.array_equal(adj_t2.toarray(), adj_t3.toarray())
    assert np.array_equal(adj_t0.toarray(), adj_t2.toarray())

-def test_adjmat_cache():
-    n = 1000
-    p = 10 * math.log(n) / n
-    a = sp.random(n, n, p, data_rvs=lambda n: np.ones(n))
-    g = dgl.DGLGraph(a)
-    # the first call should contruct the adj
-    t0 = time.time()
-    adj1 = g.adjacency_matrix()
-    dur1 = time.time() - t0
-    # the second call should be cached and should be very fast
-    t0 = time.time()
-    adj2 = g.adjacency_matrix()
-    dur2 = time.time() - t0
-    print('first time {}, second time {}'.format(dur1, dur2))
-    assert dur2 < dur1
-    assert id(adj1) == id(adj2)
-    # different arg should result in different cache
-    adj3 = g.adjacency_matrix(transpose=True)
-    assert id(adj3) != id(adj2)
-    # manually clear the cache
-    g.clear_cache()
-    adj35 = g.adjacency_matrix()
-    assert id(adj35) != id(adj2)
-    # mutating the graph should invalidate the cache
-    g.add_nodes(10)
-    adj4 = g.adjacency_matrix()
-    assert id(adj4) != id(adj35)
-
 def test_incmat():
    g = dgl.DGLGraph()
    g.add_nodes(4)
@@ -127,34 +295,6 @@ def test_incmat():
                      [0., 1., 0., -1., 0.],
                      [0., 0., 1., 1., 0.]]))

-def test_incmat_cache():
-    n = 1000
-    p = 10 * math.log(n) / n
-    a = sp.random(n, n, p, data_rvs=lambda n: np.ones(n))
-    g = dgl.DGLGraph(a)
-    # the first call should contruct the inc
-    t0 = time.time()
-    inc1 = g.incidence_matrix("in")
-    dur1 = time.time() - t0
-    # the second call should be cached and should be very fast
-    t0 = time.time()
-    inc2 = g.incidence_matrix("in")
-    dur2 = time.time() - t0
-    print('first time {}, second time {}'.format(dur1, dur2))
-    assert dur2 < dur1
-    assert id(inc1) == id(inc2)
-    # different arg should result in different cache
-    inc3 = g.incidence_matrix("both")
-    assert id(inc3) != id(inc2)
-    # manually clear the cache
-    g.clear_cache()
-    inc35 = g.incidence_matrix("in")
-    assert id(inc35) != id(inc2)
-    # mutating the graph should invalidate the cache
-    g.add_nodes(10)
-    inc4 = g.incidence_matrix("in")
-    assert id(inc4) != id(inc35)
-
 def test_readonly():
    g = dgl.DGLGraph()
    g.add_nodes(5)
@@ -242,11 +382,9 @@ def test_find_edges():
        assert fail

 if __name__ == '__main__':
-    test_graph_creation()
-    test_create_from_elist()
-    test_adjmat_cache()
+    test_query()
+    test_mutation()
    test_scipy_adjmat()
    test_incmat()
-    test_incmat_cache()
    test_readonly()
    test_find_edges()
--- a/tests/scripts/build_dgl.sh
+++ b/tests/scripts/build_dgl.sh
 #!/bin/bash
+set -e

 if [ -d build ]; then
 	rm -rf build