"src/vscode:/vscode.git/clone" did not exist on "fb9dcc51ff14e7cec8fc453cf7bfe44ee9349858"
Unverified commit 870da747, authored by Minjie Wang and committed by GitHub

[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)

* add cub; array cumsum

* CSRSliceRows

* fix warning

* operator << for ndarray; CSRSliceRows

* add CSRIsSorted

* add csr_sort

* inplace coosort and outplace csrsort

* WIP: coo is sorted

* mv cuda_utils

* add AllTrue utility

* csr sort

* coo sort

* coo2csr for sorted coo arrays

* CSRToCOO from sorted

* pass tests for the new kernel changes

* cannot use inplace sort

* lint

* try fix msvc error

* Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC

* stash

* revert some hack

* revert some changes

* address comments

* fix

* fix to_block unittest

* add todo note
parent da8632ca
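The diff below standardizes what a "sorted" COO/CSR matrix means across the CPU and CUDA backends: a COO matrix is row-sorted when its row indices are non-decreasing, and additionally column-sorted when column indices are non-decreasing within each run of equal rows. As a rough, self-contained illustration of that contract (plain C++ with illustrative names, not DGL's actual structs or API):

#include <cstdint>
#include <utility>
#include <vector>

// Returns {row_sorted, col_sorted} for a COO matrix given as parallel arrays.
std::pair<bool, bool> CheckCooSorted(const std::vector<int64_t>& row,
                                     const std::vector<int64_t>& col) {
  bool row_sorted = true, col_sorted = true;
  for (size_t i = 1; i < row.size() && row_sorted; ++i) {
    row_sorted = row[i - 1] <= row[i];
    col_sorted = col_sorted && (row[i - 1] < row[i] || col[i - 1] <= col[i]);
  }
  // Column order is only meaningful when the rows themselves are sorted.
  return {row_sorted, row_sorted && col_sorted};
}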
@@ -10,37 +10,181 @@
#include <numeric>
#include <algorithm>
#include <vector>
#include <iterator>
#include <tuple>
namespace {
template <typename IdType>
struct TupleRef {
TupleRef() = delete;
TupleRef(const TupleRef& other) = default;
TupleRef(TupleRef&& other) = default;
TupleRef(IdType *const r, IdType *const c, IdType *const d)
: row(r), col(c), data(d) {}
TupleRef& operator=(const TupleRef& other) {
*row = *other.row;
*col = *other.col;
*data = *other.data;
return *this;
}
TupleRef& operator=(const std::tuple<IdType, IdType, IdType>& val) {
*row = std::get<0>(val);
*col = std::get<1>(val);
*data = std::get<2>(val);
return *this;
}
operator std::tuple<IdType, IdType, IdType>() const {
return std::make_tuple(*row, *col, *data);
}
void Swap(const TupleRef& other) const {
std::swap(*row, *other.row);
std::swap(*col, *other.col);
std::swap(*data, *other.data);
}
IdType *row, *col, *data;
};
using std::swap;
template <typename IdType>
void swap(const TupleRef<IdType>& r1, const TupleRef<IdType>& r2) {
r1.Swap(r2);
}
template <typename IdType>
struct CooIterator : public std::iterator<std::random_access_iterator_tag,
std::tuple<IdType, IdType, IdType>,
std::ptrdiff_t,
std::tuple<IdType*, IdType*, IdType*>,
TupleRef<IdType>> {
CooIterator() = default;
CooIterator(const CooIterator& other) = default;
CooIterator(CooIterator&& other) = default;
CooIterator(IdType *r, IdType *c, IdType *d): row(r), col(c), data(d) {}
CooIterator& operator=(const CooIterator& other) = default;
CooIterator& operator=(CooIterator&& other) = default;
~CooIterator() = default;
bool operator==(const CooIterator& other) const {
return row == other.row;
}
bool operator!=(const CooIterator& other) const {
return row != other.row;
}
bool operator<(const CooIterator& other) const {
return row < other.row;
}
bool operator>(const CooIterator& other) const {
return row > other.row;
}
bool operator<=(const CooIterator& other) const {
return row <= other.row;
}
bool operator>=(const CooIterator& other) const {
return row >= other.row;
}
CooIterator& operator+=(const std::ptrdiff_t& movement) {
row += movement;
col += movement;
data += movement;
return *this;
}
CooIterator& operator-=(const std::ptrdiff_t& movement) {
row -= movement;
col -= movement;
data -= movement;
return *this;
}
CooIterator& operator++() {
return operator+=(1);
}
CooIterator& operator--() {
return operator-=(1);
}
CooIterator operator++(int) {
CooIterator ret(*this);
operator++();
return ret;
}
CooIterator operator--(int) {
CooIterator ret(*this);
operator--();
return ret;
}
CooIterator operator+(const std::ptrdiff_t& movement) const {
CooIterator ret(*this);
ret += movement;
return ret;
}
CooIterator operator-(const std::ptrdiff_t& movement) const {
CooIterator ret(*this);
ret -= movement;
return ret;
}
std::ptrdiff_t operator-(const CooIterator& other) const {
return row - other.row;
}
TupleRef<IdType> operator*() const {
return TupleRef<IdType>(row, col, data);
}
TupleRef<IdType> operator*() {
return TupleRef<IdType>(row, col, data);
}
IdType *row, *col, *data;
};
} // namespace
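For context, the anonymous-namespace helpers above implement a "zip iterator" over the three parallel COO arrays: dereferencing yields a TupleRef proxy whose assignment and swap write through to row, col and data at once, which is what lets std::sort reorder all three arrays in place. A rough usage sketch with illustrative values:

int64_t row[]  = {2, 0, 1};
int64_t col[]  = {5, 7, 6};
int64_t data[] = {0, 1, 2};
// Sort the three parallel arrays together by row index.
std::sort(CooIterator<int64_t>(row, col, data),
          CooIterator<int64_t>(row, col, data) + 3,
          [](const std::tuple<int64_t, int64_t, int64_t>& a,
             const std::tuple<int64_t, int64_t, int64_t>& b) {
            return std::get<0>(a) < std::get<0>(b);
          });
// row is now {0, 1, 2}, col is {7, 6, 5}, data is {1, 2, 0}.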
namespace dgl {
namespace aten {
namespace impl {
///////////////////////////// COOSort_ /////////////////////////////
template <DLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* coo, bool sort_column) {
const int64_t nnz = coo->row->shape[0];
IdType* coo_row = coo->row.Ptr<IdType>();
IdType* coo_col = coo->col.Ptr<IdType>();
if (!COOHasData(*coo))
coo->data = aten::Range(0, nnz, coo->row->dtype.bits, coo->row->ctx);
IdType* coo_data = coo->data.Ptr<IdType>();
typedef std::tuple<IdType, IdType, IdType> Tuple;
// Arg sort
if (sort_column) {
#ifdef PARALLEL_ALGORITHMS
__gnu_parallel::sort(
#else
std::sort(
#endif
CooIterator<IdType>(coo_row, coo_col, coo_data),
CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
[](const Tuple& a, const Tuple& b) {
return (std::get<0>(a) != std::get<0>(b)) ?
(std::get<0>(a) < std::get<0>(b)) : (std::get<1>(a) < std::get<1>(b));
});
} else {
#ifdef PARALLEL_ALGORITHMS
@@ -48,39 +192,41 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
#else
std::sort(
#endif
CooIterator<IdType>(coo_row, coo_col, coo_data),
CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
[](const Tuple& a, const Tuple& b) {
return std::get<0>(a) < std::get<0>(b);
});
}
coo->row_sorted = true;
coo->col_sorted = sort_column;
}
template void COOSort_<kDLCPU, int32_t>(COOMatrix*, bool);
template void COOSort_<kDLCPU, int64_t>(COOMatrix*, bool);
///////////////////////////// COOIsSorted /////////////////////////////
template <DLDeviceType XPU, typename IdType>
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
const int64_t nnz = coo.row->shape[0];
IdType* row = coo.row.Ptr<IdType>();
IdType* col = coo.col.Ptr<IdType>();
bool row_sorted = true;
bool col_sorted = true;
for (int64_t i = 1; row_sorted && i < nnz; ++i) {
row_sorted = (row[i - 1] <= row[i]);
col_sorted = col_sorted && (row[i - 1] < row[i] || col[i - 1] <= col[i]);
}
if (!row_sorted)
col_sorted = false;
return {row_sorted, col_sorted};
}
template std::pair<bool, bool> COOIsSorted<kDLCPU, int32_t>(COOMatrix coo);
template std::pair<bool, bool> COOIsSorted<kDLCPU, int64_t>(COOMatrix coo);
} // namespace impl
} // namespace aten
......
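A small worked example of the COOIsSorted scan above, with illustrative values:

// row = {0, 0, 1, 1}
// col = {3, 1, 0, 2}
// Rows never decrease                       -> row_sorted == true
// Row 0 has columns 3 then 1, so the test
// (row[i-1] < row[i] || col[i-1] <= col[i]) fails at i == 1
//                                            -> col_sorted == false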
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/csr_sort.cc
* \brief CSR sorting
*/
#include <dgl/array.h>
#include <numeric>
#include <algorithm>
#include <vector>
namespace dgl {
namespace aten {
namespace impl {
///////////////////////////// CSRIsSorted /////////////////////////////
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr) {
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
bool ret = true;
#pragma omp parallel for shared(ret)
for (int64_t row = 0; row < csr.num_rows; ++row) {
if (!ret)
continue;
for (IdType i = indptr[row] + 1; i < indptr[row + 1]; ++i) {
if (indices[i - 1] > indices[i]) {
ret = false;
break;
}
}
}
return ret;
}
template bool CSRIsSorted<kDLCPU, int64_t>(CSRMatrix csr);
template bool CSRIsSorted<kDLCPU, int32_t>(CSRMatrix csr);
///////////////////////////// CSRSort /////////////////////////////
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
typedef std::pair<IdType, IdType> ShufflePair;
const int64_t num_rows = csr->num_rows;
const int64_t nnz = csr->indices->shape[0];
const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
IdType* indices_data = static_cast<IdType*>(csr->indices->data);
if (!CSRHasData(*csr)) {
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
}
IdType* eid_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
{
std::vector<ShufflePair> reorder_vec;
#pragma omp for
for (int64_t row = 0; row < num_rows; row++) {
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
IdType *col = indices_data + indptr_data[row];
IdType *eid = eid_data + indptr_data[row];
reorder_vec.resize(num_cols);
for (int64_t i = 0; i < num_cols; i++) {
reorder_vec[i].first = col[i];
reorder_vec[i].second = eid[i];
}
std::sort(reorder_vec.begin(), reorder_vec.end(),
[](const ShufflePair &e1, const ShufflePair &e2) {
return e1.first < e2.first;
});
for (int64_t i = 0; i < num_cols; i++) {
col[i] = reorder_vec[i].first;
eid[i] = reorder_vec[i].second;
}
}
}
csr->sorted = true;
}
template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
} // namespace impl
} // namespace aten
} // namespace dgl
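To make the per-row behavior of CSRSort_ above concrete, here is a minimal self-contained sketch in plain C++ (standard containers instead of DGL's CSRMatrix, illustrative values): each row's (column, edge-id) pairs are sorted by column, and the edge ids travel with their columns.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

int main() {
  std::vector<int64_t> indptr  = {0, 3, 5};
  std::vector<int64_t> indices = {2, 0, 1, 4, 3};   // columns, unsorted per row
  std::vector<int64_t> eid     = {0, 1, 2, 3, 4};   // edge ids follow the columns
  for (size_t r = 0; r + 1 < indptr.size(); ++r) {
    std::vector<std::pair<int64_t, int64_t>> buf;
    for (int64_t i = indptr[r]; i < indptr[r + 1]; ++i)
      buf.emplace_back(indices[i], eid[i]);
    std::sort(buf.begin(), buf.end());          // sort by column within the row
    for (int64_t i = indptr[r]; i < indptr[r + 1]; ++i) {
      indices[i] = buf[i - indptr[r]].first;
      eid[i] = buf[i - indptr[r]].second;
    }
  }
  // indices is now {0, 1, 2, 3, 4}; eid is {1, 2, 0, 4, 3}.
  return 0;
}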
@@ -377,7 +377,9 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
ret_row_data + indptr_data[i + 1],
i);
}
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
}
template COOMatrix CSRToCOO<kDLCPU, int32_t>(CSRMatrix csr);
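The new return value above passes true for the row-sorted flag because CSRToCOO expands indptr into row indices in ascending row order, so the produced row array is non-decreasing by construction; only the column-sorted flag depends on whether the input CSR was sorted. For example (illustrative values):

// indptr = {0, 2, 3}  ->  row = {0, 0, 1}   (always non-decreasing)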
@@ -543,49 +545,6 @@ template CSRMatrix CSRSliceMatrix<kDLCPU, int32_t>(
template CSRMatrix CSRSliceMatrix<kDLCPU, int64_t>(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols);
///////////////////////////// CSRSort /////////////////////////////
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
typedef std::pair<IdType, IdType> ShufflePair;
const int64_t num_rows = csr->num_rows;
const int64_t nnz = csr->indices->shape[0];
const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
IdType* indices_data = static_cast<IdType*>(csr->indices->data);
if (!CSRHasData(*csr)) {
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
}
IdType* eid_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
{
std::vector<ShufflePair> reorder_vec;
#pragma omp for
for (int64_t row = 0; row < num_rows; row++) {
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
IdType *col = indices_data + indptr_data[row];
IdType *eid = eid_data + indptr_data[row];
reorder_vec.resize(num_cols);
for (int64_t i = 0; i < num_cols; i++) {
reorder_vec[i].first = col[i];
reorder_vec[i].second = eid[i];
}
std::sort(reorder_vec.begin(), reorder_vec.end(),
[](const ShufflePair &e1, const ShufflePair &e2) {
return e1.first < e2.first;
});
for (int64_t i = 0; i < num_cols; i++) {
col[i] = reorder_vec[i].first;
eid[i] = reorder_vec[i].second;
}
}
}
csr->sorted = true;
}
template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
///////////////////////////// CSRReorder /////////////////////////////
template <DLDeviceType XPU, typename IdType>
......
@@ -3,10 +3,10 @@
 * \file array/cpu/spmat_op_impl.cc
 * \brief CPU implementation of COO sparse matrix operators
 */
#include <dgl/array.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <tuple>
#include "array_utils.h"
namespace dgl {
@@ -266,12 +266,43 @@ CSRMatrix COOToCSR(COOMatrix coo) {
const IdType* row_data = static_cast<IdType*>(coo.row->data);
const IdType* col_data = static_cast<IdType*>(coo.col->data);
const IdType* data = COOHasData(coo)? static_cast<IdType*>(coo.data->data) : nullptr;
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indices;
NDArray ret_data;
bool row_sorted = coo.row_sorted;
bool col_sorted = coo.col_sorted;
if (!row_sorted) {
// It is possible that the flag is simply not set (default value is false),
// so we still perform a linear scan to check the flag.
std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
}
if (row_sorted) {
// compute indptr
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
Bp[0] = 0;
int64_t j = 0;
for (int64_t i = 0; i < N; ++i) {
const int64_t k = j;
for (; j < NNZ && row_data[j] == i; ++j) {}
Bp[i + 1] = Bp[i] + j - k;
}
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
if (!COOHasData(coo))
coo.data = aten::Range(0, NNZ, coo.row->dtype.bits, coo.row->ctx);
// compute indices and data
ret_indices = coo.col;
ret_data = coo.data;
} else {
// compute indptr
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
std::fill(Bp, Bp + N, 0);
for (int64_t i = 0; i < NNZ; ++i) {
Bp[row_data[i]]++;
@@ -285,10 +316,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
}
Bp[N] = NNZ;
// compute indices and data
ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
IdType* Bi = static_cast<IdType*>(ret_indices->data);
@@ -311,7 +339,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
return CSRMatrix(coo.num_rows, coo.num_cols,
ret_indptr, ret_indices, ret_data,
col_sorted);
}
template CSRMatrix COOToCSR<kDLCPU, int32_t>(COOMatrix coo);
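The row-sorted fast path above only needs a single forward scan to build indptr and can reuse the column and data arrays untouched. A self-contained sketch of that scan in plain C++ (illustrative names, not DGL's API):

#include <cstdint>
#include <vector>

std::vector<int64_t> IndptrFromSortedRows(const std::vector<int64_t>& row,
                                          int64_t num_rows) {
  std::vector<int64_t> indptr(num_rows + 1, 0);
  int64_t j = 0;
  for (int64_t i = 0; i < num_rows; ++i) {
    const int64_t k = j;
    // Advance over the run of entries belonging to row i.
    while (j < static_cast<int64_t>(row.size()) && row[j] == i) ++j;
    indptr[i + 1] = indptr[i] + (j - k);
  }
  return indptr;
}
// E.g. row = {0, 0, 2} with num_rows = 3 gives indptr = {0, 2, 2, 3}.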
@@ -439,7 +467,6 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
// Input COO
const IdType* in_rows = static_cast<IdType*>(coo.row->data);
const IdType* in_cols = static_cast<IdType*>(coo.col->data);
const IdType* in_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
int64_t num_rows = coo.num_rows;
int64_t num_cols = coo.num_cols;
int64_t nnz = coo.row->shape[0];
......
/*!
* Copyright (c) 2020 by Contributors
 * \file array/cuda/array_cumsum.cu
* \brief Array cumsum GPU implementation
*/
#include <dgl/array.h>
#include <cub/cub.cuh>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
const int64_t len = array.NumElements();
if (len == 0)
return array;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(array->ctx);
const IdType* in_d = array.Ptr<IdType>();
IdArray ret;
IdType* out_d = nullptr;
if (prepend_zero) {
ret = aten::Full(0, len + 1, array->dtype.bits, array->ctx);
out_d = ret.Ptr<IdType>() + 1;
} else {
ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
out_d = ret.Ptr<IdType>();
}
// Allocate workspace
size_t workspace_size = 0;
cub::DeviceScan::InclusiveSum(nullptr, workspace_size, in_d, out_d, len, thr_entry->stream);
void* workspace = device->AllocWorkspace(array->ctx, workspace_size);
// Compute cumsum
cub::DeviceScan::InclusiveSum(workspace, workspace_size, in_d, out_d, len, thr_entry->stream);
device->FreeWorkspace(array->ctx, workspace);
return ret;
}
template IdArray CumSum<kDLGPU, int32_t>(IdArray, bool);
template IdArray CumSum<kDLGPU, int64_t>(IdArray, bool);
} // namespace impl
} // namespace aten
} // namespace dgl
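A rough usage note for CumSum above: with prepend_zero set, an inclusive scan over per-row nonzero counts directly yields a CSR indptr, which is how the CUDA CSRSliceRows later in this commit builds its output indptr from CSRGetRowNNZ. Illustrative values:

// counts = {2, 0, 3}
// CumSum(counts, /*prepend_zero=*/true)  ->  {0, 2, 2, 5}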
@@ -5,7 +5,7 @@
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
@@ -50,7 +50,7 @@ template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, int64_t index) {
auto device = runtime::DeviceAPI::Get(array->ctx);
DType ret = 0;
device->CopyDataFromTo(
@@ -60,12 +60,12 @@ DType IndexSelect(NDArray array, uint64_t index) {
return ret;
}
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, int64_t index);
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, int64_t index);
template float IndexSelect<kDLGPU, float>(NDArray array, int64_t index);
template double IndexSelect<kDLGPU, double>(NDArray array, int64_t index);
} // namespace impl
} // namespace aten
......
@@ -5,7 +5,7 @@
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "../arith.h"
namespace dgl {
......
@@ -17,63 +17,43 @@ template <DLDeviceType XPU, typename IdType>
CSRMatrix COOToCSR(COOMatrix coo) {
CHECK(sizeof(IdType) == 4) << "CUDA COOToCSR does not support int64.";
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
bool row_sorted = coo.row_sorted;
bool col_sorted = coo.col_sorted;
if (!row_sorted) {
// It is possible that the flag is simply not set (default value is false),
// so we still perform a linear scan to check the flag.
std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
}
if (!row_sorted) {
coo = COOSort(coo);
}
const int64_t nnz = coo.row->shape[0];
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
if (!COOHasData(coo))
coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
coo.row.Ptr<int32_t>(),
nnz,
coo.num_rows,
indptr_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, coo.col, coo.data, col_sorted);
}
template CSRMatrix COOToCSR<kDLGPU, int32_t>(COOMatrix coo);
......
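What the cusparseXcoo2csr call above computes, on a toy example (illustrative values), and why the rows must be sorted first:

// sorted row indices = {0, 0, 1, 3}, num_rows = 4
// resulting indptr   = {0, 2, 3, 3, 4}
// indptr[r] is the offset of the first entry whose row index is >= r,
// which is only well defined when the row indices are non-decreasing.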
@@ -5,6 +5,7 @@
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
@@ -13,11 +14,16 @@ using runtime::NDArray;
namespace aten {
namespace impl {
///////////////////////////// COOSort_ /////////////////////////////
template <DLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* coo, bool sort_column) {
// TODO(minjie): Current implementation is based on cusparse which only supports
// int32_t. To support int64_t, we could use the Radix sort algorithm provided
// by CUB.
CHECK(sizeof(IdType) == 4) << "CUDA COOSort does not support int64.";
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo->row->ctx);
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
@@ -25,15 +31,11 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
NDArray row = coo->row;
NDArray col = coo->col;
if (!aten::COOHasData(*coo))
coo->data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
NDArray data = coo->data;
int32_t* row_ptr = static_cast<int32_t*>(row->data);
int32_t* col_ptr = static_cast<int32_t*>(col->data);
int32_t* data_ptr = static_cast<int32_t*>(data->data);
@@ -42,7 +44,7 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
thr_entry->cusparse_handle,
coo->num_rows, coo->num_cols,
row->shape[0],
row_ptr,
col_ptr,
@@ -50,7 +52,7 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
CUSPARSE_CALL(cusparseXcoosortByRow(
thr_entry->cusparse_handle,
coo->num_rows, coo->num_cols,
row->shape[0],
row_ptr,
col_ptr,
@@ -61,18 +63,18 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
if (sort_column) {
// First create a row indptr array and then call csrsort
int32_t* indptr = static_cast<int32_t*>(
device->AllocWorkspace(row->ctx, (coo->num_rows + 1) * sizeof(IdType)));
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
row_ptr,
row->shape[0],
coo->num_rows,
indptr,
CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle,
coo->num_rows,
coo->num_cols,
row->shape[0],
indptr,
col_ptr,
@@ -82,8 +84,8 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseXcsrsort(
thr_entry->cusparse_handle,
coo->num_rows,
coo->num_cols,
row->shape[0],
descr,
indptr,
@@ -95,13 +97,61 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
device->FreeWorkspace(row->ctx, indptr);
}
coo->row_sorted = true;
coo->col_sorted = sort_column;
}
template void COOSort_<kDLGPU, int32_t>(COOMatrix* coo, bool sort_column);
template void COOSort_<kDLGPU, int64_t>(COOMatrix* coo, bool sort_column);
///////////////////////////// COOIsSorted /////////////////////////////
template <typename IdType>
__global__ void _COOIsSortedKernel(
const IdType* row, const IdType* col,
int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < nnz) {
if (tx == 0) {
row_sorted[0] = 1;
col_sorted[0] = 1;
} else {
row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
col_sorted[tx] = static_cast<int8_t>(
row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
}
tx += stride_x;
}
} }
template COOMatrix COOSort<kDLGPU, int32_t>(COOMatrix coo, bool sort_column); template <DLDeviceType XPU, typename IdType>
template COOMatrix COOSort<kDLGPU, int64_t>(COOMatrix coo, bool sort_column); std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
const int64_t nnz = coo.row->shape[0];
const auto& ctx = coo.row->ctx;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but should
// be fine.
int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
_COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);
const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
const bool col_sorted = row_sorted? cuda::AllTrue(col_flags, nnz, ctx) : false;
device->FreeWorkspace(ctx, row_flags);
device->FreeWorkspace(ctx, col_flags);
return {row_sorted, col_sorted};
}
template std::pair<bool, bool> COOIsSorted<kDLGPU, int32_t>(COOMatrix coo);
template std::pair<bool, bool> COOIsSorted<kDLGPU, int64_t>(COOMatrix coo);
} // namespace impl
} // namespace aten
......
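How the flag arrays written by _COOIsSortedKernel above combine with the AllTrue reduction, on a toy example (illustrative values):

// row = {0, 0, 1},  col = {4, 2, 0}
// row_flags = {1, 1, 1}   (rows never decrease)
// col_flags = {1, 0, 1}   (4 -> 2 inside row 0 breaks column order)
// AllTrue() min-reduces each flag array, so the returned pair is
// row_sorted = true, col_sorted = false.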
/*!
* Copyright (c) 2020 by Contributors
 * \file array/cuda/csr_sort.cu
 * \brief Sort CSR index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
* \brief Check whether each row is sorted.
*/
template <typename IdType>
__global__ void _SegmentIsSorted(
const IdType* indptr, const IdType* indices,
int64_t num_rows, int8_t* flags) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < num_rows) {
bool f = true;
for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) {
f = (indices[i - 1] <= indices[i]);
}
flags[tx] = static_cast<int8_t>(f);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory but should
// be fine.
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
device->FreeWorkspace(ctx, flags);
return ret;
}
template bool CSRIsSorted<kDLGPU, int32_t>(CSRMatrix csr);
template bool CSRIsSorted<kDLGPU, int64_t>(CSRMatrix csr);
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
CHECK(sizeof(IdType) == 4) << "CUDA CSRSort_ does not support int64.";
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
NDArray indptr = csr->indptr;
NDArray indices = csr->indices;
const auto& ctx = indptr->ctx;
const int64_t nnz = indices->shape[0];
if (!aten::CSRHasData(*csr))
csr->data = aten::Range(0, nnz, indices->dtype.bits, ctx);
NDArray data = csr->data;
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
&workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(cusparseXcsrsort(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
data.Ptr<int32_t>(),
workspace));
csr->sorted = true;
// free resources
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
device->FreeWorkspace(ctx, workspace);
}
template void CSRSort_<kDLGPU, int32_t>(CSRMatrix* csr);
template void CSRSort_<kDLGPU, int64_t>(CSRMatrix* csr);
} // namespace impl
} // namespace aten
} // namespace dgl
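The cuSPARSE usage above follows the usual two-phase pattern: query the temporary buffer size, allocate a workspace, run the sort, then free the workspace. Sketched below with condensed, illustrative variable names (handle, perm); this simply restates the calls already made in CSRSort_:

size_t bytes = 0;
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
    handle, num_rows, num_cols, nnz, indptr, indices, &bytes));
void* buf = device->AllocWorkspace(ctx, bytes);   // temporary storage only
CUSPARSE_CALL(cusparseXcsrsort(
    handle, num_rows, num_cols, nnz, descr, indptr, indices, perm, buf));
device->FreeWorkspace(ctx, buf);
// perm is csr->data here, so edge ids are permuted together with the columns.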
@@ -10,7 +10,7 @@
#include "macro.cuh"
#include "atomic.cuh"
#include "functor.cuh"
#include "./utils.h"
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
......
@@ -8,7 +8,7 @@
#include <unordered_set>
#include <numeric>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
@@ -17,8 +17,6 @@ using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief Search adjacency list linearly for each (row, col) pair and
 * write the matched position in the indices array to the output.
@@ -33,7 +31,7 @@ __global__ void _LinearSearchKernel(
int64_t row_stride, int64_t col_stride,
int64_t length, IdType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
int rpos = tx, cpos = tx;
while (tx < length) {
out[tx] = -1;
@@ -50,6 +48,8 @@ __global__ void _LinearSearchKernel(
}
}
///////////////////////////// CSRIsNonZero /////////////////////////////
template <DLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
@@ -169,6 +169,88 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
template NDArray CSRGetRowData<kDLGPU, int32_t>(CSRMatrix, int64_t);
template NDArray CSRGetRowData<kDLGPU, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRSliceRows /////////////////////////////
template <DLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
const int64_t num_rows = end - start;
const IdType st_pos = aten::IndexSelect<IdType>(csr.indptr, start);
const IdType ed_pos = aten::IndexSelect<IdType>(csr.indptr, end);
const IdType nnz = ed_pos - st_pos;
IdArray ret_indptr = aten::IndexSelect(csr.indptr, start, end + 1) - st_pos;
// indices and data can be view arrays
IdArray ret_indices = csr.indices.CreateView(
{nnz}, csr.indices->dtype, st_pos * sizeof(IdType));
IdArray ret_data;
if (CSRHasData(csr))
ret_data = csr.data.CreateView({nnz}, csr.data->dtype, st_pos * sizeof(IdType));
else
ret_data = aten::Range(st_pos, ed_pos,
csr.indptr->dtype.bits, csr.indptr->ctx);
return CSRMatrix(num_rows, csr.num_cols,
ret_indptr, ret_indices, ret_data,
csr.sorted);
}
template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
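A worked example for the contiguous-row slice above (illustrative values):

// indptr     = {0, 2, 5, 6},  slice rows [1, 3)
// st_pos     = indptr[1] = 2,  ed_pos = indptr[3] = 6,  nnz = 4
// ret_indptr = {2, 5, 6} - 2 = {0, 3, 4}
// ret_indices and ret_data are zero-copy views starting at element offset 2
// (or, when the CSR has no data array, a fresh Range(2, 6) so the edge ids
// still refer to positions in the original matrix).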
/*!
* \brief Copy data segment to output buffers
*
* For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1]
* to the out_data from out_indptr[i] ~ out_indptr[i+1]
*
* If the provided `data` array is nullptr, write the read index to the out_data.
*
*/
template <typename IdType, typename DType>
__global__ void _SegmentCopyKernel(
const IdType* indptr, const DType* data,
const IdType* row, int64_t row_stride, int64_t length,
const IdType* out_indptr, DType* out_data) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
int rpos = tx;
while (tx < length) {
const IdType r = row[rpos];
DType* out_buf = out_data + out_indptr[tx];
for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
*(out_buf++) = data? data[i] : i;
}
rpos += row_stride;
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const int64_t len = rows->shape[0];
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
// Copy indices.
IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
// Copy data.
IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
return CSRMatrix(len, csr.num_cols,
ret_indptr, ret_indices, ret_data,
csr.sorted);
}
template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix , NDArray);
template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix , NDArray);
} // namespace impl
} // namespace aten
......
@@ -140,6 +140,7 @@ void CusparseCsrmm2(
static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data),
B_data, n, &beta, trans_out, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
if (valptr)
device->FreeWorkspace(ctx, valptr);
// transpose the output matrix
......
@@ -9,8 +9,8 @@
#include <dgl/bcast.h>
#include "macro.cuh"
#include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/utils.cu
* \brief Utilities for CUDA kernels.
*/
#include "./utils.h"
#include <cub/cub.cuh>
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
namespace cuda {
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call CUB's reduction
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceReduce::Min(nullptr, workspace_size, flags, rst, length));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceReduce::Min(workspace, workspace_size, flags, rst, length));
int8_t cpu_rst = 0;
CUDA_CALL(cudaMemcpy(&cpu_rst, rst, 1, cudaMemcpyDeviceToHost));
device->FreeWorkspace(ctx, workspace);
device->FreeWorkspace(ctx, rst);
return cpu_rst == 1;
}
} // namespace cuda
} // namespace dgl
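Usage note for AllTrue above: the int8_t array is treated as an array of 0/1 flags and reduced with cub::DeviceReduce::Min, so a single zero makes the whole result false. Illustrative values:

// flags = {1, 1, 1}  ->  true
// flags = {1, 0, 1}  ->  false   (the minimum is 0)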
/*!
 * Copyright (c) 2020 by Contributors
 * \file array/cuda/utils.h
 * \brief Utilities for CUDA kernels.
 */
#ifndef DGL_ARRAY_CUDA_UTILS_H_
#define DGL_ARRAY_CUDA_UTILS_H_
#include <dmlc/logging.h>
#include <dlpack/dlpack.h>
namespace dgl {
namespace cuda {
@@ -68,7 +69,18 @@ __device__ __forceinline__ T _ldg(T* addr) {
#endif
}
/*!
* \brief Return true if the given bool flag array is all true.
* The input bool array is in int8_t type so it is aligned with byte address.
*
* \param flags The bool array.
* \param length The length.
* \param ctx Device context.
* \return True if all the flags are true.
*/
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx);
} // namespace cuda
} // namespace dgl
#endif  // DGL_ARRAY_CUDA_UTILS_H_
@@ -3,7 +3,6 @@
 * \file array/kernel.cc
 * \brief New kernels
 */
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/base_heterograph.h>
......
@@ -6,9 +6,9 @@
#ifndef DGL_ARRAY_KERNEL_DECL_H_
#define DGL_ARRAY_KERNEL_DECL_H_
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/base_heterograph.h>
#include <dgl/runtime/ndarray.h>
#include <string>
#include <vector>
......
@@ -14,6 +14,7 @@
#include <algorithm>
#include <vector>
#include <string>
#include <utility>
namespace dgl {
......
@@ -804,7 +804,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
}
}
int msg_count = 0;
for (size_t i = 0; i < remote_ids.size(); ++i) {
if (remote_ids[i].size() != 0) {
KVStoreMsg kv_msg;
kv_msg.msg_type = MessageType::kPullMsg;
@@ -827,9 +827,10 @@
}
}
char *return_data = new char[ID_size*row_size];
const int64_t local_ids_size = local_ids.size();
// Copy local data
#pragma omp parallel for
for (int64_t i = 0; i < local_ids_size; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
@@ -843,7 +844,7 @@
int64_t id_size = kv_msg->id.GetSize() / sizeof(int64_t);
int part_id = kv_msg->rank / group_count;
char* data_char = static_cast<char*>(kv_msg->data->data);
for (int64_t n = 0; n < id_size; ++n) {
memcpy(return_data + remote_ids_original[part_id][n] * row_size,
data_char + n * row_size,
row_size);
......
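The loop changes above follow a common OpenMP-portability pattern: hoist the container size into a signed variable so the parallel loop can use a signed index (MSVC's OpenMP 2.0 support requires a signed loop counter) and no signed/unsigned comparison warning is produced. In isolation, with an illustrative vector name:

const int64_t n = static_cast<int64_t>(vec.size());
#pragma omp parallel for
for (int64_t i = 0; i < n; ++i) {
  // ... loop body ...
}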