[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)

* add cub; array cumsum * CSRSliceRows * fix warning * operator << for ndarray; CSRSliceRows * add CSRIsSorted * add csr_sort * inplace coosort and outplace csrsort * WIP: coo is sorted * mv cuda_utils * add AllTrue utility * csr sort * coo sort * coo2csr for sorted coo arrays * CSRToCOO from sorted * pass tests for the new kernel changes * cannot use inplace sort * lint * try fix msvc error * Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC * stash * revert some hack * revert some changes * address comments * fix * fix to_block unittest * add todo note

[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)
* add cub; array cumsum * CSRSliceRows * fix warning * operator << for ndarray; CSRSliceRows * add CSRIsSorted * add csr_sort * inplace coosort and outplace csrsort * WIP: coo is sorted * mv cuda_utils * add AllTrue utility * csr sort * coo sort * coo2csr for sorted coo arrays * CSRToCOO from sorted * pass tests for the new kernel changes * cannot use inplace sort * lint * try fix msvc error * Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC * stash * revert some hack * revert some changes * address comments * fix * fix to_block unittest * add todo note
870da747 · Minjie Wang · GitHub · da8632ca · 870da747 · 870da747
Unverified Commit 870da747 authored Jun 28, 2020 by Minjie Wang Committed by GitHub Jun 28, 2020
20 changed files
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,10 @@
 [submodule "third_party/METIS"]
 	path = third_party/METIS
 	url = https://github.com/KarypisLab/METIS.git
+[submodule "third_party/cub"]
+	path = third_party/cub
+	url = https://github.com/NVlabs/cub.git
+	branch = 1.8.0
 [submodule "third_party/phmap"]
 	path = third_party/phmap
 	url = https://github.com/greg7mdp/parallel-hashmap.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,8 @@ include_directories("third_party/METIS/include/")
 include_directories("third_party/dmlc-core/include")
 include_directories("third_party/minigun/minigun")
 include_directories("third_party/minigun/third_party/moderngpu/src")
+include_directories("third_party/cub/")
+include_directories("third_party/phmap/")
 # initial variables
 set(DGL_LINKER_LIBS "")

--- a/include/dgl/aten/array_ops.h
+++ b/include/dgl/aten/array_ops.h
@@ -13,6 +13,7 @@
 #include <utility>
 #include <vector>
 #include <tuple>
+#include <string>
 #include "./types.h"
 namespace dgl {
@@ -131,9 +132,18 @@ IdArray HStack(IdArray arr1, IdArray arr2);
 * \tparam ValueType The type of return value.
 */
 template<typename ValueType>
-ValueType IndexSelect(NDArray array, uint64_t index);
+ValueType IndexSelect(NDArray array, int64_t index);
+/*!
+ * \brief Return the data under the index. In numpy notation, A[I]
+ */
 NDArray IndexSelect(NDArray array, IdArray index);
+/*!
+ * \brief Return the data from `start` (inclusive) to `end` (exclusive).
+ */
+NDArray IndexSelect(NDArray array, int64_t start, int64_t end);
 /*!
 * \brief Permute the elements of an array according to given indices.
 *
@@ -238,6 +248,27 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, ValueType pad_value);
 */
 std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
+/*!
+ * \brief Return the cumulative summation (or inclusive sum) of the input array.
+ *
+ * The first element out[0] is equal to the first element of the input array
+ * array[0]. The remaining elements are defined recursively, out[i] = out[i-1] + array[i].
+ * Hence, the result array length is the same as the input array length.
+ *
+ * If prepend_zero is true, then the first element is zero and the result array
+ * length is the input array length plus one. This is useful for creating
+ * an indptr array over a count array.
+ *
+ * \param array The 1D input array.
+ * \param prepend_zero If true, prepend a zero so the result has one more element than the input.
+ * \return Array after cumsum.
+ */
+IdArray CumSum(IdArray array, bool prepend_zero = false);
+/*!
+ * \brief Return a string that prints out some debug information.
+ */
+std::string ToDebugString(NDArray array);
 // inline implementations
 template <typename T>
 IdArray VecToIdArray(const std::vector<T>& vec,

--- a/include/dgl/aten/coo.h
+++ b/include/dgl/aten/coo.h
@@ -116,6 +116,16 @@ struct COOMatrix {
    CHECK_NO_OVERFLOW(row->dtype, num_rows);
    CHECK_NO_OVERFLOW(row->dtype, num_cols);
  }
+  /*!
+   * \brief Return this matrix with its arrays placed on the given device context.
+   *
+   * When the row array is already on \a ctx, a copy of this struct is returned
+   * as is. Otherwise the row and col arrays are copied to \a ctx; the data
+   * array is copied only when it is not a null array. The sorted flags are
+   * carried over unchanged.
+   *
+   * \param ctx The target device context.
+   * \return The matrix on the target context.
+   */
+  inline COOMatrix CopyTo(const DLContext& ctx) const {
+    const bool already_there = (ctx == row->ctx);
+    if (already_there) {
+      return *this;
+    }
+    auto row_on_ctx = row.CopyTo(ctx);
+    auto col_on_ctx = col.CopyTo(ctx);
+    // A null data array has nothing to copy; pass it through untouched.
+    auto data_on_ctx = aten::IsNullArray(data) ? data : data.CopyTo(ctx);
+    return COOMatrix(num_rows, num_cols,
+                     row_on_ctx, col_on_ctx, data_on_ctx,
+                     row_sorted, col_sorted);
+  }
 };
 ///////////////////////// COO routines //////////////////////////
@@ -141,6 +151,17 @@ inline bool COOHasData(COOMatrix csr) {
  return !IsNullArray(csr.data);
 }
+/*!
+ * \brief Check whether the COO is sorted.
+ *
+ * It returns two flags: one for whether the row is sorted;
+ * the other for whether the columns of each row are sorted
+ * if the first flag is true.
+ *
+ * Complexity: O(NNZ)
+ */
+std::pair<bool, bool> COOIsSorted(COOMatrix coo);
 /*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
 runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col);
@@ -161,6 +182,20 @@ COOMatrix COOTranspose(COOMatrix coo);
 * the result CSR matrix stores a shuffle index for how the entries
 * will be reordered in CSR. The i^th entry in the result CSR corresponds
 * to the CSR.data[i] th entry in the input COO.
+ *
+ * Conversion complexity: O(nnz)
+ *
+ * - The function first checks whether the input COO matrix is sorted
+ *   using a linear scan.
+ * - If the COO matrix is row sorted, the conversion can be done very
+ *   efficiently in a sequential scan. The result indices and data arrays 
+ *   are directly equal to the column and data arrays from the input.
+ * - If the COO matrix is further column sorted, the result CSR is
+ *   also column sorted.
+ * - Otherwise, the conversion is more costly but still is O(nnz).
+ *
+ * \param coo Input COO matrix.
+ * \return CSR matrix.
 */
 CSRMatrix COOToCSR(COOMatrix coo);
@@ -195,6 +230,21 @@ bool COOHasDuplicate(COOMatrix coo);
 */
 std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
+/*!
+ * \brief Sort the indices of a COO matrix in-place.
+ *
+ * The function sorts row indices in ascending order. If sort_column is true,
+ * col indices are sorted in ascending order too. The data array of the returned COOMatrix
+ * stores the shuffled index which could be used to fetch edge data.
+ *
+ * Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
+ * TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
+ *
+ * \param mat The coo matrix to sort.
+ * \param sort_column True if column index should be sorted too.
+ */
+void COOSort_(COOMatrix* mat, bool sort_column = false);
 /*!
 * \brief Sort the indices of a COO matrix.
 *
@@ -202,11 +252,23 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
 * col indices are sorted in ascending order too. The data array of the returned COOMatrix
 * stores the shuffled index which could be used to fetch edge data.
 *
+ * Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
+ * TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
+ *
 * \param mat The input coo matrix
 * \param sort_column True if column index should be sorted too.
 * \return COO matrix with index sorted.
 */
-COOMatrix COOSort(COOMatrix mat, bool sort_column = false);
+inline COOMatrix COOSort(COOMatrix mat, bool sort_column = false) {
+  // Nothing to do when the flags already guarantee the requested order:
+  // column-sorted always suffices, row-sorted suffices when columns are not requested.
+  const bool already_sorted = sort_column ? mat.col_sorted
+                                          : (mat.row_sorted || mat.col_sorted);
+  if (already_sorted) {
+    return mat;
+  }
+  // Sort cloned arrays so that the arrays of the input matrix are not mutated.
+  auto row_copy = mat.row.Clone();
+  auto col_copy = mat.col.Clone();
+  auto data_copy = COOHasData(mat) ? mat.data.Clone() : mat.data;
+  COOMatrix sorted_mat(mat.num_rows, mat.num_cols,
+                       row_copy, col_copy, data_copy,
+                       mat.row_sorted, mat.col_sorted);
+  COOSort_(&sorted_mat, sort_column);
+  return sorted_mat;
+}
 /*!
 * \brief Remove entries from COO matrix by entry indices (data indices)

--- a/include/dgl/aten/csr.h
+++ b/include/dgl/aten/csr.h
@@ -106,6 +106,17 @@ struct CSRMatrix {
    }
    CHECK_NO_OVERFLOW(indptr->dtype, num_rows);
    CHECK_NO_OVERFLOW(indptr->dtype, num_cols);
+    CHECK_EQ(indptr->shape[0], num_rows + 1);
+  }
+  /*!
+   * \brief Return this matrix with its arrays placed on the given device context.
+   *
+   * When the indptr array is already on \a ctx, a copy of this struct is
+   * returned as is. Otherwise the indptr and indices arrays are copied to
+   * \a ctx; the data array is copied only when it is not a null array. The
+   * sorted flag is carried over unchanged.
+   *
+   * \param ctx The target device context.
+   * \return The matrix on the target context.
+   */
+  inline CSRMatrix CopyTo(const DLContext& ctx) const {
+    const bool already_there = (ctx == indptr->ctx);
+    if (already_there) {
+      return *this;
+    }
+    auto indptr_on_ctx = indptr.CopyTo(ctx);
+    auto indices_on_ctx = indices.CopyTo(ctx);
+    // A null data array has nothing to copy; pass it through untouched.
+    auto data_on_ctx = aten::IsNullArray(data) ? data : data.CopyTo(ctx);
+    return CSRMatrix(num_rows, num_cols,
+                     indptr_on_ctx, indices_on_ctx, data_on_ctx,
+                     sorted);
  }
 };
@@ -134,6 +145,9 @@ inline bool CSRHasData(CSRMatrix csr) {
  return !IsNullArray(csr.data);
 }
+/*! \brief Whether the column indices of each row are sorted. */
+bool CSRIsSorted(CSRMatrix csr);
 /* \brief Get data. The return type is an ndarray due to possible duplicate entries. */
 runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col);
 /*!
@@ -155,6 +169,15 @@ CSRMatrix CSRTranspose(CSRMatrix csr);
 /*!
 * \brief Convert CSR matrix to COO matrix.
+ *
+ * Complexity: O(nnz)
+ * 
+ * - If data_as_order is false, the column and data arrays of the
+ *   result COO are equal to the indices and data arrays of the
+ *   input CSR. The result COO is also row sorted.
+ * - If the input CSR is further sorted, the result COO is also
+ *   column sorted.
+ *
 * \param csr Input csr matrix
 * \param data_as_order If true, the data array in the input csr matrix contains the order
 *                      by which the resulting COO tuples are stored. In this case, the
@@ -166,9 +189,8 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
 /*!
 * \brief Slice rows of the given matrix and return.
- * \param csr CSR matrix
+ *
- * \param start Start row id (inclusive)
+ * The sliced row IDs are relabeled to starting from zero.
- * \param end End row id (exclusive)
 *
 * Examples:
 * num_rows = 4
@@ -182,6 +204,11 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
 * num_cols = 4
 * indptr = [0, 1, 1]
 * indices = [2]
+ *
+ * \param csr CSR matrix
+ * \param start Start row id (inclusive)
+ * \param end End row id (exclusive)
+ * \return sliced rows stored in a CSR matrix
 */
 CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end);
 CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
@@ -192,6 +219,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
 * In numpy notation, given matrix M, row index array I, col index array J
 * This function returns the submatrix M[I, J].
 *
+ * The sliced row and column IDs are relabeled to starting from zero.
+ *
 * \param csr The input csr matrix
 * \param rows The row index to select
 * \param cols The col index to select
@@ -203,7 +232,10 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
 bool CSRHasDuplicate(CSRMatrix csr);
 /*!
- * \brief Sort the column index at each row in the ascending order.
+ * \brief Sort the column index at each row in ascending order in-place.
+ *
+ * Only the indices and data arrays (if available) will be mutated. The indptr array
+ * stays the same.
 *
 * Examples:
 * num_rows = 4
@@ -218,6 +250,22 @@ bool CSRHasDuplicate(CSRMatrix csr);
 */
 void CSRSort_(CSRMatrix* csr);
+/*!
+ * \brief Sort the column index at each row in ascending order (out-of-place).
+ *
+ * If the input is already flagged as sorted, it is returned unchanged.
+ * Otherwise the indices array and the data array (if available) are cloned
+ * and the clone is sorted with CSRSort_; the indptr array is passed through
+ * without cloning.
+ *
+ * \param csr The input csr matrix.
+ * \return A CSR matrix with sorted column indices and data arrays.
+ */
+inline CSRMatrix CSRSort(CSRMatrix csr) {
+  if (!csr.sorted) {
+    auto indices_copy = csr.indices.Clone();
+    auto data_copy = CSRHasData(csr) ? csr.data.Clone() : csr.data;
+    CSRMatrix sorted_csr(csr.num_rows, csr.num_cols,
+                         csr.indptr, indices_copy, data_copy,
+                         csr.sorted);
+    CSRSort_(&sorted_csr);
+    return sorted_csr;
+  }
+  return csr;
+}
 /*!
 * \brief Reorder the rows and columns according to the new row and column order.
 * \param csr The input csr matrix.

--- a/include/dgl/aten/macro.h
+++ b/include/dgl/aten/macro.h
@@ -252,4 +252,8 @@
      CHECK_LE((val), 0x7FFFFFFFL) << "int32 overflow for argument " << (#val) << "."; \
  } while (0);
+#define CHECK_IS_ID_ARRAY(VAR)  /* CHECK: VAR is 1D and IS_INT32 or IS_INT64 */  \
+  CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR)))               \
+    << "Expected argument " << (#VAR) << " to be an 1D integer array.";
 #endif  // DGL_ATEN_MACRO_H_
--- a/include/dgl/graph_interface.h
+++ b/include/dgl/graph_interface.h
@@ -10,6 +10,7 @@
 #include <vector>
 #include <utility>
 #include <algorithm>
+#include <memory>
 #include "./runtime/object.h"
 #include "array.h"

--- a/include/dgl/immutable_graph.h
+++ b/include/dgl/immutable_graph.h
@@ -12,6 +12,7 @@
 #include <utility>
 #include <tuple>
 #include <algorithm>
+#include <memory>
 #include "runtime/ndarray.h"
 #include "graph_interface.h"
 #include "lazy.h"

--- a/include/dgl/nodeflow.h
+++ b/include/dgl/nodeflow.h
@@ -8,6 +8,7 @@
 #include <vector>
 #include <string>
+#include <memory>
 #include "./runtime/object.h"
 #include "graph_interface.h"

--- a/include/dgl/runtime/ndarray.h
+++ b/include/dgl/runtime/ndarray.h
@@ -11,6 +11,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <memory>
 #include "c_runtime_api.h"
 #include "dlpack/dlpack.h"
@@ -157,6 +158,10 @@ class NDArray {
   * \return The array under another context.
   */
  inline NDArray CopyTo(const DLContext& ctx) const;
+  /*!
+   * \brief Return a new array with a copy of the content.
+   */
+  inline NDArray Clone() const;
  /*!
   * \brief Load NDArray from stream
   * \param stream The input data stream
@@ -410,6 +415,12 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const {
  return ret;
 }
+// Clone by copying the array to the context it already lives on.
+inline NDArray NDArray::Clone() const {
+  // Cloning an array without underlying data is a programming error.
+  CHECK(data_ != nullptr);
+  const DLContext& own_ctx = (*this)->ctx;
+  return CopyTo(own_ctx);
+}
 inline int NDArray::use_count() const {
  if (data_ == nullptr) return 0;
  return data_->ref_counter_.load(std::memory_order_relaxed);
@@ -627,6 +638,8 @@ dgl::runtime::NDArray operator <= (int64_t lhs, const dgl::runtime::NDArray& a2)
 dgl::runtime::NDArray operator == (int64_t lhs, const dgl::runtime::NDArray& a2);
 dgl::runtime::NDArray operator != (int64_t lhs, const dgl::runtime::NDArray& a2);
+std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array);
 ///////////////// Operator overloading for DLDataType /////////////////
 /*! \brief Check whether two data types are the same.*/

--- a/include/dgl/runtime/packed_func.h
+++ b/include/dgl/runtime/packed_func.h
@@ -13,6 +13,7 @@
 #include <string>
 #include <limits>
 #include <memory>
+#include <utility>
 #include <type_traits>
 #include "c_runtime_api.h"
 #include "module.h"

--- a/include/dgl/runtime/smart_ptr_serializer.h
+++ b/include/dgl/runtime/smart_ptr_serializer.h
@@ -10,6 +10,7 @@
 #include <dgl/graph_serializer.h>
 #include <dmlc/io.h>
 #include <dmlc/serializer.h>
+#include <memory>
 namespace dmlc {
 namespace serializer {

--- a/include/dgl/zerocopy_serializer.h
+++ b/include/dgl/zerocopy_serializer.h
@@ -17,31 +17,36 @@
 #include <tuple>
 #include <utility>
 #include <vector>
+#include <memory>
 #include "dmlc/logging.h"
 namespace dgl {
-/* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
+/*!
-dmlc::MemoryStringStream. This class supports serializing and deserializing
+ *
-NDArrays stored in shared memory. If the stream is created for
+ * StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
-sending/recving data through network, the data pointer of the NDArray will be
+ * dmlc::MemoryStringStream. This class supports serializing and deserializing
-transmitted directly without and copy. Otherwise, the stream is for
+ * NDArrays stored in shared memory. If the stream is created for
-sending/recving data to another process on the same machine, so if an NDArray
+ * sending/recving data through network, the data pointer of the NDArray will be
-is stored in shared memory, it will just record the shared memory name
+ * transmitted directly without any copy. Otherwise, the stream is for
-instead of the actual data buffer.
+ * sending/recving data to another process on the same machine, so if an NDArray
-For example:
+ * is stored in shared memory, it will just record the shared memory name
-std::string blob;
+ * instead of the actual data buffer.
-// Send to local
+ *
-StreamWithBuffer strm(&blob, false);
+ * For example:
-// Send to remote
+ *
-StreamWithBuffer strm(&blob, true);
+ * std::string blob;
-// Receive from local
+ * // Send to local
-StreamWithBuffer strm(&blob, false);
+ * StreamWithBuffer strm(&blob, false);
-// Receive from remote
+ * // Send to remote
-std::vector<void*> ptr_list
+ * StreamWithBuffer strm(&blob, true);
-StreamWithBuffer strm(&blob, ptr_list);
+ * // Receive from local
-*/
+ * StreamWithBuffer strm(&blob, false);
+ * // Receive from remote
+ * std::vector<void*> ptr_list
+ * StreamWithBuffer strm(&blob, ptr_list);
+ */
 class StreamWithBuffer : public dmlc::SeekStream {
 public:
  // Buffer type. Storing NDArray to maintain the reference counting to ensure

--- a/src/array/array.cc
+++ b/src/array/array.cc
@@ -8,6 +8,8 @@
 #include <dgl/packed_func_ext.h>
 #include <dgl/runtime/container.h>
 #include <dgl/runtime/shared_mem.h>
+#include <dgl/runtime/device_api.h>
+#include <sstream>
 #include "../c_api_common.h"
 #include "./array_op.h"
 #include "./arith.h"
@@ -100,8 +102,10 @@ NDArray IndexSelect(NDArray array, IdArray index) {
 }
 template<typename ValueType>
-ValueType IndexSelect(NDArray array, uint64_t index) {
+ValueType IndexSelect(NDArray array, int64_t index) {
  CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
+  CHECK(index >= 0 && index < array.NumElements())
+    << "Index " << index << " is out of bound.";
  ValueType ret = 0;
  ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "IndexSelect", {
    ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
@@ -110,12 +114,30 @@ ValueType IndexSelect(NDArray array, uint64_t index) {
  });
  return ret;
 }
-template int32_t IndexSelect<int32_t>(NDArray array, uint64_t index);
+template int32_t IndexSelect<int32_t>(NDArray array, int64_t index);
-template int64_t IndexSelect<int64_t>(NDArray array, uint64_t index);
+template int64_t IndexSelect<int64_t>(NDArray array, int64_t index);
-template uint32_t IndexSelect<uint32_t>(NDArray array, uint64_t index);
+template uint32_t IndexSelect<uint32_t>(NDArray array, int64_t index);
-template uint64_t IndexSelect<uint64_t>(NDArray array, uint64_t index);
+template uint64_t IndexSelect<uint64_t>(NDArray array, int64_t index);
-template float IndexSelect<float>(NDArray array, uint64_t index);
+template float IndexSelect<float>(NDArray array, int64_t index);
-template double IndexSelect<double>(NDArray array, uint64_t index);
+template double IndexSelect<double>(NDArray array, int64_t index);
+/*!
+ * Return a new array holding the elements in the half-open range
+ * [start, end) of a 1D array, allocated on the same context as the input.
+ *
+ * Requires 0 <= start <= end <= array.NumElements(). An empty range
+ * (start == end), including one located at the very end of the array or
+ * taken from an empty array, yields an empty array.
+ */
+NDArray IndexSelect(NDArray array, int64_t start, int64_t end) {
+  CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
+  const int64_t num_elems = array.NumElements();
+  // `start` may equal the array length so that the empty range [n, n) is
+  // accepted just like any other empty half-open range.
+  CHECK(start >= 0 && start <= num_elems)
+    << "Index " << start << " is out of bound.";
+  CHECK(end >= 0 && end <= num_elems)
+    << "Index " << end << " is out of bound.";
+  CHECK_LE(start, end);
+  const int64_t len = end - start;
+  NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
+  if (len == 0)
+    return ret;  // nothing to copy for an empty slice
+  auto device = runtime::DeviceAPI::Get(array->ctx);
+  ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
+    // Offsets and sizes are expressed in bytes.
+    device->CopyDataFromTo(array->data, start * sizeof(DType),
+                           ret->data, 0, len * sizeof(DType),
+                           array->ctx, ret->ctx, array->dtype, nullptr);
+  });
+  return ret;
+}
 NDArray Scatter(NDArray array, IdArray indices) {
  NDArray ret;
@@ -181,6 +203,31 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
  return ret;
 }
+// Dispatch CumSum to the implementation matching the device type and the
+// integer id type of the input array.
+IdArray CumSum(IdArray array, bool prepend_zero) {
+  IdArray cumsum;
+  ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "CumSum", {
+    ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
+      cumsum = impl::CumSum<XPU, IdType>(array, prepend_zero);
+    });
+  });
+  return cumsum;
+}
+// Build a short textual summary of an array: up to the first ten elements
+// (followed by "..." when there are more), then the dtype and the context of
+// the original array.
+std::string ToDebugString(NDArray array) {
+  const int64_t kMaxShown = 10;
+  // Elements are read from a CPU copy of the array.
+  NDArray host = array.CopyTo(DLContext{kDLCPU, 0});
+  const int64_t total = host.NumElements();
+  const int64_t shown = (total < kMaxShown) ? total : kMaxShown;
+  std::ostringstream oss;
+  oss << "array([";
+  ATEN_DTYPE_SWITCH(host->dtype, DType, "array", {
+    int64_t i = 0;
+    while (i < shown) {
+      oss << host.Ptr<DType>()[i] << ", ";
+      ++i;
+    }
+  });
+  if (total > kMaxShown) {
+    oss << "...";
+  }
+  oss << "], dtype=" << array->dtype << ", ctx=" << array->ctx << ")";
+  return oss.str();
+}
 ///////////////////////// CSR routines //////////////////////////
 bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
@@ -250,6 +297,16 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
  return ret;
 }
+// Check whether the CSR matrix is sorted, dispatching to the implementation
+// selected by ATEN_CSR_SWITCH_CUDA.
+bool CSRIsSorted(CSRMatrix csr) {
+  const int64_t nnz = csr.indices->shape[0];
+  // Zero or one stored index cannot be out of order.
+  if (nnz < 2) {
+    return true;
+  }
+  bool is_sorted = false;
+  ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRIsSorted", {
+    is_sorted = impl::CSRIsSorted<XPU, IdType>(csr);
+  });
+  return is_sorted;
+}
 NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) {
  CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
  CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
@@ -318,7 +375,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
  CHECK(end >= 0 && end <= csr.num_rows) << "Invalid end index: " << end;
  CHECK_GE(end, start);
  CSRMatrix ret;
-  ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
+  ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
    ret = impl::CSRSliceRows<XPU, IdType>(csr, start, end);
  });
  return ret;
@@ -328,7 +385,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
  CHECK_SAME_DTYPE(csr.indices, rows);
  CHECK_SAME_CONTEXT(csr.indices, rows);
  CSRMatrix ret;
-  ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
+  ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
    ret = impl::CSRSliceRows<XPU, IdType>(csr, rows);
  });
  return ret;
@@ -347,7 +404,9 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, NDArray rows, NDArray cols) {
 }
 void CSRSort_(CSRMatrix* csr) {
-  ATEN_CSR_SWITCH(*csr, XPU, IdType, "CSRSort_", {
+  if (csr->sorted)
+    return;  // already flagged as sorted; skip the dispatch entirely
+  ATEN_CSR_SWITCH_CUDA(*csr, XPU, IdType, "CSRSort_", {
    impl::CSRSort_<XPU, IdType>(csr);
  });
 }
@@ -509,13 +568,23 @@ COOMatrix COOSliceMatrix(COOMatrix coo, NDArray rows, NDArray cols) {
  return ret;
 }
-COOMatrix COOSort(COOMatrix mat, bool sort_column) {
+void COOSort_(COOMatrix* mat, bool sort_column) {
-  COOMatrix ret;
+  if ((mat->row_sorted && !sort_column) || mat->col_sorted)
-  ATEN_XPU_SWITCH_CUDA(mat.row->ctx.device_type, XPU, "COOSort", {
+    return;  // the sorted flags already guarantee the requested order
-    ATEN_ID_TYPE_SWITCH(mat.row->dtype, IdType, {
+  ATEN_XPU_SWITCH_CUDA(mat->row->ctx.device_type, XPU, "COOSort_", {
-      ret = impl::COOSort<XPU, IdType>(mat, sort_column);
+    ATEN_ID_TYPE_SWITCH(mat->row->dtype, IdType, {
+      impl::COOSort_<XPU, IdType>(mat, sort_column);  // sorts *mat in place
    });
  });
+}
+std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
+  if (coo.row->shape[0] <= 1)  // zero or one entry cannot be out of order
+    return {true, true};
+  std::pair<bool, bool> ret;
+  ATEN_COO_SWITCH_CUDA(coo, XPU, IdType, "COOIsSorted", {
+    ret = impl::COOIsSorted<XPU, IdType>(coo);
+  });
  return ret;
 }
@@ -709,3 +778,7 @@ DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLExistSharedMemArray")
 }  // namespace aten
 }  // namespace dgl
+// Stream an NDArray using the text produced by dgl::aten::ToDebugString.
+std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array) {
+  const std::string text = dgl::aten::ToDebugString(array);
+  os << text;
+  return os;
+}
--- a/src/array/array_arith.cc
+++ b/src/array/array_arith.cc
@@ -3,8 +3,8 @@
 * \file array/array_arith.cc
 * \brief DGL array arithmetic operations
 */
-#include <dgl/array.h>
 #include <dgl/packed_func_ext.h>
+#include <dgl/runtime/ndarray.h>
 #include <dgl/runtime/container.h>
 #include "../c_api_common.h"
 #include "./array_op.h"

--- a/src/array/array_op.h
+++ b/src/array/array_op.h
@@ -44,7 +44,7 @@ template <DLDeviceType XPU, typename DType, typename IdType>
 NDArray IndexSelect(NDArray array, IdArray index);
 template <DLDeviceType XPU, typename DType>
-DType IndexSelect(NDArray array, uint64_t index);
+DType IndexSelect(NDArray array, int64_t index);
 template <DLDeviceType XPU, typename DType, typename IdType>
 NDArray Scatter(NDArray array, IdArray indices);
@@ -61,6 +61,9 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value);
 template <DLDeviceType XPU, typename DType, typename IdType>
 std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
+template <DLDeviceType XPU, typename IdType>
+IdArray CumSum(IdArray array, bool prepend_zero);
 // sparse arrays
 template <DLDeviceType XPU, typename IdType>
@@ -84,6 +87,9 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row);
 template <DLDeviceType XPU, typename IdType>
 runtime::NDArray CSRGetRowData(CSRMatrix csr, int64_t row);
+template <DLDeviceType XPU, typename IdType>
+bool CSRIsSorted(CSRMatrix csr);
 template <DLDeviceType XPU, typename IdType>
 runtime::NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col);
@@ -187,7 +193,10 @@ template <DLDeviceType XPU, typename IdType>
 std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
 template <DLDeviceType XPU, typename IdType>
-COOMatrix COOSort(COOMatrix mat, bool sort_column);
+void COOSort_(COOMatrix* mat, bool sort_column);
+template <DLDeviceType XPU, typename IdType>
+std::pair<bool, bool> COOIsSorted(COOMatrix coo);
 template <DLDeviceType XPU, typename IdType>
 COOMatrix COORemove(COOMatrix coo, IdArray entries);

--- a/src/array/cpu/array_cumsum.cc
+++ b/src/array/cpu/array_cumsum.cc
+/*!
+ *  Copyright (c) 2020 by Contributors
+ * \file array/cpu/array_cumsum.cc
+ * \brief Array cumsum CPU implementation
+ */
+#include <dgl/array.h>
+namespace dgl {
+using runtime::NDArray;
+namespace aten {
+namespace impl {
+/*!
+ * \brief CPU implementation of the cumulative (inclusive) sum of a 1D id array.
+ *
+ * Without prepend_zero, out[i] = array[0] + ... + array[i] and the output has
+ * the same length as the input; an empty input is returned as is.
+ * With prepend_zero, out[0] = 0 and out[i + 1] = array[0] + ... + array[i], so
+ * the output always has one more element than the input. In particular an
+ * empty input yields the one-element array [0].
+ *
+ * \param array The 1D input array.
+ * \param prepend_zero Whether to prepend a zero to the result.
+ * \return Array after cumsum.
+ */
+template <DLDeviceType XPU, typename IdType>
+IdArray CumSum(IdArray array, bool prepend_zero) {
+  const int64_t len = array.NumElements();
+  if (len == 0 && !prepend_zero)
+    return array;
+  // The output is shifted by one slot when a leading zero is requested.
+  const int64_t offset = prepend_zero ? 1 : 0;
+  IdArray ret = aten::NewIdArray(len + offset, array->ctx, array->dtype.bits);
+  const IdType* in_d = array.Ptr<IdType>();
+  IdType* out_d = ret.Ptr<IdType>();
+  if (prepend_zero)
+    out_d[0] = 0;
+  IdType running = 0;
+  for (int64_t i = 0; i < len; ++i) {
+    running += in_d[i];
+    out_d[i + offset] = running;
+  }
+  return ret;
+}
+template IdArray CumSum<kDLCPU, int32_t>(IdArray, bool);
+template IdArray CumSum<kDLCPU, int64_t>(IdArray, bool);
+}  // namespace impl
+}  // namespace aten
+}  // namespace dgl
--- a/src/array/cpu/array_index_select.cc
+++ b/src/array/cpu/array_index_select.cc
@@ -35,20 +35,16 @@ template NDArray IndexSelect<kDLCPU, double, int32_t>(NDArray, IdArray);
 template NDArray IndexSelect<kDLCPU, double, int64_t>(NDArray, IdArray);
 template <DLDeviceType XPU, typename DType>
-DType IndexSelect(NDArray array, uint64_t index) {
+DType IndexSelect(NDArray array, int64_t index) {  // reads data[index]; no bounds check here
  const DType* data = static_cast<DType*>(array->data);
  return data[index];
 }
-template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, uint64_t index);
+template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, int64_t index);
-template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, uint64_t index);
+template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, int64_t index);
-template uint32_t IndexSelect<kDLCPU, uint32_t>(NDArray array, uint64_t index);
+template float IndexSelect<kDLCPU, float>(NDArray array, int64_t index);
-template uint64_t IndexSelect<kDLCPU, uint64_t>(NDArray array, uint64_t index);
+template double IndexSelect<kDLCPU, double>(NDArray array, int64_t index);
-template float IndexSelect<kDLCPU, float>(NDArray array, uint64_t index);
-template double IndexSelect<kDLCPU, double>(NDArray array, uint64_t index);
-};  // namespace impl
+}  // namespace impl
+}  // namespace aten
-};  // namespace aten
+}  // namespace dgl
-};  // namespace dgl
--- a/src/array/cpu/array_pack.cc
+++ b/src/array/cpu/array_pack.cc
@@ -76,8 +76,6 @@ template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, int64_t>(NDArray, in
 template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, float>(NDArray, float);
 template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, double>(NDArray, double);
-};  // namespace impl
+}  // namespace impl
+}  // namespace aten
-};  // namespace aten
+}  // namespace dgl
-};  // namespace dgl
--- a/src/array/cpu/array_utils.h
+++ b/src/array/cpu/array_utils.h
@@ -6,12 +6,12 @@
 #ifndef DGL_ARRAY_CPU_ARRAY_UTILS_H_
 #define DGL_ARRAY_CPU_ARRAY_UTILS_H_
-#include <dgl/array.h>
+#include <dgl/aten/types.h>
+#include <parallel_hashmap/phmap.h>
 #include <vector>
 #include <unordered_map>
 #include <utility>
 #include "../../c_api_common.h"
-#include "../third_party/phmap/parallel_hashmap/phmap.h"
 namespace dgl {
 namespace aten {