Unverified commit f4c79f7f, authored by Rhett Ying, committed by GitHub

[Performance] improve coo2csr space complexity when row is not sorted (#3326)



* [Performance] improve coo2csr space complexity when row is not sorted

* [Perf] replace std::vector<> by NDArray

* keep both impl of unsorted coo to csr and choose according to graph density dynamically

* refine criteria to choose btw Unsorted algos
Co-authored-by: Ubuntu <ubuntu@ip-172-31-34-27.us-west-2.compute.internal>
parent a609b4f0
@@ -302,189 +302,335 @@ template COOMatrix COOTranspose<kDLCPU, int32_t>(COOMatrix coo);
template COOMatrix COOTranspose<kDLCPU, int64_t>(COOMatrix coo);
///////////////////////////// COOToCSR /////////////////////////////
namespace {

template <class IdType> CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
  const int64_t N = coo.num_rows;
  const int64_t NNZ = coo.row->shape[0];
  const IdType *const row_data = static_cast<IdType *>(coo.row->data);
  const IdType *const col_data = static_cast<IdType *>(coo.col->data);
  const IdType *const data =
      COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;

  NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
  NDArray ret_indices = coo.col;
  NDArray ret_data = data == nullptr
                         ? NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx)
                         : coo.data;

  // compute indptr
  IdType *const Bp = static_cast<IdType *>(ret_indptr->data);
  Bp[0] = 0;

  IdType *const fill_data =
      data ? nullptr : static_cast<IdType *>(ret_data->data);

  if (NNZ > 0) {
    auto num_threads = omp_get_max_threads();
    parallel_for(0, num_threads, [&](int b, int e) {
      for (auto thread_id = b; thread_id < e; ++thread_id) {
        // We partition the set of non-zeros among the threads.
        const int64_t nz_chunk = (NNZ + num_threads - 1) / num_threads;
        const int64_t nz_start = thread_id * nz_chunk;
        const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);

        // Each thread searches the row array for a change, and marks its
        // location in Bp. Threads other than the first start at the last
        // index covered by the previous thread, in order to detect changes
        // in the row array between thread partitions. This means that each
        // thread after the first searches the range [nz_start-1, nz_end).
        // That is, if we had 10 non-zeros and 4 threads, the indexes
        // searched by each thread would be:
        // 0: [0, 1, 2]
        // 1: [2, 3, 4, 5]
        // 2: [5, 6, 7, 8]
        // 3: [8, 9]
        //
        // That way, if the row array were [0, 0, 1, 2, 2, 2, 4, 5, 5, 6],
        // each change in row would be captured by one thread:
        //
        // 0: [0, 0, 1] - row 0
        // 1: [1, 2, 2, 2] - row 1
        // 2: [2, 4, 5, 5] - rows 2, 3, and 4
        // 3: [5, 6] - rows 5 and 6
        //
        int64_t row = 0;
        if (nz_start < nz_end) {
          row = nz_start == 0 ? 0 : row_data[nz_start - 1];
          for (int64_t i = nz_start; i < nz_end; ++i) {
            while (row != row_data[i]) {
              ++row;
              Bp[row] = i;
            }
          }

          // We will not detect the row change for the last row, nor any
          // empty rows at the end of the matrix, so the last active thread
          // needs to mark all remaining rows in Bp with NNZ.
          if (nz_end == NNZ) {
            while (row < N) {
              ++row;
              Bp[row] = NNZ;
            }
          }

          if (fill_data) {
            // TODO(minjie): Many of our current implementations assume that
            // CSR must have a data array. This is a temporary workaround.
            // Remove this after:
            //   - The old immutable graph implementation is deprecated.
            //   - The old binary reduce kernel is deprecated.
            std::iota(fill_data + nz_start, fill_data + nz_end, nz_start);
          }
        }
      }
    });
  } else {
    std::fill(Bp, Bp + N + 1, 0);
  }

  return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
                   ret_data, coo.col_sorted);
}
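
// Illustrative sketch (not part of the commit): the parallel row-change
// search in SortedCOOToCSR computes the same indptr as this single-threaded
// reference, shown only to clarify the invariant that Bp[r] holds the first
// non-zero index whose row is >= r when the COO rows are sorted. The names
// and the std::vector interface are assumptions made for the example.
inline std::vector<int64_t> SortedRowsToIndptrSketch(
    const std::vector<int64_t> &row, int64_t num_rows) {
  std::vector<int64_t> indptr(num_rows + 1, 0);
  int64_t r = 0;
  for (int64_t i = 0; i < static_cast<int64_t>(row.size()); ++i) {
    while (r != row[i]) {  // a row change closes every row smaller than row[i]
      ++r;
      indptr[r] = i;
    }
  }
  while (r < num_rows) {  // close the last row and any trailing empty rows
    ++r;
    indptr[r] = static_cast<int64_t>(row.size());
  }
  return indptr;
}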

template <class IdType> CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
  const int64_t N = coo.num_rows;
  const int64_t NNZ = coo.row->shape[0];
  const IdType *const row_data = static_cast<IdType *>(coo.row->data);
  const IdType *const col_data = static_cast<IdType *>(coo.col->data);
  const IdType *const data =
      COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;

  NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
  NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  NDArray ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  IdType *const Bp = static_cast<IdType *>(ret_indptr->data);
  Bp[0] = 0;
  IdType *const Bi = static_cast<IdType *>(ret_indices->data);
  IdType *const Bx = static_cast<IdType *>(ret_data->data);

  // store sorted data and original index.
  NDArray sorted_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  NDArray sorted_data_pos = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  IdType *const Sx = static_cast<IdType *>(sorted_data->data);
  IdType *const Si = static_cast<IdType *>(sorted_data_pos->data);

  // record row_idx in each thread.
  std::vector<std::vector<int64_t>> p_sum;

#pragma omp parallel
  {
    const int num_threads = omp_get_num_threads();
    const int thread_id = omp_get_thread_num();
    CHECK_LT(thread_id, num_threads);

    const int64_t nz_chunk = (NNZ + num_threads - 1) / num_threads;
    const int64_t nz_start = thread_id * nz_chunk;
    const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);

    const int64_t n_chunk = (N + num_threads - 1) / num_threads;
    const int64_t n_start = thread_id * n_chunk;
    const int64_t n_end = std::min(N, n_start + n_chunk);

    // init Bp as zero; one shift is always applied when accessing Bp as
    // its length is N+1.
    for (auto i = n_start; i < n_end; ++i) {
      Bp[i + 1] = 0;
    }

#pragma omp master
    { p_sum.resize(num_threads); }
#pragma omp barrier

    // iterate on NNZ data and count row_idx.
    p_sum[thread_id].resize(num_threads, 0);
    for (auto i = nz_start; i < nz_end; ++i) {
      const int64_t row_idx = row_data[i];
      const int64_t row_thread_id = row_idx / n_chunk;
      ++p_sum[thread_id][row_thread_id];
    }

#pragma omp barrier
#pragma omp master
    // accumulate row_idx.
    {
      int64_t cum = 0;
      for (size_t j = 0; j < p_sum.size(); ++j) {
        for (size_t i = 0; i < p_sum.size(); ++i) {
          auto tmp = p_sum[i][j];
          p_sum[i][j] = cum;
          cum += tmp;
        }
      }
      CHECK_EQ(cum, NNZ);
    }
#pragma omp barrier

    // sort data by row_idx and place into Sx/Si.
    std::vector<int64_t> data_pos(p_sum[thread_id]);
    for (auto i = nz_start; i < nz_end; ++i) {
      const int64_t row_idx = row_data[i];
      const int64_t row_thread_id = row_idx / n_chunk;
      const int64_t pos = data_pos[row_thread_id]++;
      Sx[pos] = data == nullptr ? i : data[i];
      Si[pos] = i;
    }
#pragma omp barrier

    // Now we're able to do coo2csr on sorted data in each thread in parallel.
    // compute data number on each row_idx.
    const int64_t i_start = p_sum[0][thread_id];
    const int64_t i_end =
        thread_id + 1 == num_threads ? NNZ : p_sum[0][thread_id + 1];
    for (auto i = i_start; i < i_end; ++i) {
      const int64_t row_idx = row_data[Si[i]];
      ++Bp[row_idx + 1];
    }
    // accumulate on each row
    IdType cumsum = 0;
    for (auto i = n_start; i < n_end; ++i) {
      const auto tmp = Bp[i + 1];
      Bp[i + 1] = cumsum;
      cumsum += tmp;
    }
    // update Bi/Bp/Bx
    for (auto i = i_start; i < i_end; ++i) {
      const int64_t row_idx = row_data[Si[i]];
      const int64_t dest = (Bp[row_idx + 1]++) + i_start;
      Bi[dest] = col_data[Si[i]];
      Bx[dest] = Sx[i];
    }
    for (auto i = n_start; i < n_end; ++i) {
      Bp[i + 1] += i_start;
    }
  }

  return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
                   ret_data, coo.col_sorted);
}
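
// Illustrative sketch (not part of the commit): the p_sum bookkeeping in
// UnSortedSparseCOOToCSR is an exclusive prefix sum over the
// (counting-thread, owner-thread) count matrix, taken in owner-major order,
// so every counting thread learns where to write the entries it found for
// each owner thread. A sequential version of that scan, with assumed names:
inline void ExclusiveScanOwnerMajorSketch(
    std::vector<std::vector<int64_t>> *counts) {
  // (*counts)[t][b]: before the scan, how many entries counting thread t
  // found for owner thread b; after the scan, the global offset at which
  // thread t starts writing those entries.
  int64_t cum = 0;
  for (size_t b = 0; b < counts->size(); ++b) {
    for (size_t t = 0; t < counts->size(); ++t) {
      const int64_t tmp = (*counts)[t][b];
      (*counts)[t][b] = cum;
      cum += tmp;
    }
  }
}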

template <class IdType> CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
  const int64_t N = coo.num_rows;
  const int64_t NNZ = coo.row->shape[0];
  const IdType *const row_data = static_cast<IdType *>(coo.row->data);
  const IdType *const col_data = static_cast<IdType *>(coo.col->data);
  const IdType *const data =
      COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;

  NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
  NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  NDArray ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  IdType *const Bp = static_cast<IdType *>(ret_indptr->data);
  Bp[0] = 0;
  IdType *const Bi = static_cast<IdType *>(ret_indices->data);
  IdType *const Bx = static_cast<IdType *>(ret_data->data);

  // the offset within each row that each thread will write to
  std::vector<std::vector<IdType>> local_ptrs;
  std::vector<int64_t> thread_prefixsum;

#pragma omp parallel
  {
    const int num_threads = omp_get_num_threads();
    const int thread_id = omp_get_thread_num();
    CHECK_LT(thread_id, num_threads);

    const int64_t nz_chunk = (NNZ + num_threads - 1) / num_threads;
    const int64_t nz_start = thread_id * nz_chunk;
    const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);

    const int64_t n_chunk = (N + num_threads - 1) / num_threads;
    const int64_t n_start = thread_id * n_chunk;
    const int64_t n_end = std::min(N, n_start + n_chunk);

#pragma omp master
    {
      local_ptrs.resize(num_threads);
      thread_prefixsum.resize(num_threads + 1);
    }
#pragma omp barrier
    local_ptrs[thread_id].resize(N, 0);

    for (int64_t i = nz_start; i < nz_end; ++i) {
      ++local_ptrs[thread_id][row_data[i]];
    }

#pragma omp barrier
    // compute prefixsum in parallel
    int64_t sum = 0;
    for (int64_t i = n_start; i < n_end; ++i) {
      IdType tmp = 0;
      for (int j = 0; j < num_threads; ++j) {
        std::swap(tmp, local_ptrs[j][i]);
        tmp += local_ptrs[j][i];
      }
      sum += tmp;
      Bp[i + 1] = sum;
    }
    thread_prefixsum[thread_id + 1] = sum;

#pragma omp barrier
#pragma omp master
    {
      for (int64_t i = 0; i < num_threads; ++i) {
        thread_prefixsum[i + 1] += thread_prefixsum[i];
      }
      CHECK_EQ(thread_prefixsum[num_threads], NNZ);
    }
#pragma omp barrier

    sum = thread_prefixsum[thread_id];
    for (int64_t i = n_start; i < n_end; ++i) {
      Bp[i + 1] += sum;
    }

#pragma omp barrier
    for (int64_t i = nz_start; i < nz_end; ++i) {
      const IdType r = row_data[i];
      const int64_t index = Bp[r] + local_ptrs[thread_id][r]++;
      Bi[index] = col_data[i];
      Bx[index] = data ? data[i] : i;
    }
  }
  CHECK_EQ(Bp[N], NNZ);

  return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
                   ret_data, coo.col_sorted);
}
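
// Illustrative sketch (not part of the commit): UnSortedDenseCOOToCSR is a
// parallel counting sort. A sequential reference with the same three phases
// (count per row, exclusive prefix sum, scatter), using std::vector instead
// of NDArray and assumed names:
inline void CooToCsrCountingSortSketch(const std::vector<int64_t> &row,
                                       const std::vector<int64_t> &col,
                                       int64_t num_rows,
                                       std::vector<int64_t> *indptr,
                                       std::vector<int64_t> *indices) {
  indptr->assign(num_rows + 1, 0);
  for (const int64_t r : row) ++(*indptr)[r + 1];  // count entries per row
  for (int64_t i = 0; i < num_rows; ++i)           // exclusive prefix sum
    (*indptr)[i + 1] += (*indptr)[i];
  indices->assign(row.size(), 0);
  std::vector<int64_t> offset(num_rows, 0);        // next free slot per row
  for (size_t i = 0; i < row.size(); ++i) {
    const int64_t dest = (*indptr)[row[i]] + offset[row[i]]++;
    (*indices)[dest] = col[i];
  }
}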
} // namespace
/*
  Implementation and complexity details. N: num_nodes, NNZ: num_edges,
  P: num_threads.
  1. If the COO rows are sorted, SortedCOOToCSR<> is applied.
     Time: O(NNZ/P), space: O(1).
  2. If the COO rows are NOT sorted and the graph is sparse (low average
     degree), UnSortedSparseCOOToCSR<> is applied.
     Time: O(NNZ/P + N/P + P^2), space: O(NNZ + P^2).
  3. If the COO rows are NOT sorted and the graph is dense (medium/high
     average degree), UnSortedDenseCOOToCSR<> is applied.
     Time: O(NNZ/P + N/P), space: O(NNZ + N*P).
*/

template <DLDeviceType XPU, typename IdType>
CSRMatrix COOToCSR(COOMatrix coo) {
  if (!coo.row_sorted) {
    const int64_t num_threads = omp_get_num_threads();
    const int64_t num_nodes = coo.num_rows;
    const int64_t num_edges = coo.row->shape[0];
    // Besides graph density, num_threads is also taken into account. The
    // criterion below follows from the time/space complexity difference
    // between the two unsorted algorithms.
    if (num_threads * num_nodes > 4 * num_edges) {
      return UnSortedSparseCOOToCSR<IdType>(coo);
    }
    return UnSortedDenseCOOToCSR<IdType>(coo);
  }
  return SortedCOOToCSR<IdType>(coo);
}
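
// Illustrative check (not part of the commit): with 16 threads, a graph of
// 1M nodes and 2M edges satisfies num_threads * num_nodes > 4 * num_edges,
// so it takes the UnSortedSparseCOOToCSR path, trading the dense variant's
// O(N * P) scratch space (about 16M counters here) for O(NNZ + P^2).
static_assert(16LL * 1000000 > 4LL * 2000000,
              "example sizes that select the sparse unsorted conversion");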
template CSRMatrix COOToCSR<kDLCPU, int32_t>(COOMatrix coo);
......
#include <gtest/gtest.h>
#include <dmlc/omp.h>
#include <dgl/array.h>
#include "./common.h"
@@ -17,7 +18,7 @@ aten::CSRMatrix CSR1(DLContext ctx = CTX) {
return aten::CSRMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 4, 1}), sizeof(IDX)*8, ctx),
false);
}
@@ -112,6 +113,41 @@ aten::COOMatrix COO3(DLContext ctx) {
aten::VecToIdArray(std::vector<IDX>({2, 2, 1, 0, 3, 2}), sizeof(IDX)*8, ctx));
}
struct SparseCOOCSR {
static constexpr uint64_t NUM_ROWS = 100;
static constexpr uint64_t NUM_COLS = 150;
static constexpr uint64_t NUM_NZ = 5;
template <typename IDX>
static aten::COOMatrix COOSparse(const DLContext &ctx = CTX) {
return aten::COOMatrix(NUM_ROWS, NUM_COLS,
aten::VecToIdArray(std::vector<IDX>({0, 1, 2, 3, 4}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 3, 4, 5}),
sizeof(IDX) * 8, ctx));
}
template <typename IDX>
static aten::CSRMatrix CSRSparse(const DLContext &ctx = CTX) {
auto &&indptr = std::vector<IDX>(NUM_ROWS + 1, NUM_NZ);
for (size_t i = 0; i < NUM_NZ; ++i) {
indptr[i + 1] = static_cast<IDX>(i + 1);
}
indptr[0] = 0;
return aten::CSRMatrix(NUM_ROWS, NUM_COLS,
aten::VecToIdArray(indptr, sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 3, 4, 5}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 1, 1, 1, 1}),
sizeof(IDX) * 8, ctx),
false);
}
};
bool isSparseCOO(const int64_t &num_threads, const int64_t &num_nodes,
const int64_t &num_edges) {
// refer to COOToCSR<>() in ~dgl/src/array/cpu/spmat_op_impl_coo for details.
return num_threads * num_nodes > 4 * num_edges;
}
} // namespace
template <typename IDX>
@@ -119,9 +155,13 @@ void _TestCOOToCSR(DLContext ctx) {
auto coo = COO1<IDX>(ctx);
auto csr = CSR1<IDX>(ctx);
auto tcsr = aten::COOToCSR(coo);
ASSERT_EQ(coo.num_rows, csr.num_rows);
ASSERT_EQ(coo.num_cols, csr.num_cols);
ASSERT_FALSE(coo.row_sorted);
ASSERT_FALSE(
isSparseCOO(omp_get_num_threads(), coo.num_rows, coo.row->shape[0]));
ASSERT_EQ(csr.num_rows, tcsr.num_rows);
ASSERT_EQ(csr.num_cols, tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(csr.indices, tcsr.indices));
coo = COO2<IDX>(ctx);
csr = CSR2<IDX>(ctx);
@@ -135,6 +175,7 @@ void _TestCOOToCSR(DLContext ctx) {
auto rs_coo = aten::COOSort(coo, false);
auto rs_csr = CSR1<IDX>(ctx);
auto rs_tcsr = aten::COOToCSR(rs_coo);
ASSERT_TRUE(rs_coo.row_sorted);
ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows);
ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(rs_csr.indptr, rs_tcsr.indptr));
@@ -173,6 +214,17 @@ void _TestCOOToCSR(DLContext ctx) {
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indptr, src_csr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indices, src_coo.col));
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.data, src_coo.data));
coo = SparseCOOCSR::COOSparse<IDX>(ctx);
csr = SparseCOOCSR::CSRSparse<IDX>(ctx);
tcsr = aten::COOToCSR(coo);
ASSERT_FALSE(coo.row_sorted);
ASSERT_TRUE(
isSparseCOO(omp_get_num_threads(), coo.num_rows, coo.row->shape[0]));
ASSERT_EQ(csr.num_rows, tcsr.num_rows);
ASSERT_EQ(csr.num_cols, tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(csr.indices, tcsr.indices));
}
TEST(SpmatTest, COOToCSR) {
......