[Feature] Add cuda support for Sparse Matrix multiplication, summation and masking (#2782)

* init cuda support * cuSPARSE err * passed unittest for csr_mm/SpGEMM. int64 not supported * Debugging cuSPARSE error 3 * csrgeam only supports int32? * disabling int64 for cuda * refactor and add CSRMask * lint * oops * remove todo * rewrite CSRMask with CSRGetData * lint * fix test * address comments * lint * fix * addresses comments and rename BUG_ON Co-authored-by: Israt Nisa <nisisrat@amazon.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-30-71.ec2.internal> Co-authored-by: Quan Gan <coin2028@hotmail.com> Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com> Co-authored-by: Minjie Wang <wmjlyjemaine@gmail.com>

[Feature] Add cuda support for Sparse Matrix multiplication, summation and masking (#2782)
* init cuda support * cuSPARSE err * passed unittest for csr_mm/SpGEMM. int64 not supported * Debugging cuSPARSE error 3 * csrgeam only supports int32? * disabling int64 for cuda * refactor and add CSRMask * lint * oops * remove todo * rewrite CSRMask with CSRGetData * lint * fix test * address comments * lint * fix * addresses comments and rename BUG_ON Co-authored-by: Israt Nisa <nisisrat@amazon.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-30-71.ec2.internal> Co-authored-by: Quan Gan <coin2028@hotmail.com> Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com> Co-authored-by: Minjie Wang <wmjlyjemaine@gmail.com>
ab2bd1f1 · Israt Nisa · GitHub · e18c2ab4 · ab2bd1f1 · ab2bd1f1
Unverified Commit ab2bd1f1 authored Apr 27, 2021 by Israt Nisa Committed by GitHub Apr 27, 2021
8 changed files
--- a/src/array/kernel_decl.h
+++ b/src/array/kernel_decl.h
@@ -96,7 +96,14 @@ void BackwardSegmentCmp(NDArray feat,
 /*!
 * \brief Sparse-sparse matrix multiplication
 *
- * \note B is transposed (i.e. in CSC format).
+ * \param A The left operand.
+ * \param A_weights The weights of matrix A as a 1D tensor.
+ * \param B The right operand.
+ * \param B_weights The weights of matrix B as a 1D tensor.
+ *
+ * \note The GPU implementation will cast the indices to 32-bit integers.
+ * \note Zero entries in the result are not removed.
+ * \note The CSR matrices should not have duplicate entries.
 */
 template <int XPU, typename IdType, typename DType>
 std::pair<CSRMatrix, NDArray> CSRMM(
@@ -107,18 +114,19 @@ std::pair<CSRMatrix, NDArray> CSRMM(

 /*!
 * \brief Sparse-sparse matrix summation.
+ *
+ * \param A The sparse matrices to sum, all of the same size.
+ * \param A_weights The weights of each sparse matrix as a 1D tensor.
+ *
+ * \note The GPU implementation will cast the indices to 32-bit integers.
+ * \note Zero entries in the result are not removed.
+ * \note The CSR matrices should not have duplicate entries.
 */
 template <int XPU, typename IdType, typename DType>
 std::pair<CSRMatrix, NDArray> CSRSum(
    const std::vector<CSRMatrix>& A,
    const std::vector<NDArray>& A_weights);

-/*!
- * \brief Return a sparse matrix with the values of A but nonzero entry locations of B.
- */
-template <int XPU, typename IdType, typename DType>
-NDArray CSRMask(const CSRMatrix& A, NDArray A_weights, const CSRMatrix& B);
-
 }  // namespace aten
 }  // namespace dgl


--- a/src/graph/graph_op.cc
+++ b/src/graph/graph_op.cc
@@ -737,12 +737,12 @@ IdArray MapIds(IdArray ids, IdArray range_starts, IdArray range_ends, IdArray ty
    IdType id = ids_data[i];
    auto it = std::lower_bound(range_end_data, range_end_data + num_ranges, id);
    // The range must exist.
-    BUG_ON(it != range_end_data + num_ranges);
+    BUG_IF_FAIL(it != range_end_data + num_ranges);
    size_t range_id = it - range_end_data;
    int type_id = range_id % num_types;
    types_data[i] = type_id;
    int part_id = range_id / num_types;
-    BUG_ON(part_id < num_parts);
+    BUG_IF_FAIL(part_id < num_parts);
    if (part_id == 0) {
      per_type_ids_data[i] = id - range_start_data[range_id];
    } else {

--- a/src/graph/immutable_graph.cc
+++ b/src/graph/immutable_graph.cc
@@ -327,7 +327,7 @@ std::pair<dgl_id_t, dgl_id_t> COO::FindEdge(dgl_id_t eid) const {

 EdgeArray COO::FindEdges(IdArray eids) const {
  CHECK(aten::IsValidIdArray(eids)) << "Invalid edge id array";
-  BUG_ON(aten::IsNullArray(adj_.data)) <<
+  BUG_IF_FAIL(aten::IsNullArray(adj_.data)) <<
    "FindEdges requires the internal COO matrix not having EIDs.";
  return EdgeArray{aten::IndexSelect(adj_.row, eids),
                   aten::IndexSelect(adj_.col, eids),

--- a/src/graph/unit_graph.cc
+++ b/src/graph/unit_graph.cc
@@ -235,7 +235,7 @@ class UnitGraph::COO : public BaseHeteroGraph {

  EdgeArray FindEdges(dgl_type_t etype, IdArray eids) const override {
    CHECK(aten::IsValidIdArray(eids)) << "Invalid edge id array";
-    BUG_ON(aten::IsNullArray(adj_.data)) <<
+    BUG_IF_FAIL(aten::IsNullArray(adj_.data)) <<
      "FindEdges requires the internal COO matrix not having EIDs.";
    return EdgeArray{aten::IndexSelect(adj_.row, eids),
                     aten::IndexSelect(adj_.col, eids),

--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -17,6 +17,14 @@
 extern "C" void NDArrayDLPackDeleter(DLManagedTensor* tensor);

 namespace dgl {
+
+constexpr DLDataType DLDataTypeTraits<int32_t>::dtype;
+constexpr DLDataType DLDataTypeTraits<int64_t>::dtype;
+constexpr DLDataType DLDataTypeTraits<uint32_t>::dtype;
+constexpr DLDataType DLDataTypeTraits<uint64_t>::dtype;
+constexpr DLDataType DLDataTypeTraits<float>::dtype;
+constexpr DLDataType DLDataTypeTraits<double>::dtype;
+
 namespace runtime {

 inline void VerifyDataType(DLDataType dtype) {
@@ -251,7 +259,7 @@ template<typename T>
 NDArray NDArray::FromVector(const std::vector<T>& vec, DLContext ctx) {
  const DLDataType dtype = DLDataTypeTraits<T>::dtype;
  int64_t size = static_cast<int64_t>(vec.size());
-  NDArray ret = NDArray::Empty({size}, dtype, DLContext{kDLCPU, 0});
+  NDArray ret = NDArray::Empty({size}, dtype, ctx);
  DeviceAPI::Get(ctx)->CopyDataFromTo(
      vec.data(),
      0,

--- a/tests/compute/test_csrmm.py
+++ b/tests/compute/test_csrmm.py
+import numpy as np
+import scipy.sparse as ssp
+import dgl
+from utils import parametrize_dtype
+import backend as F
+
+def _random_simple_graph(idtype, dtype, ctx, M, N, max_nnz, srctype, dsttype, etype):
+    src = np.random.randint(0, M, (max_nnz,))
+    dst = np.random.randint(0, N, (max_nnz,))
+    val = np.random.randn(max_nnz)
+    a = ssp.csr_matrix((val, (src, dst)), shape=(M, N))
+    a.sum_duplicates()
+    a = a.tocoo()
+    A = dgl.heterograph(
+        {('A', 'AB', 'B'): (
+            F.copy_to(F.tensor(a.row, dtype=idtype), ctx),
+            F.copy_to(F.tensor(a.col, dtype=idtype), ctx))},
+        num_nodes_dict={'A': a.shape[0], 'B': a.shape[1]})
+    A.edata['w'] = F.copy_to(F.tensor(a.data, dtype=dtype), ctx)
+    return a, A
+
+@parametrize_dtype
+def test_csrmm(idtype):
+    for dtype in [F.float32, F.float64]:
+        a, A = _random_simple_graph(idtype, dtype, F.ctx(), 500, 600, 9000, 'A', 'B', 'AB')
+        b, B = _random_simple_graph(idtype, dtype, F.ctx(), 600, 700, 9000, 'B', 'C', 'BC')
+        C, C_weights = dgl.sparse.csrmm(A._graph, A.edata['w'], B._graph, B.edata['w'], 2)
+        C_adj = C.adjacency_matrix_scipy(0, True, 'csr')
+        C_adj.data = F.asnumpy(C_weights)
+        C_adj = F.tensor(C_adj.todense(), dtype=dtype)
+        c = F.tensor((a * b).todense(), dtype=dtype)
+        assert F.allclose(C_adj, c)
+
+@parametrize_dtype
+def test_csrsum(idtype):
+    for dtype in [F.float32, F.float64]:
+        a, A = _random_simple_graph(idtype, dtype, F.ctx(), 500, 600, 9000, 'A', 'B', 'AB')
+        b, B = _random_simple_graph(idtype, dtype, F.ctx(), 500, 600, 9000, 'A', 'B', 'AB')
+        C, C_weights = dgl.sparse.csrsum([A._graph, B._graph], [A.edata['w'], B.edata['w']])
+        C_adj = C.adjacency_matrix_scipy(0, True, 'csr')
+        C_adj.data = F.asnumpy(C_weights)
+        C_adj = F.tensor(C_adj.todense(), dtype=dtype)
+        c = F.tensor((a + b).todense(), dtype=dtype)
+        assert F.allclose(C_adj, c)
+
+@parametrize_dtype
+def test_csrmask(idtype):
+    for dtype in [F.float32, F.float64]:
+        a, A = _random_simple_graph(idtype, dtype, F.ctx(), 500, 600, 9000, 'A', 'B', 'AB')
+        b, B = _random_simple_graph(idtype, dtype, F.ctx(), 500, 600, 9000, 'A', 'B', 'AB')
+        C = dgl.sparse.csrmask(A._graph, A.edata['w'], B._graph)
+        c = F.tensor(a.tocsr()[b != 0], dtype)
+        assert F.allclose(C, c)
+
+if __name__ == '__main__':
+    test_csrmm(F.int32)
+    test_csrmm(F.int64)
+    test_csrsum(F.int32)
+    test_csrsum(F.int64)
+    test_csrmask(F.int32)
+    test_csrmask(F.int64)
--- a/tests/compute/test_sparse.py
+++ b/tests/compute/test_sparse.py
@@ -283,6 +283,5 @@ def test_segment_reduce(reducer):
        assert F.allclose(grad1, grad2)
        print('backward passed')

-
 if __name__ == '__main__':
    test_spmm(F.int32, graphs[0], spmm_shapes[0], 'mul', 'sum')
--- a/tests/cpp/test_csrmm.cc
+++ b/tests/cpp/test_csrmm.cc
@@ -64,9 +64,10 @@ std::pair<aten::CSRMatrix, NDArray> CSR_A(DLContext ctx = CTX) {
  auto csr = aten::CSRMatrix(
      4, 5,
      NDArray::FromVector(std::vector<IdType>({0, 2, 4, 7, 8}), ctx),
-      NDArray::FromVector(std::vector<IdType>({2, 3, 2, 3, 0, 1, 3, 4}), ctx));
+      NDArray::FromVector(std::vector<IdType>({2, 3, 2, 3, 0, 1, 3, 4}), ctx),
+      NDArray::FromVector(std::vector<IdType>({1, 0, 2, 3, 4, 5, 6, 7}), ctx));
  auto weights = NDArray::FromVector(
-      std::vector<DType>({1.0, 0.7, 0.5, 0.0, 0.4, 0.7, 0.2, 0.2}), ctx);
+      std::vector<DType>({0.7, 1.0, 0.5, 0.0, 0.4, 0.7, 0.2, 0.2}), ctx);
  return {csr, weights};
 }

@@ -162,7 +163,8 @@ template <typename IdType, typename DType>
 void _TestCsrmask(DLContext ctx = CTX) {
  auto A = CSR_A<IdType, DType>(ctx);
  auto C = CSR_C<IdType, DType>(ctx);
-  auto A_mask_C = aten::CSRMask(A.first, A.second, C.first);
+  auto C_coo = CSRToCOO(C.first, false);
+  auto A_mask_C = aten::CSRGetData<DType>(A.first, C_coo.row, C_coo.col, A.second, 0);
  auto A_mask_C2 = CSR_A_mask_C<DType>(ctx);
  ASSERT_TRUE(ArrayEQ<DType>(A_mask_C, A_mask_C2));
 }