"git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "bf649d940c7316f7031bf723bdcbcf2da44ab7e7"
Unverified Commit fa5ff2fc authored by Minjie Wang's avatar Minjie Wang Committed by GitHub
Browse files

[Sparse] Add coalesce and has_duplicate to SparseMatrix (#5071)

* add coalesce() and has_duplicate()

* Add duplicate check in spspmm

* black

* fix

* clang-format

* lintrunner

* minor fix

* fix
parent 8c3e7830
...@@ -116,6 +116,23 @@ class SparseMatrix : public torch::CustomClassHolder { ...@@ -116,6 +116,23 @@ class SparseMatrix : public torch::CustomClassHolder {
*/ */
c10::intrusive_ptr<SparseMatrix> Transpose() const; c10::intrusive_ptr<SparseMatrix> Transpose() const;
/**
 * @brief Return a new coalesced matrix.
 *
 * A coalesced sparse matrix satisfies the following properties:
 * - the indices of the non-zero elements are unique,
 * - the indices are sorted in lexicographical order.
 *
 * Values at duplicate indices are accumulated by summation during
 * coalescing (delegated to torch's sparse-COO coalesce).
 *
 * @return A coalesced sparse matrix.
 */
c10::intrusive_ptr<SparseMatrix> Coalesce();
/**
 * @brief Return true if this sparse matrix contains duplicate indices.
 *
 * Non-const: the check runs on a CSR (preferred) or CSC representation
 * and may lazily materialize one if only the other format exists.
 *
 * @return A bool flag.
 */
bool HasDuplicate();
private: private:
/** @brief Create the COO format for the sparse matrix internally */ /** @brief Create the COO format for the sparse matrix internally */
void _CreateCOO(); void _CreateCOO();
......
...@@ -29,7 +29,9 @@ TORCH_LIBRARY(dgl_sparse, m) { ...@@ -29,7 +29,9 @@ TORCH_LIBRARY(dgl_sparse, m) {
.def("coo", &SparseMatrix::COOTensors) .def("coo", &SparseMatrix::COOTensors)
.def("csr", &SparseMatrix::CSRTensors) .def("csr", &SparseMatrix::CSRTensors)
.def("csc", &SparseMatrix::CSCTensors) .def("csc", &SparseMatrix::CSCTensors)
.def("transpose", &SparseMatrix::Transpose); .def("transpose", &SparseMatrix::Transpose)
.def("coalesce", &SparseMatrix::Coalesce)
.def("has_duplicate", &SparseMatrix::HasDuplicate);
m.def("create_from_coo", &CreateFromCOO) m.def("create_from_coo", &CreateFromCOO)
.def("create_from_csr", &CreateFromCSR) .def("create_from_csr", &CreateFromCSR)
.def("create_from_csc", &CreateFromCSC) .def("create_from_csc", &CreateFromCSC)
......
/**
* Copyright (c) 2022 by Contributors
* @file sparse_matrix_coalesce.cc
* @brief Operators related to sparse matrix coalescing.
*/
// clang-format off
#include <sparse/dgl_headers.h>
// clang-format on
#include <sparse/sparse_matrix.h>
#include "./utils.h"
namespace dgl {
namespace sparse {
c10::intrusive_ptr<SparseMatrix> SparseMatrix::Coalesce() {
  // Bridge to a torch sparse-COO tensor and let torch perform the
  // coalescing (dedup + lexicographic sort, duplicate values summed).
  const auto coalesced =
      COOToTorchCOO(this->COOPtr(), this->value()).coalesce();
  const auto idx = coalesced.indices();
  // idx is a 2 x nnz tensor: first row holds row ids, second holds columns.
  return CreateFromCOO(idx[0], idx[1], coalesced.values(), this->shape());
}
bool SparseMatrix::HasDuplicate() {
  // Pick the format to run the check on. Prefer an existing CSR; use the
  // CSC only when it is the sole materialized format (otherwise a CSR
  // would be created on demand).
  const bool only_csc = !HasCSR() && HasCSC();
  aten::CSRMatrix dgl_csr =
      only_csc ? CSRToOldDGLCSR(CSCPtr()) : CSRToOldDGLCSR(CSRPtr());
  return aten::CSRHasDuplicate(dgl_csr);
}
} // namespace sparse
} // namespace dgl
...@@ -32,17 +32,30 @@ void _SpSpMMSanityCheck( ...@@ -32,17 +32,30 @@ void _SpSpMMSanityCheck(
const c10::intrusive_ptr<SparseMatrix>& rhs_mat) { const c10::intrusive_ptr<SparseMatrix>& rhs_mat) {
const auto& lhs_shape = lhs_mat->shape(); const auto& lhs_shape = lhs_mat->shape();
const auto& rhs_shape = rhs_mat->shape(); const auto& rhs_shape = rhs_mat->shape();
CHECK_EQ(lhs_shape[1], rhs_shape[0]) TORCH_CHECK(
<< "SpSpMM: the second dim of lhs_mat should be equal to the first dim " lhs_shape[1] == rhs_shape[0],
"of the second matrix"; "SpSpMM: the second dim of lhs_mat should be equal to the first dim ",
CHECK_EQ(lhs_mat->value().dim(), 1) "of the second matrix");
<< "SpSpMM: the value shape of lhs_mat should be 1-D"; TORCH_CHECK(
CHECK_EQ(rhs_mat->value().dim(), 1) lhs_mat->value().dim() == 1,
<< "SpSpMM: the value shape of rhs_mat should be 1-D"; "SpSpMM: the value shape of lhs_mat should be 1-D");
CHECK_EQ(lhs_mat->device(), rhs_mat->device()) TORCH_CHECK(
<< "SpSpMM: lhs_mat and rhs_mat should on the same device"; rhs_mat->value().dim() == 1,
CHECK_EQ(lhs_mat->dtype(), rhs_mat->dtype()) "SpSpMM: the value shape of rhs_mat should be 1-D");
<< "SpSpMM: lhs_mat and rhs_mat should have the same dtype"; TORCH_CHECK(
lhs_mat->device() == rhs_mat->device(),
"SpSpMM: lhs_mat and rhs_mat should be on the same device");
TORCH_CHECK(
lhs_mat->dtype() == rhs_mat->dtype(),
"SpSpMM: lhs_mat and rhs_mat should have the same dtype");
TORCH_CHECK(
!lhs_mat->HasDuplicate(),
"SpSpMM does not support lhs_mat with duplicate indices. ",
"Call A = A.coalesce() to dedup first.");
TORCH_CHECK(
!rhs_mat->HasDuplicate(),
"SpSpMM does not support rhs_mat with duplicate indices. ",
"Call A = A.coalesce() to dedup first.");
} }
// Mask select value of `mat` by `sub_mat`. // Mask select value of `mat` by `sub_mat`.
......
...@@ -66,6 +66,28 @@ class SparseMatrix: ...@@ -66,6 +66,28 @@ class SparseMatrix:
""" """
return self.c_sparse_matrix.device() return self.c_sparse_matrix.device()
@property
def row(self) -> torch.Tensor:
    """Row indices of the nonzero elements.

    Returns
    -------
    tensor
        Row indices of the nonzero elements
    """
    coo_indices = self.coo()
    return coo_indices[0]
@property
def col(self) -> torch.Tensor:
    """Column indices of the nonzero elements.

    Returns
    -------
    tensor
        Column indices of the nonzero elements
    """
    coo_indices = self.coo()
    return coo_indices[1]
def indices( def indices(
self, fmt: str, return_shuffle=False self, fmt: str, return_shuffle=False
) -> Tuple[torch.Tensor, ...]: ) -> Tuple[torch.Tensor, ...]:
...@@ -173,6 +195,60 @@ class SparseMatrix: ...@@ -173,6 +195,60 @@ class SparseMatrix:
""" """
return SparseMatrix(self.c_sparse_matrix.transpose()) return SparseMatrix(self.c_sparse_matrix.transpose())
def coalesce(self):
    """Return a coalesced version of this sparse matrix.

    A coalesced sparse matrix satisfies the following properties:

      - the indices of the non-zero elements are unique,
      - the indices are sorted in lexicographical order.

    Non-zero values that share the same indices are accumulated by
    summation during coalescing.

    The function does not support autograd.

    Returns
    -------
    SparseMatrix
        The coalesced sparse matrix.

    Examples
    --------

    >>> row = torch.tensor([1, 0, 0, 0, 1])
    >>> col = torch.tensor([1, 1, 1, 2, 2])
    >>> val = torch.tensor([0, 1, 2, 3, 4])
    >>> A = create_from_coo(row, col, val)
    >>> A = A.coalesce()
    >>> print(A)
    SparseMatrix(indices=tensor([[0, 0, 1, 1],
            [1, 2, 1, 2]]),
    values=tensor([3, 3, 0, 4]),
    shape=(2, 3), nnz=4)
    """
    # Delegate to the C++ backend and wrap the result in a new Python
    # SparseMatrix.
    coalesced = self.c_sparse_matrix.coalesce()
    return SparseMatrix(coalesced)
def has_duplicate(self):
    """Check whether this sparse matrix contains duplicate indices.

    Returns
    -------
    bool
        True if this sparse matrix contains duplicate indices.

    Examples
    --------

    >>> row = torch.tensor([1, 0, 0, 0, 1])
    >>> col = torch.tensor([1, 1, 1, 2, 2])
    >>> val = torch.tensor([0, 1, 2, 3, 4])
    >>> A = create_from_coo(row, col, val)
    >>> print(A.has_duplicate())
    True
    >>> print(A.coalesce().has_duplicate())
    False
    """
    # The duplicate check is implemented in the C++ backend.
    return self.c_sparse_matrix.has_duplicate()
def create_from_coo( def create_from_coo(
row: torch.Tensor, row: torch.Tensor,
......
...@@ -4,7 +4,7 @@ import backend as F ...@@ -4,7 +4,7 @@ import backend as F
import pytest import pytest
import torch import torch
from dgl.mock_sparse2 import val_like from dgl.mock_sparse2 import create_from_coo, val_like
from .utils import ( from .utils import (
clone_detach_and_grad, clone_detach_and_grad,
...@@ -59,9 +59,7 @@ def test_spmm(create_func, shape, nnz, out_dim): ...@@ -59,9 +59,7 @@ def test_spmm(create_func, shape, nnz, out_dim):
@pytest.mark.parametrize("shape_k", [3, 4]) @pytest.mark.parametrize("shape_k", [3, 4])
@pytest.mark.parametrize("nnz1", [1, 10]) @pytest.mark.parametrize("nnz1", [1, 10])
@pytest.mark.parametrize("nnz2", [1, 10]) @pytest.mark.parametrize("nnz2", [1, 10])
def test_sparse_sparse_mm( def test_spspmm(create_func1, create_func2, shape_n_m, shape_k, nnz1, nnz2):
create_func1, create_func2, shape_n_m, shape_k, nnz1, nnz2
):
dev = F.ctx() dev = F.ctx()
shape1 = shape_n_m shape1 = shape_n_m
shape2 = (shape_n_m[1], shape_k) shape2 = (shape_n_m[1], shape_k)
...@@ -89,3 +87,33 @@ def test_sparse_sparse_mm( ...@@ -89,3 +87,33 @@ def test_sparse_sparse_mm(
torch_A2.grad.to_dense(), torch_A2.grad.to_dense(),
atol=1e-05, atol=1e-05,
) )
def test_spspmm_duplicate():
    """SpSpMM must reject operands that contain duplicate indices."""
    dev = F.ctx()
    shape = (4, 4)

    # A1 has duplicate entries at (0, 1).
    row = torch.tensor([1, 0, 0, 0, 1]).to(dev)
    col = torch.tensor([1, 1, 1, 2, 2]).to(dev)
    val = torch.randn(len(row)).to(dev)
    A1 = create_from_coo(row, col, val, shape)

    # A2 is duplicate-free.
    row = torch.tensor([1, 0, 0, 1]).to(dev)
    col = torch.tensor([1, 1, 2, 2]).to(dev)
    val = torch.randn(len(row)).to(dev)
    A2 = create_from_coo(row, col, val, shape)

    # Multiplication must fail whichever side holds the duplicates.
    # pytest.raises replaces the try/except/else pattern of the original,
    # which used a bare `except:` that would also swallow unrelated
    # exceptions such as KeyboardInterrupt/SystemExit.
    with pytest.raises(Exception):
        A1 @ A2
    with pytest.raises(Exception):
        A2 @ A1
import pytest
import torch
import sys import sys
import backend as F import backend as F
import pytest
import torch
from dgl.mock_sparse2 import create_from_coo, create_from_csr, create_from_csc, val_like from dgl.mock_sparse2 import (
create_from_coo,
create_from_csc,
create_from_csr,
val_like,
)
# TODO(#4818): Skipping tests on win. # TODO(#4818): Skipping tests on win.
if not sys.platform.startswith("linux"): if not sys.platform.startswith("linux"):
...@@ -337,6 +342,7 @@ def test_csr_to_csc(dense_dim, indptr, indices, shape): ...@@ -337,6 +342,7 @@ def test_csr_to_csc(dense_dim, indptr, indices, shape):
assert torch.allclose(mat_indptr, indptr) assert torch.allclose(mat_indptr, indptr)
assert torch.allclose(mat_indices, indices) assert torch.allclose(mat_indices, indices)
@pytest.mark.parametrize("val_shape", [(3), (3, 2)]) @pytest.mark.parametrize("val_shape", [(3), (3, 2)])
@pytest.mark.parametrize("shape", [(3, 5), (5, 5)]) @pytest.mark.parametrize("shape", [(3, 5), (5, 5)])
def test_val_like(val_shape, shape): def test_val_like(val_shape, shape):
...@@ -368,3 +374,47 @@ def test_val_like(val_shape, shape): ...@@ -368,3 +374,47 @@ def test_val_like(val_shape, shape):
csc_A = create_from_csc(indptr, indices, val, shape) csc_A = create_from_csc(indptr, indices, val, shape)
csc_B = val_like(csc_A, new_val) csc_B = val_like(csc_A, new_val)
check_val_like(csc_A, csc_B) check_val_like(csc_A, csc_B)
def test_coalesce():
    """Coalescing dedups/sorts indices and sums duplicate values."""
    ctx = F.ctx()
    row = torch.tensor([1, 0, 0, 0, 1]).to(ctx)
    col = torch.tensor([1, 1, 1, 2, 2]).to(ctx)
    val = torch.arange(len(row)).to(ctx)
    A = create_from_coo(row, col, val, (4, 4))
    assert A.has_duplicate()

    A_coalesced = A.coalesce()
    assert A_coalesced.nnz == 4
    assert A_coalesced.shape == (4, 4)
    # Compare via .tolist(): it moves data to the host and yields plain
    # Python ints, giving an exact comparison on any device. The original
    # `list(tensor) == [ints]` relied on implicit truth-testing of 0-d
    # tensors, which is fragile.
    assert A_coalesced.row.tolist() == [0, 0, 1, 1]
    assert A_coalesced.col.tolist() == [1, 2, 1, 2]
    # Values of duplicate indices are added together: (0, 1) got 1 + 2.
    assert A_coalesced.val.tolist() == [3, 3, 0, 4]
    assert not A_coalesced.has_duplicate()
def test_has_duplicate():
    """has_duplicate() detects duplicates in every storage format."""
    ctx = F.ctx()
    row = torch.tensor([1, 0, 0, 0, 1]).to(ctx)
    col = torch.tensor([1, 1, 1, 2, 2]).to(ctx)
    val = torch.arange(len(row)).to(ctx)
    shape = (4, 4)

    coo_A = create_from_coo(row, col, val, shape)
    csr_indptr, csr_indices, _ = coo_A.csr()
    csc_indptr, csc_indices, _ = coo_A.csc()

    # The same matrix built from COO, CSR and CSC must all report
    # duplicate indices.
    matrices = [
        coo_A,
        create_from_csr(csr_indptr, csr_indices, val, shape),
        create_from_csc(csc_indptr, csc_indices, val, shape),
    ]
    for mat in matrices:
        assert mat.has_duplicate()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment