Unverified Commit 0f1bcd99 authored by Mufei Li, committed by GitHub

[Sparse] Softmax based on Reduction and BroadcastOp (#5067)



* Update

* update

* Update

* Update

* Update

* format

* Update

* lint

* Update

* lint

* CI

* CI

* Update

* address comments
Co-authored-by: Ubuntu <ubuntu@ip-172-31-36-188.ap-northeast-1.compute.internal>
parent 311bd88a
/**
* Copyright (c) 2022 by Contributors
* @file sparse/softmax.h
* @brief DGL C++ Softmax operator
*/
#ifndef SPARSE_SOFTMAX_H_
#define SPARSE_SOFTMAX_H_
#include <sparse/sparse_matrix.h>
namespace dgl {
namespace sparse {
/**
* @brief Apply row-wise softmax to the non-zero entries of the sparse matrix.
*
* This function supports autograd for the sparse matrix, but it does not
* support higher-order gradients.
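*
* For a row i with nonzero values x_ij, the output value at (i, j) is
* exp(x_ij) / sum_k exp(x_ik), computed independently per feature dimension.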
*
* @param sparse_mat The sparse matrix
*
* @return The output sparse matrix with softmax applied to its non-zero values
*/
c10::intrusive_ptr<SparseMatrix> Softmax(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat);
} // namespace sparse
} // namespace dgl
#endif // SPARSE_SOFTMAX_H_
......@@ -98,6 +98,54 @@ torch::Tensor SDDMMNoAutoGrad(
return ret;
}
torch::Tensor BroadcastOpNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat, torch::Tensor dense_mat,
const std::string& op) {
auto sparse_val = sparse_mat->value();
const int64_t out_row = sparse_mat->nnz();
const std::vector<int64_t> shape({out_row, sparse_val.size(1)});
auto ret = torch::zeros(shape, sparse_val.options());
auto dgl_sparse_val = TorchTensorToDGLArray(sparse_val);
auto dgl_dense_mat = TorchTensorToDGLArray(dense_mat);
auto dgl_ret = TorchTensorToDGLArray(ret);
// The format used for computation is chosen in the following order: COO,
// then CSR. COO is created if the sparse matrix only has the CSC format.
if (sparse_mat->HasCOO() || !sparse_mat->HasCSR()) {
// sparse_mat->COOPtr() will implicitly convert CSC to COO format if COO
// does not exist.
auto coo = COOToOldDGLCOO(sparse_mat->COOPtr());
aten::COOSDDMM(
op.c_str(), coo, dgl_sparse_val, dgl_dense_mat, dgl_ret,
1 /* Lhs target: e */, 0 /* rhs target: u due to transpose */);
} else {
auto csr = CSRToOldDGLCSR(sparse_mat->CSRPtr());
aten::CSRSDDMM(
op.c_str(), csr, dgl_sparse_val, dgl_dense_mat, dgl_ret,
1 /* Lhs target: e */, 0 /* rhs target: u due to transpose */);
}
return ret;
}
torch::Tensor BroadcastSubNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat) {
return BroadcastOpNoAutoGrad(sparse_mat, dense_mat, "sub");
}
torch::Tensor BroadcastDivNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat) {
return BroadcastOpNoAutoGrad(sparse_mat, dense_mat, "div");
}
torch::Tensor BroadcastMulNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat) {
return BroadcastOpNoAutoGrad(sparse_mat, dense_mat, "mul");
}
c10::intrusive_ptr<SparseMatrix> SpSpMMNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& lhs_mat, torch::Tensor lhs_val,
const c10::intrusive_ptr<SparseMatrix>& rhs_mat, torch::Tensor rhs_val,
......
......@@ -9,6 +9,8 @@
#include <sparse/sparse_matrix.h>
#include <torch/script.h>
#include <string>
namespace dgl {
namespace sparse {
......@@ -53,6 +55,71 @@ torch::Tensor SDDMMNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat, torch::Tensor mat1,
torch::Tensor mat2_tr);
/**
* @brief Broadcast the dense feature to the nonzero entries and then compute
* x_e = \phi(x_e, x_v), where x_e is a nonzero value, x_v is the dense feature
* of the row that the nonzero entry belongs to, and \phi is add, sub, mul, or
* div.
*
* This function does not take care of autograd.
*
* @param sparse_mat The sparse matrix with N rows and (nnz, D) nonzero values
* @param dense_mat Dense feature of shape (N, D)
* @param op Operator, can be add, sub, mul, or div
*
* @return Dense tensor of shape (nnz, D)
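*
* Conceptually (a sketch of the semantics, not the actual kernel), for a COO
* layout where row[k] is the row index of the k-th nonzero:
*   ret[k] = \phi(sparse_val[k], dense_mat[row[k]])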
*/
torch::Tensor BroadcastOpNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat, torch::Tensor dense_mat,
const std::string& op);
/**
* @brief Broadcast the dense feature to the nonzero entries and then compute
* x_e = x_e - x_v, where x_e is the nonzero value, x_v is the dense
* feature.
*
* This function does not take care of autograd.
*
* @param sparse_mat The sparse matrix with N rows and (nnz, D) nonzero values
* @param dense_mat Dense feature of shape (N, D)
*
* @return Dense tensor of shape (nnz, D)
*/
torch::Tensor BroadcastSubNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat);
/**
* @brief Broadcast the dense feature to the nonzero entries and then compute
* x_e = x_e / x_v, where x_e is the nonzero value, x_v is the dense
* feature.
*
* This function does not take care of autograd.
*
* @param sparse_mat The sparse matrix with N rows and (nnz, D) nonzero values
* @param dense_mat Dense feature of shape (N, D)
*
* @return Dense tensor of shape (nnz, D)
*/
torch::Tensor BroadcastDivNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat);
/**
* @brief Broadcast the dense feature to the nonzero entries and then compute
* x_e = x_e * x_v, where x_e is the nonzero value, x_v is the dense
* feature.
*
* This function does not take care of autograd.
*
* @param sparse_mat The sparse matrix with N rows and (nnz, D) nonzero values
* @param dense_mat Dense feature of shape (N, D)
*
* @return Dense tensor of shape (nnz, D)
*/
torch::Tensor BroadcastMulNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat);
/**
* @brief Perform a sparse-sparse matrix multiplication with possibly different
* sparsities. The two sparse values must have 1-dimensional values. If the
......
......@@ -10,6 +10,7 @@
#include <sparse/elementwise_op.h>
#include <sparse/reduction.h>
#include <sparse/sddmm.h>
#include <sparse/softmax.h>
#include <sparse/sparse_matrix.h>
#include <sparse/spmm.h>
#include <sparse/spspmm.h>
......@@ -42,6 +43,7 @@ TORCH_LIBRARY(dgl_sparse, m) {
.def("val_like", &CreateValLike)
.def("spmm", &SpMM)
.def("sddmm", &SDDMM)
.def("softmax", &Softmax)
.def("spspmm", &SpSpMM);
}
......
/**
* Copyright (c) 2022 by Contributors
* @file softmax.cc
* @brief DGL C++ Softmax operator implementation
*/
#include <sparse/reduction.h>
#include <sparse/sparse_matrix.h>
#include <torch/script.h>
#include "./matmul.h"
#include "./utils.h"
namespace dgl {
namespace sparse {
using namespace torch::autograd;
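// Softmax is composed from the existing reduction and broadcast primitives:
// the forward pass subtracts the row-wise max, exponentiates, and normalizes
// by the row-wise sum; the backward pass reuses the cached softmax scores.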
class SoftmaxAutoGrad : public Function<SoftmaxAutoGrad> {
public:
static torch::Tensor forward(
AutogradContext* ctx, c10::intrusive_ptr<SparseMatrix> sparse_mat,
torch::Tensor sparse_val);
static tensor_list backward(AutogradContext* ctx, tensor_list grad_outputs);
};
torch::Tensor SoftmaxAutoGrad::forward(
AutogradContext* ctx, c10::intrusive_ptr<SparseMatrix> sparse_mat,
torch::Tensor sparse_val) {
// Compute the max of each row's nonzero values (reduce along dim 1) for
// numerical stability.
auto sparse_val_max = ReduceMax(sparse_mat, 1);
auto sparse_val_exp =
BroadcastSubNoAutoGrad(sparse_mat, sparse_val_max).exp();
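// Sum the exponentiated values over each row, then normalize every nonzero
// by its row sum to obtain the softmax scores.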
auto sparse_val_sum = ReduceSum(CreateValLike(sparse_mat, sparse_val_exp), 1);
auto sparse_score = BroadcastDivNoAutoGrad(
CreateValLike(sparse_mat, sparse_val_exp), sparse_val_sum);
const bool sparse_requires_grad = sparse_val.requires_grad();
torch::Tensor cache_sparse_score;
if (sparse_requires_grad) {
cache_sparse_score = sparse_score;
}
ctx->saved_data["sparse_matrix"] = sparse_mat;
ctx->saved_data["sparse_requires_grad"] = sparse_requires_grad;
ctx->save_for_backward({cache_sparse_score});
return sparse_score;
}
tensor_list SoftmaxAutoGrad::backward(
AutogradContext* ctx, tensor_list grad_outputs) {
auto saved = ctx->get_saved_variables();
auto sparse_score = saved[0];
auto output_grad = grad_outputs[0];
auto sparse_mat =
ctx->saved_data["sparse_matrix"].toCustomClass<SparseMatrix>();
const bool sparse_requires_grad =
ctx->saved_data["sparse_requires_grad"].toBool();
torch::Tensor sparse_val_grad;
if (sparse_requires_grad) {
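// Row-wise softmax gradient: with s = softmax(x) and upstream gradient g,
// dL/dx = s * g - s * rowsum(s * g).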
auto sds = sparse_score * output_grad;
auto accum = ReduceSum(CreateValLike(sparse_mat, sds), 1);
sparse_val_grad = sds - BroadcastMulNoAutoGrad(
CreateValLike(sparse_mat, sparse_score), accum);
}
return {torch::Tensor(), sparse_val_grad};
}
c10::intrusive_ptr<SparseMatrix> Softmax(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat) {
auto sparse_val = sparse_mat->value();
bool expand_dim = false;
auto new_sparse_mat = sparse_mat;
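// Promote 1-D nonzero values to shape (nnz, 1) so the autograd kernel always
// sees 2-D values; the original shape is restored after the computation.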
if (sparse_val.dim() == 1) {
sparse_val = sparse_val.view({-1, 1});
expand_dim = true;
new_sparse_mat = CreateValLike(sparse_mat, sparse_val);
}
auto new_sparse_val = SoftmaxAutoGrad::apply(new_sparse_mat, sparse_val);
if (expand_dim) {
new_sparse_val = new_sparse_val.view(-1);
}
return CreateValLike(sparse_mat, new_sparse_val);
}
} // namespace sparse
} // namespace dgl
......@@ -12,6 +12,7 @@ from .elementwise_op_sp import *
from .matmul import *
from .reduction import * # pylint: disable=W0622
from .sddmm import *
from .softmax import *
from .sparse_matrix import *
from .unary_op_diag import *
from .unary_op_sp import *
......
"""Softmax op for SparseMatrix"""
# pylint: disable=invalid-name
import torch
from .sparse_matrix import SparseMatrix
__all__ = ["softmax"]
def softmax(A: SparseMatrix) -> SparseMatrix:
"""Apply row-wise softmax to the non-zero entries of the sparse matrix.
If :attr:`A.val` has shape :attr:`(nnz, D)`, the output matrix :attr:`A'` has
the same shape as :attr:`A` and :attr:`A'.val` has the same shape as
:attr:`A.val`. The softmax is applied to each column independently, i.e.,
:attr:`A'.val[:, i]` is computed from :attr:`A.val[:, i]`.
Parameters
----------
A : SparseMatrix
The input sparse matrix
Returns
-------
SparseMatrix
The output sparse matrix
Examples
--------
Case 1: matrix with values of shape (nnz)
>>> row = torch.tensor([0, 0, 1, 2])
>>> col = torch.tensor([1, 2, 2, 0])
>>> nnz = len(row)
>>> val = torch.arange(nnz).float()
>>> A = create_from_coo(row, col, val)
>>> softmax(A)
SparseMatrix(indices=tensor([[0, 0, 1, 2],
[1, 2, 2, 0]]),
values=tensor([0.2689, 0.7311, 1.0000, 1.0000]),
shape=(3, 3), nnz=4)
Case 2: matrix with values of shape (nnz, D)
>>> val = torch.tensor([[0., 7.], [1., 3.], [2., 2.], [3., 1.]])
>>> A = create_from_coo(row, col, val)
>>> softmax(A)
SparseMatrix(indices=tensor([[0, 0, 1, 2],
[1, 2, 2, 0]]),
values=tensor([[0.2689, 0.9820],
[0.7311, 0.0180],
[1.0000, 1.0000],
[1.0000, 1.0000]]),
shape=(3, 3), nnz=4)
"""
return SparseMatrix(torch.ops.dgl_sparse.softmax(A.c_sparse_matrix))
import sys
import backend as F
import dgl
import pytest
import torch
from dgl.mock_sparse2 import create_from_coo, softmax
# TODO(#4818): Skipping tests on win.
if not sys.platform.startswith("linux"):
pytest.skip("skipping tests on win", allow_module_level=True)
@pytest.mark.parametrize("val_D", [None, 2])
@pytest.mark.parametrize("csr", [True, False])
def test_softmax(val_D, csr):
dev = F.ctx()
row = torch.tensor([0, 0, 1, 1]).to(dev)
col = torch.tensor([0, 2, 1, 2]).to(dev)
nnz = len(row)
if val_D is None:
val = torch.randn(nnz).to(dev)
else:
val = torch.randn(nnz, val_D).to(dev)
val_sparse = val.clone().requires_grad_()
A = create_from_coo(row, col, val_sparse)
if csr:
# Materialize the CSR format so that the CSR code path is exercised.
A.csr()
A_max = softmax(A)
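# Reference implementation: dgl.nn.functional.edge_softmax normalizes over the
# incoming edges of each destination node, so the graph is built with
# (src=col, dst=row) to match the row-wise softmax above.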
g = dgl.graph((col, row), num_nodes=max(A.shape))
val_g = val.clone().requires_grad_()
score = dgl.nn.functional.edge_softmax(g, val_g)
assert torch.allclose(A_max.val, score)
grad = torch.randn_like(score).to(dev)
A_max.val.backward(grad)
score.backward(grad)
assert torch.allclose(A.val.grad, val_g.grad)
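
# A minimal extra sanity check (illustrative sketch, assuming only the
# create_from_coo and softmax APIs imported above): compare the sparse softmax
# against a dense torch.softmax reference with structural zeros masked to -inf.
def test_softmax_dense_reference():
    dev = F.ctx()
    row = torch.tensor([0, 0, 1, 2]).to(dev)
    col = torch.tensor([1, 2, 2, 0]).to(dev)
    val = torch.arange(len(row), dtype=torch.float32, device=dev)
    A = softmax(create_from_coo(row, col, val))

    # Dense reference: scatter the nonzero values into a dense matrix, mask the
    # remaining entries with -inf, and apply a row-wise dense softmax.
    dense = torch.full((3, 3), float("-inf"), device=dev)
    dense[row, col] = val
    ref = torch.softmax(dense, dim=1)
    assert torch.allclose(A.val, ref[row, col])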