Unverified Commit 0f1bcd99 authored by Mufei Li, committed by GitHub

[Sparse] Softmax based on Reduction and BroadcastOp (#5067)



* Update

* update

* Update

* Update

* Update

* format

* Update

* lint

* Update

* lint

* CI

* CI

* Update

* address comments
Co-authored-by: Ubuntu <ubuntu@ip-172-31-36-188.ap-northeast-1.compute.internal>
parent 311bd88a
/**
* Copyright (c) 2022 by Contributors
* @file sparse/softmax.h
* @brief DGL C++ Softmax operator
*/
#ifndef SPARSE_SOFTMAX_H_
#define SPARSE_SOFTMAX_H_
#include <sparse/sparse_matrix.h>
namespace dgl {
namespace sparse {
/**
* @brief Apply row-wise softmax to the non-zero entries of the sparse matrix.
*
* This function supports autograd for the sparse matrix, but it does not
* support higher-order gradients.
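*
* For a row i with nonzero values x_ij, the output value at (i, j) is
* exp(x_ij) / sum_k exp(x_ik), computed independently per feature dimension.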
*
* @param sparse_mat The sparse matrix
*
* @return The output sparse matrix with softmax applied to its non-zero values
*/
c10::intrusive_ptr<SparseMatrix> Softmax(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat);
} // namespace sparse
} // namespace dgl
#endif // SPARSE_SOFTMAX_H_
......@@ -98,6 +98,54 @@ torch::Tensor SDDMMNoAutoGrad(
return ret;
}
torch::Tensor BroadcastOpNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat, torch::Tensor dense_mat,
const std::string& op) {
auto sparse_val = sparse_mat->value();
const int64_t out_row = sparse_mat->nnz();
const std::vector<int64_t> shape({out_row, sparse_val.size(1)});
auto ret = torch::zeros(shape, sparse_val.options());
auto dgl_sparse_val = TorchTensorToDGLArray(sparse_val);
auto dgl_dense_mat = TorchTensorToDGLArray(dense_mat);
auto dgl_ret = TorchTensorToDGLArray(ret);
// The format used for computation is chosen in the following order: COO,
// then CSR. COO is created if the sparse matrix only has the CSC format.
if (sparse_mat->HasCOO() || !sparse_mat->HasCSR()) {
// sparse_mat->COOPtr() will implicitly convert CSC to COO format if COO
// does not exist.
auto coo = COOToOldDGLCOO(sparse_mat->COOPtr());
aten::COOSDDMM(
op.c_str(), coo, dgl_sparse_val, dgl_dense_mat, dgl_ret,
1 /* Lhs target: e */, 0 /* rhs target: u due to transpose */);
} else {
auto csr = CSRToOldDGLCSR(sparse_mat->CSRPtr());
aten::CSRSDDMM(
op.c_str(), csr, dgl_sparse_val, dgl_dense_mat, dgl_ret,
1 /* Lhs target: e */, 0 /* rhs target: u due to transpose */);
}
return ret;
}
torch::Tensor BroadcastSubNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat) {
return BroadcastOpNoAutoGrad(sparse_mat, dense_mat, "sub");
}
torch::Tensor BroadcastDivNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat) {
return BroadcastOpNoAutoGrad(sparse_mat, dense_mat, "div");
}
torch::Tensor BroadcastMulNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat) {
return BroadcastOpNoAutoGrad(sparse_mat, dense_mat, "mul");
}
c10::intrusive_ptr<SparseMatrix> SpSpMMNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& lhs_mat, torch::Tensor lhs_val,
const c10::intrusive_ptr<SparseMatrix>& rhs_mat, torch::Tensor rhs_val,
......
......@@ -9,6 +9,8 @@
#include <sparse/sparse_matrix.h>
#include <torch/script.h>
#include <string>
namespace dgl {
namespace sparse {
......@@ -53,6 +55,71 @@ torch::Tensor SDDMMNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat, torch::Tensor mat1,
torch::Tensor mat2_tr);
/**
* @brief Broadcast the dense feature to the nonzero entries and then compute
* x_e = \phi(x_e, x_v), where x_e is a nonzero value, x_v is the dense feature
* of the row that the nonzero entry belongs to, and \phi is add, sub, mul, or
* div.
*
* This function does not take care of autograd.
*
* @param sparse_mat The sparse matrix with N rows and (nnz, D) nonzero values
* @param dense_mat Dense feature of shape (N, D)
* @param op Operator, can be add, sub, mul, or div
*
* @return Dense tensor of shape (nnz, D)
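*
* Conceptually (a sketch of the semantics, not the actual kernel), for a COO
* layout where row[k] is the row index of the k-th nonzero:
*   ret[k] = \phi(sparse_val[k], dense_mat[row[k]])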
*/
torch::Tensor BroadcastOpNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat, torch::Tensor dense_mat,
const std::string& op);
/**
* @brief Broadcast the dense feature to the nonzero entries and then compute
* x_e = x_e - x_v, where x_e is the nonzero value, x_v is the dense
* feature.
*
* This function does not take care of autograd.
*
* @param sparse_mat The sparse matrix with N rows and (nnz, D) nonzero values
* @param dense_mat Dense feature of shape (N, D)
*
* @return Dense tensor of shape (nnz, D)
*/
torch::Tensor BroadcastSubNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat);
/**
* @brief Broadcast the dense feature to the nonzero entries and then compute
* x_e = x_e / x_v, where x_e is the nonzero value, x_v is the dense
* feature.
*
* This function does not take care of autograd.
*
* @param sparse_mat The sparse matrix with N rows and (nnz, D) nonzero values
* @param dense_mat Dense feature of shape (N, D)
*
* @return Dense tensor of shape (nnz, D)
*/
torch::Tensor BroadcastDivNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat);
/**
* @brief Broadcast the dense feature to the nonzero entries and then compute
* x_e = x_e * x_v, where x_e is the nonzero value, x_v is the dense
* feature.
*
* This function does not take care of autograd.
*
* @param sparse_mat The sparse matrix with N rows and (nnz, D) nonzero values
* @param dense_mat Dense feature of shape (N, D)
*
* @return Dense tensor of shape (nnz, D)
*/
torch::Tensor BroadcastMulNoAutoGrad(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat,
torch::Tensor dense_mat);
/**
* @brief Perform a sparse-sparse matrix multiplication with possibly different
* sparsities. The two sparse values must have 1-dimensional values. If the
......
......@@ -10,6 +10,7 @@
#include <sparse/elementwise_op.h>
#include <sparse/reduction.h>
#include <sparse/sddmm.h>
#include <sparse/softmax.h>
#include <sparse/sparse_matrix.h>
#include <sparse/spmm.h>
#include <sparse/spspmm.h>
......@@ -42,6 +43,7 @@ TORCH_LIBRARY(dgl_sparse, m) {
.def("val_like", &CreateValLike)
.def("spmm", &SpMM)
.def("sddmm", &SDDMM)
.def("softmax", &Softmax)
.def("spspmm", &SpSpMM);
}
......
/**
* Copyright (c) 2022 by Contributors
* @file softmax.cc
* @brief DGL C++ Softmax operator implementation
*/
#include <sparse/reduction.h>
#include <sparse/sparse_matrix.h>
#include <torch/script.h>
#include "./matmul.h"
#include "./utils.h"
namespace dgl {
namespace sparse {
using namespace torch::autograd;
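// Softmax is composed from the existing reduction and broadcast primitives:
// the forward pass subtracts the row-wise max, exponentiates, and normalizes
// by the row-wise sum; the backward pass reuses the cached softmax scores.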
class SoftmaxAutoGrad : public Function<SoftmaxAutoGrad> {
public:
static torch::Tensor forward(
AutogradContext* ctx, c10::intrusive_ptr<SparseMatrix> sparse_mat,
torch::Tensor sparse_val);
static tensor_list backward(AutogradContext* ctx, tensor_list grad_outputs);
};
torch::Tensor SoftmaxAutoGrad::forward(
AutogradContext* ctx, c10::intrusive_ptr<SparseMatrix> sparse_mat,
torch::Tensor sparse_val) {
// Compute the max of each row's nonzero values (reduce along dim 1) for
// numerical stability.
auto sparse_val_max = ReduceMax(sparse_mat, 1);
auto sparse_val_exp =
BroadcastSubNoAutoGrad(sparse_mat, sparse_val_max).exp();
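// Sum the exponentiated values over each row, then normalize every nonzero
// by its row sum to obtain the softmax scores.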
auto sparse_val_sum = ReduceSum(CreateValLike(sparse_mat, sparse_val_exp), 1);
auto sparse_score = BroadcastDivNoAutoGrad(
CreateValLike(sparse_mat, sparse_val_exp), sparse_val_sum);
const bool sparse_requires_grad = sparse_val.requires_grad();
torch::Tensor cache_sparse_score;
if (sparse_requires_grad) {
cache_sparse_score = sparse_score;
}
ctx->saved_data["sparse_matrix"] = sparse_mat;
ctx->saved_data["sparse_requires_grad"] = sparse_requires_grad;
ctx->save_for_backward({cache_sparse_score});
return sparse_score;
}
tensor_list SoftmaxAutoGrad::backward(
AutogradContext* ctx, tensor_list grad_outputs) {
auto saved = ctx->get_saved_variables();
auto sparse_score = saved[0];
auto output_grad = grad_outputs[0];
auto sparse_mat =
ctx->saved_data["sparse_matrix"].toCustomClass<SparseMatrix>();
const bool sparse_requires_grad =
ctx->saved_data["sparse_requires_grad"].toBool();
torch::Tensor sparse_val_grad;
if (sparse_requires_grad) {
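// Row-wise softmax gradient: with s = softmax(x) and upstream gradient g,
// dL/dx = s * g - s * rowsum(s * g).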
auto sds = sparse_score * output_grad;
auto accum = ReduceSum(CreateValLike(sparse_mat, sds), 1);
sparse_val_grad = sds - BroadcastMulNoAutoGrad(
CreateValLike(sparse_mat, sparse_score), accum);
}
return {torch::Tensor(), sparse_val_grad};
}
c10::intrusive_ptr<SparseMatrix> Softmax(
const c10::intrusive_ptr<SparseMatrix>& sparse_mat) {
auto sparse_val = sparse_mat->value();
bool expand_dim = false;
auto new_sparse_mat = sparse_mat;
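// Promote 1-D nonzero values to shape (nnz, 1) so the autograd kernel always
// sees 2-D values; the original shape is restored after the computation.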
if (sparse_val.dim() == 1) {
sparse_val = sparse_val.view({-1, 1});
expand_dim = true;
new_sparse_mat = CreateValLike(sparse_mat, sparse_val);
}
auto new_sparse_val = SoftmaxAutoGrad::apply(new_sparse_mat, sparse_val);
if (expand_dim) {
new_sparse_val = new_sparse_val.view(-1);
}
return CreateValLike(sparse_mat, new_sparse_val);
}
} // namespace sparse
} // namespace dgl
......@@ -12,6 +12,7 @@ from .elementwise_op_sp import *
from .matmul import *
from .reduction import * # pylint: disable=W0622
from .sddmm import *
from .softmax import *
from .sparse_matrix import *
from .unary_op_diag import *
from .unary_op_sp import *
......
"""Softmax op for SparseMatrix"""
# pylint: disable=invalid-name
import torch
from .sparse_matrix import SparseMatrix
__all__ = ["softmax"]
def softmax(A: SparseMatrix) -> SparseMatrix:
"""Apply row-wise softmax to the non-zero entries of the sparse matrix.
If :attr:`A.val` has shape :attr:`(nnz, D)`, the output matrix :attr:`A'` has
the same shape as :attr:`A` and :attr:`A'.val` has the same shape as
:attr:`A.val`. The softmax is applied to each column independently, i.e.,
:attr:`A'.val[:, i]` is computed from :attr:`A.val[:, i]`.
Parameters
----------
A : SparseMatrix
The input sparse matrix
Returns
-------
SparseMatrix
The output sparse matrix
Examples
--------
Case 1: matrix with values of shape (nnz)
>>> row = torch.tensor([0, 0, 1, 2])
>>> col = torch.tensor([1, 2, 2, 0])
>>> nnz = len(row)
>>> val = torch.arange(nnz).float()
>>> A = create_from_coo(row, col, val)
>>> softmax(A)
SparseMatrix(indices=tensor([[0, 0, 1, 2],
[1, 2, 2, 0]]),
values=tensor([0.2689, 0.7311, 1.0000, 1.0000]),
shape=(3, 3), nnz=4)
Case 2: matrix with values of shape (nnz, D)
>>> val = torch.tensor([[0., 7.], [1., 3.], [2., 2.], [3., 1.]])
>>> A = create_from_coo(row, col, val)
>>> softmax(A)
SparseMatrix(indices=tensor([[0, 0, 1, 2],
[1, 2, 2, 0]]),
values=tensor([[0.2689, 0.9820],
[0.7311, 0.0180],
[1.0000, 1.0000],
[1.0000, 1.0000]]),
shape=(3, 3), nnz=4)
"""
return SparseMatrix(torch.ops.dgl_sparse.softmax(A.c_sparse_matrix))
import sys
import backend as F
import dgl
import pytest
import torch
from dgl.mock_sparse2 import create_from_coo, softmax
# TODO(#4818): Skipping tests on win.
if not sys.platform.startswith("linux"):
pytest.skip("skipping tests on win", allow_module_level=True)
@pytest.mark.parametrize("val_D", [None, 2])
@pytest.mark.parametrize("csr", [True, False])
def test_softmax(val_D, csr):
dev = F.ctx()
row = torch.tensor([0, 0, 1, 1]).to(dev)
col = torch.tensor([0, 2, 1, 2]).to(dev)
nnz = len(row)
if val_D is None:
val = torch.randn(nnz).to(dev)
else:
val = torch.randn(nnz, val_D).to(dev)
val_sparse = val.clone().requires_grad_()
A = create_from_coo(row, col, val_sparse)
if csr:
# Materialize the CSR format so that the CSR code path is exercised.
A.csr()
A_max = softmax(A)
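# Reference implementation: dgl.nn.functional.edge_softmax normalizes over the
# incoming edges of each destination node, so the graph is built with
# (src=col, dst=row) to match the row-wise softmax above.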
g = dgl.graph((col, row), num_nodes=max(A.shape))
val_g = val.clone().requires_grad_()
score = dgl.nn.functional.edge_softmax(g, val_g)
assert torch.allclose(A_max.val, score)
grad = torch.randn_like(score).to(dev)
A_max.val.backward(grad)
score.backward(grad)
assert torch.allclose(A.val.grad, val_g.grad)
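
# A minimal extra sanity check (illustrative sketch, assuming only the
# create_from_coo and softmax APIs imported above): compare the sparse softmax
# against a dense torch.softmax reference with structural zeros masked to -inf.
def test_softmax_dense_reference():
    dev = F.ctx()
    row = torch.tensor([0, 0, 1, 2]).to(dev)
    col = torch.tensor([1, 2, 2, 0]).to(dev)
    val = torch.arange(len(row), dtype=torch.float32, device=dev)
    A = softmax(create_from_coo(row, col, val))

    # Dense reference: scatter the nonzero values into a dense matrix, mask the
    # remaining entries with -inf, and apply a row-wise dense softmax.
    dense = torch.full((3, 3), float("-inf"), device=dev)
    dense[row, col] = val
    ref = torch.softmax(dense, dim=1)
    assert torch.allclose(A.val, ref[row, col])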