Unverified Commit 6b02babb authored by Zihao Ye, committed by GitHub

[doc] Add docstring for segment reduce. (#2375)

parent 35a3ead2
......@@ -239,7 +239,7 @@ Like GSpMM, GSDDMM operators support both homogeneous and bipartite graph.
Edge Softmax module
-------------------
DGL also provides a framework-agnostic edge softmax module, which is frequently used in
GNN-like structures, e.g.
`Graph Attention Network <https://arxiv.org/pdf/1710.10903.pdf>`_,
`Transformer <https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf>`_,
......@@ -250,6 +250,16 @@ GNN-like structures, e.g.
edge_softmax
Segment Reduce Module
---------------------
DGL provides operators to reduce a value tensor along the first dimension by segments.
.. autosummary::
:toctree: ../../generated/
segment_reduce
Relation with Message Passing APIs
----------------------------------
......
......@@ -1512,23 +1512,27 @@ def segment_reduce(op, x, offsets):
"""Segment reduction operator.
It aggregates the value tensor along the first dimension by segments.
The argument ``offsets`` specifies the start offset of each segment (and
the upper bound of the last segment). Zero-length segments are allowed.
.. math::
y_i = \Phi_{j=\mathrm{offsets}_i}^{\mathrm{offsets}_{i+1}-1} x_j
where :math:`\Phi` is the reduce operator.
Parameters
----------
op : str
Aggregation method. Can be ``sum``, ``max``, ``min``.
x : Tensor
Value to aggregate.
offsets : Tensor
The start offsets of segments.
Returns
-------
Tensor
Aggregated tensor of shape ``(len(offsets) - 1, *x.shape[1:])``.
"""
pass
......
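To make the ``offsets`` convention concrete, here is a minimal NumPy sketch of the operator described by the docstring above. This is an illustrative standalone version, not DGL's actual C++/CUDA implementation; for an empty segment it emits the reducer's identity element.

```python
import numpy as np

def segment_reduce(op, x, offsets):
    """Reduce x along dim 0 over segments [offsets[i], offsets[i+1])."""
    reducers = {'sum': np.sum, 'max': np.max, 'min': np.min}
    identity = {'sum': 0.0, 'max': -np.inf, 'min': np.inf}
    out = []
    for i in range(len(offsets) - 1):
        seg = x[offsets[i]:offsets[i + 1]]
        if len(seg) == 0:
            # Zero-length segments are allowed; emit the reducer's identity.
            out.append(np.full(x.shape[1:], identity[op]))
        else:
            out.append(reducers[op](seg, axis=0))
    return np.stack(out)
```

For example, with ``offsets = [0, 2, 2, 6]`` the output has three rows, the second produced by a zero-length segment.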
......@@ -69,8 +69,6 @@ def segment_softmax(seglen, value):
Segment lengths.
value : Tensor
Value to aggregate.
Returns
-------
......
......@@ -252,18 +252,22 @@ def _segment_reduce(op, feat, offsets):
r"""Segment reduction operator.
It aggregates the value tensor along the first dimension by segments.
The argument ``offsets`` specifies the start offset of each segment (and
the upper bound of the last segment). Zero-length segments are allowed.
.. math::
y_i = \Phi_{j=\mathrm{offsets}_i}^{\mathrm{offsets}_{i+1}-1} x_j
where :math:`\Phi` is the reduce operator.
Parameters
----------
op : str
Aggregation method. Can be ``sum``, ``max``, ``min``.
x : Tensor
Value to aggregate.
offsets : Tensor
The start offsets of segments.
Returns
-------
......
......@@ -12,6 +12,12 @@ namespace dgl {
namespace aten {
namespace cpu {
/*!
* \brief CPU kernel of segment sum.
* \param feat The input tensor.
* \param offsets The offset tensor storing the ranges of segments.
* \param out The output tensor.
*/
template <typename IdType, typename DType>
void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
int n = out->shape[0];
......@@ -31,6 +37,14 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
}
}
/*!
* \brief CPU kernel of segment min/max.
* \param feat The input tensor.
* \param offsets The offset tensor storing the ranges of segments.
* \param out The output tensor.
* \param arg An auxiliary tensor storing the argmin/argmax information
* used in the backward phase.
*/
template <typename IdType, typename DType, typename Cmp>
void SegmentCmp(NDArray feat, NDArray offsets,
NDArray out, NDArray arg) {
......@@ -58,6 +72,12 @@ void SegmentCmp(NDArray feat, NDArray offsets,
}
}
/*!
* \brief CPU kernel of backward phase of segment min/max.
* \param feat The input tensor.
* \param arg The argmin/argmax tensor.
* \param out The output tensor.
*/
template <typename IdType, typename DType>
void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
int n = feat->shape[0];
......
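The pairing between ``SegmentCmp`` and ``BackwardSegmentCmp`` can be sketched in NumPy: the forward pass records, per output element, the index of the row that produced the minimum, and the backward pass scatters each output gradient back to that row. This is a hedged illustration, not the actual CPU kernels; ``segment_min_forward`` and ``segment_min_backward`` are hypothetical names.

```python
import numpy as np

def segment_min_forward(feat, offsets):
    # Forward: per-segment elementwise min, recording argmin row indices.
    n = len(offsets) - 1
    out = np.full((n,) + feat.shape[1:], np.inf)
    arg = np.zeros((n,) + feat.shape[1:], dtype=np.int64)
    for i in range(n):
        for j in range(offsets[i], offsets[i + 1]):
            mask = feat[j] < out[i]
            out[i] = np.where(mask, feat[j], out[i])
            arg[i] = np.where(mask, j, arg[i])
    return out, arg

def segment_min_backward(grad_out, arg, num_rows):
    # Backward: route each output gradient to the winning input row.
    grad_feat = np.zeros((num_rows,) + grad_out.shape[1:])
    for i in range(grad_out.shape[0]):
        for k in range(grad_out.shape[1]):
            grad_feat[arg[i, k], k] += grad_out[i, k]
    return grad_feat
```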
......@@ -146,7 +146,7 @@ __device__ __forceinline__ Idx BinarySearchSrc(const Idx *array, Idx length, Idx
* is responsible for the computation on different edges. Threadblocks
* on the x-axis are responsible for the computation on different positions
* in feature dimension.
To efficiently find the source node index and destination node index of a
given edge in CSR format, it uses binary search (time complexity O(log N)).
*/
template <typename Idx, typename DType, typename BinaryOp,
......@@ -239,7 +239,7 @@ void SDDMMCoo(
coo.num_rows, coo.num_cols, nnz, reduce_dim,
lhs_off, rhs_off,
lhs_len, rhs_len, len);
});
} else {
const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
......
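The binary search mentioned in the kernel comment above can be sketched in Python: given the CSR ``indptr`` array, the row (source node) that owns edge ``eid`` is the last row whose start offset is at most ``eid``. This is a standard-library sketch, not the device code; ``binary_search_src`` is a hypothetical name mirroring ``BinarySearchSrc``.

```python
import bisect

def binary_search_src(indptr, eid):
    # Rightmost row r with indptr[r] <= eid; skips empty rows correctly
    # because bisect_right lands after all equal start offsets.
    return bisect.bisect_right(indptr, eid) - 1
```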
......@@ -19,6 +19,8 @@ namespace cuda {
/*!
* \brief CUDA kernel of segment reduce.
* \note each thread block is responsible for the aggregation on a row
* in the result tensor.
*/
template <typename IdType, typename DType,
typename ReduceOp>
......@@ -41,7 +43,9 @@ __global__ void SegmentReduceKernel(
}
/*!
* \brief CUDA kernel of the backward phase of segment min/max.
* \note each thread block is responsible for writing a row in the
* result gradient tensor by looking up the ArgMin/ArgMax index information.
*/
template <typename IdType, typename DType>
__global__ void BackwardSegmentCmpKernel(
......@@ -57,6 +61,13 @@ __global__ void BackwardSegmentCmpKernel(
}
}
/*!
* \brief CUDA implementation of forward phase of Segment Reduce.
* \param feat The input tensor.
* \param offsets The offsets tensor.
* \param out The output tensor.
* \param arg An auxiliary tensor storing the ArgMin/ArgMax information.
*/
template <typename IdType, typename DType, typename ReduceOp>
void SegmentReduce(
NDArray feat,
......@@ -80,12 +91,19 @@ void SegmentReduce(
const int nty = 1;
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
// TODO(zihao): try cub's DeviceSegmentedReduce and compare the performance.
CUDA_KERNEL_CALL((SegmentReduceKernel<IdType, DType, ReduceOp>),
nblks, nthrs, 0, thr_entry->stream,
feat_data, offsets_data, out_data, arg_data,
n, dim);
}
/*!
* \brief CUDA implementation of backward phase of Segment Reduce with Min/Max reducer.
* \param feat The input tensor.
* \param arg The ArgMin/Max information, used for indexing.
* \param out The output tensor.
*/
template <typename IdType, typename DType>
void BackwardSegmentCmp(
NDArray feat,
......
......@@ -19,7 +19,7 @@ using namespace cuda;
namespace aten {
namespace cuda {
/*!
* \brief CUDA kernel that fills the vector of size length, starting at ptr,
* with value val.
* \note internal use only.
......@@ -134,7 +134,7 @@ __global__ void ArgSpMMCooKernel(
/*!
* \brief CUDA kernel of g-SpMM on Coo format.
* \note it uses node parallel strategy: different threadblocks (on the y-axis)
* are responsible for the computation on different destination nodes.
* Threadblocks on the x-axis are responsible for the computation on
* different positions in feature dimension.
*/
......@@ -191,10 +191,10 @@ __global__ void SpMMCsrKernel(
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \param argu Arg-Min/Max on source nodes, which refers to the source node indices
corresponding to the minimum/maximum values of the reduction result on
destination nodes. It is useful in computing gradients of the Min/Max reducer.
* \param arge Arg-Min/Max on edges, which refers to the edge indices
corresponding to the minimum/maximum values of the reduction result on
destination nodes. It is useful in computing gradients of the Min/Max reducer.
*/
......@@ -263,10 +263,10 @@ void SpMMCoo(
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \param argu Arg-Min/Max on source nodes, which refers to the source node indices
corresponding to the minimum/maximum values of the reduction result on
destination nodes. It is useful in computing gradients of the Min/Max reducer.
* \param arge Arg-Min/Max on edges, which refers to the edge indices
corresponding to the minimum/maximum values of the reduction result on
destination nodes. It is useful in computing gradients of the Min/Max reducer.
*/
......
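The ``argu``/``arge`` tensors documented above can be illustrated with a NumPy sketch of ``u_mul_e`` SpMM with a max reducer on a COO graph: each destination row records, per feature position, which source node and which edge produced the winning value, so the backward pass of the Min/Max reducer knows where to route gradients. This is a hedged illustration, not DGL's kernel; ``spmm_coo_max`` is a hypothetical name.

```python
import numpy as np

def spmm_coo_max(row, col, ufeat, efeat, num_dst):
    dim = ufeat.shape[1]
    out = np.full((num_dst, dim), -np.inf)
    argu = np.zeros((num_dst, dim), dtype=np.int64)  # winning source node per position
    arge = np.zeros((num_dst, dim), dtype=np.int64)  # winning edge per position
    for eid, (u, v) in enumerate(zip(row, col)):
        val = ufeat[u] * efeat[eid]
        mask = val > out[v]
        out[v] = np.where(mask, val, out[v])
        argu[v] = np.where(mask, u, argu[v])
        arge[v] = np.where(mask, eid, arge[v])
    return out, argu, arge
```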