[Doc] Add docstring for missing APIs (#3088)

* add docstring for missing API; fix some docstring * rename apis; address comments

[Doc] Add docstring for missing APIs (#3088)
* add docstring for missing API; fix some docstring * rename apis; address comments
d3e4460b · Minjie Wang · GitHub · 485c04cf · d3e4460b · d3e4460b
Unverified Commit d3e4460b authored Jul 05, 2021 by Minjie Wang Committed by GitHub Jul 05, 2021
7 changed files
--- a/docs/source/api/python/dgl.rst
+++ b/docs/source/api/python/dgl.rst
@@ -77,6 +77,8 @@ Operators for generating new graphs by manipulating the structure of the existin
    adj_product_graph
    adj_sum_graph
    reorder
+    sort_csr_by_tag
+    sort_csc_by_tag
 .. _api-batch:

--- a/docs/source/api/python/dgl.sampling.rst
+++ b/docs/source/api/python/dgl.sampling.rst
@@ -22,5 +22,6 @@ Neighbor sampling
    :toctree: ../../generated/
    sample_neighbors
+    sample_neighbors_biased
    select_topk
    PinSAGESampler
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -183,6 +183,7 @@ epub_exclude_files = ['search.html']
 # -- Extension configuration -------------------------------------------------
 autosummary_generate = True
+autodoc_member_order = 'alphabetical'
 intersphinx_mapping = {
    'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None),

--- a/python/dgl/sampling/neighbor.py
+++ b/python/dgl/sampling/neighbor.py
@@ -186,39 +186,45 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
 def sample_neighbors_biased(g, nodes, fanout, bias, edge_dir='in',
                            tag_offset_name='_TAG_OFFSET', replace=False,
                            copy_ndata=True, copy_edata=True):
-    """Sample neighboring edges of the given nodes and return the induced subgraph, where each
+    r"""Sample neighboring edges of the given nodes and return the induced subgraph, where each
-       neighbor's probability to be picked is determined by its tag.
+    neighbor's probability to be picked is determined by its tag.
    For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges
    will be randomly chosen.  The graph returned will then contain all the nodes in the
    original graph, but only the sampled edges.
    This version of neighbor sampling can support the scenario where adjacent nodes with different
-    types might have different probability to be picked. Each node is assigned an integer(tag)
+    types have different sampling probability. Each node is assigned an integer (called a *tag*)
    which represents its type. Tag is an analogue of node type under the framework of homogeneous
    graphs. Nodes with the same tag share the same probability.
-    For example, assume a node has (a+b) neighbors, and a of them have tag 0 while b of them have
+    For example, assume a node has :math:`N+M` neighbors, and :math:`N` of them
-    tag 1. Assume a node of tag 0 has an unnormalized probability p to be picked while a node of
+    have tag 0 while :math:`M` of them have tag 1. Assume a node of tag 0 has
-    tag 1 has q. This function first chooses a tag according to the unnormalized probability
+    an unnormalized probability :math:`p` to be picked while a node of tag 1
-    distribution (ap, bq), and then run a uniform sampling within the nodes with the chosen tag.
+    has :math:`q`. This function first chooses a tag according to the
+    unnormalized probability distribution
-    In order to sample efficiently, we need to first sort the CSR matrix of the graph
+    :math:`\frac{P(tag=0)}{P(tag=1)}=\frac{Np}{Mq}`, and then runs a uniform
-    according to the tag (See `dgl.transform.sort_in_edges` and `dgl.transform.sort_out_edges`
+    sampling to get a node of the chosen tag.
-    for details), which will arrange the neighbors with the same tag in a consecutive range
-    and store the offset of these ranges in a node feature with tag_offset_name as its name.
+    In order to make sampling more efficient, the input graph must have its
+    CSC matrix (or CSR matrix if ``edge_dir='out'``) sorted according to the tag. The APIs
-    Please make sure that the graph has been sorted by the sorting function corresponding to
+    :func:`~dgl.sort_csc_by_tag` and
-    the edge direction ('in' or 'out'). This function itself will not check whether the graph is
+    :func:`~dgl.sort_csr_by_tag` are designed for this purpose, which
-    sorted. Note that the input `tag_offset_name` should be consistent with that in the sorting
+    will internally reorder the neighbors by tags so that neighbors of the same tags are
-    function.
+    stored in a consecutive range. The two APIs will also store the offsets of these ranges
+    in a node feature with :attr:`tag_offset_name` as its name.
-    Only homogeneous or bipartite graphs are supported. For bipartite graphs, only candidate
-    frontier nodes have tags(source nodes when edge_dir='in' and destination nodes when
+    **Please make sure that the CSR (or CSC) matrix of the graph has been sorted before
-    edge_dir='out'), and the offset of tags should be stored as a node feature of the seed nodes.
+    calling this function.**  This function itself will not check whether the
+    input graph is sorted. Note that the input :attr:`tag_offset_name` should
+    be consistent with that in the sorting function.
+    Only homogeneous or bipartite graphs are supported. For bipartite graphs,
+    the tag offsets of the source nodes when ``edge_dir='in'`` (or the destination
+    nodes when ``edge_dir='out'``) will be used in sampling.
    Node/edge features are not preserved. The original IDs of
-    the sampled edges are stored as the `dgl.EID` feature in the returned graph.
+    the sampled edges are stored as the ``dgl.EID`` feature in the returned graph.
    Parameters
    ----------
@@ -272,6 +278,11 @@ def sample_neighbors_biased(g, nodes, fanout, bias, edge_dir='in',
    As a result, users should avoid performing in-place operations
    on the node features of the new graph to avoid feature corruption.
+    See Also
+    --------
+    dgl.sort_csc_by_tag
+    dgl.sort_csr_by_tag
    Examples
    --------
    Assume that you have the following graph
@@ -284,7 +295,7 @@ def sample_neighbors_biased(g, nodes, fanout, bias, edge_dir='in',
    Sort the graph (necessary!)
-    >>> g_sorted = dgl.transform.sort_out_edges(g, tag)
+    >>> g_sorted = dgl.sort_csr_by_tag(g, tag)
    >>> g_sorted.ndata['_TAG_OFFSET']
    tensor([[0, 1, 2],
            [0, 2, 2],

--- a/python/dgl/transform.py
+++ b/python/dgl/transform.py
@@ -45,8 +45,8 @@ __all__ = [
    'to_simple',
    'to_simple_graph',
    'as_immutable_graph',
-    'sort_out_edges',
+    'sort_csr_by_tag',
-    'sort_in_edges',
+    'sort_csc_by_tag',
    'metis_partition_assignment',
    'partition_graph_with_halo',
    'metis_partition',
@@ -2719,37 +2719,36 @@ def as_immutable_graph(hg):
                '\tdgl.as_immutable_graph will do nothing and can be removed safely in all cases.')
    return hg
-def sort_out_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
+def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET'):
-    """Return a new graph which sorts the out edges of each node.
+    r"""Return a new graph whose CSR matrix is sorted by the given tag.
-    Sort the out edges according to the given destination node tags in integer.
+    Sort the internal CSR matrix of the graph so that the adjacency list of each node,
-    A typical use case is to sort the edges by the destination node types, where
+    which contains the out-edges, is sorted by the tag of the out-neighbors.
-    the tags represent destination node types. After sorting, edges sharing
+    After sorting, edges sharing the same tag will be arranged in a consecutive range in
-    the same tag will be arranged in a consecutive range in
    a node's adjacency list. Following is an example:
-        Consider a graph as follows:
+        Consider a graph as follows::
-        0 -> 0, 1, 2, 3, 4
+            0 -> 0, 1, 2, 3, 4
-        1 -> 0, 1, 2
+            1 -> 0, 1, 2
-        Given node tags [1, 1, 0, 2, 0], each node's adjacency list
+        Given node tags ``[1, 1, 0, 2, 0]``, each node's adjacency list
-        will be sorted as follows:
+        will be sorted as follows::
-        0 -> 2, 4, 0, 1, 3
+            0 -> 2, 4, 0, 1, 3
-        1 -> 2, 0, 1
+            1 -> 2, 0, 1
    The function will also returns the starting offsets of the tag
-    segments in a tensor of shape `(N, max_tag+2)`. For node `i`,
+    segments in a tensor of shape :math:`(N, max\_tag+2)`. For node ``i``,
-    its out-edges connecting to node tag `j` is stored between
+    its out-edges connecting to node tag ``j`` are stored between
-    `tag_offsets[i][j]` ~ `tag_offsets[i][j+1]`. Since the offsets
+    ``tag_offsets[i][j]`` ~ ``tag_offsets[i][j+1]``. Since the offsets
    can be viewed node data, we store it in the
-    `ndata` of the returned graph. Users can specify the
+    ``ndata`` of the returned graph. Users can specify the
-    ndata name by the `tag_pos_name` argument.
+    ndata name by the :attr:`tag_offset_name` argument.
    Note that the function will not change the edge ID neither
    how the edge features are stored. The input graph must
-    allow CSR format. Graph must be on CPU.
+    allow CSR format. The graph must be on CPU.
    If the input graph is heterogenous, it must have only one edge
    type and two node types (i.e., source and destination node types).
@@ -2757,8 +2756,26 @@ def sort_out_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
    and the tag offsets are stored in the source node data.
    The sorted graph and the calculated tag offsets are needed by
-    certain operators that consider node tags. See `sample_neighbors_biased`
+    certain operators that consider node tags. See
-    for an example.
+    :func:`~dgl.sampling.sample_neighbors_biased` for an example.
+    Parameters
+    ------------
+    g : DGLGraph
+        The input graph.
+    tag : Tensor
+        Integer tensor of shape :math:`(N,)`, :math:`N` being the number of (destination) nodes.
+    tag_offset_name : str
+        The name of the node feature to store tag offsets.
+    Returns
+    -------
+    g_sorted : DGLGraph
+        A new graph whose CSR is sorted. The node/edge features of the
+        input graph are shallow-copied over.
+        - ``g_sorted.ndata[tag_offset_name]`` : Tensor of shape :math:`(N, max\_tag + 2)`.
+        - If ``g`` is heterogeneous, the tag offsets are stored in ``g_sorted.srcdata`` instead.
    Examples
    -----------
@@ -2768,7 +2785,7 @@ def sort_out_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
    (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
     array([0, 1, 2, 3, 4, 0, 1, 2], dtype=int32))
    >>> tag = torch.IntTensor([1,1,0,2,0])
-    >>> g_sorted = dgl.transform.sort_out_edges(g, tag)
+    >>> g_sorted = dgl.sort_csr_by_tag(g, tag)
    >>> g_sorted.adjacency_matrix(scipy_fmt='csr').nonzero()
    (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
     array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
@@ -2779,22 +2796,9 @@ def sort_out_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
            [0, 0, 0, 0],
            [0, 0, 0, 0]])
-    Parameters
+    See Also
-    ------------
+    --------
-    g : DGLGraph
+    dgl.sampling.sample_neighbors_biased
-        The input graph.
-    tag : Tensor
-        Integer tensor of shape `(N,)`, `N` being the number of (destination) nodes.
-    tag_offset_name : str
-        The name of the node feature to store tag offsets.
-    Returns
-    -------
-    g_sorted : DGLGraph
-        A new graph whose out edges are sorted. The node/edge features of the
-        input graph is shallow-copied over.
-        - `g_sorted.ndata[tag_offset_name]` : Tensor of shape `(N, max_tag + 2)`. If
-        `g` is heterogeneous, get from `g_sorted.srcdata`.
    """
    if len(g.etypes) > 1:
        raise DGLError("Only support homograph and bipartite graph")
@@ -2806,37 +2810,37 @@ def sort_out_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
    return new_g
-def sort_in_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
+def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET'):
-    """Return a new graph which sorts the in edges of each node.
+    r"""Return a new graph whose CSC matrix is sorted by the given tag.
-    Sort the in edges according to the given source node tags in integer.
+    Sort the internal CSC matrix of the graph so that the adjacency list of each node,
-    A typical use case is to sort the edges by the source node types, where
+    which contains the in-edges, is sorted by the tag of the in-neighbors.
-    the tags represent source node types. After sorting, edges sharing
+    After sorting, edges sharing the same tag will be arranged in a consecutive range in
-    the same tag will be arranged in a consecutive range in
    a node's adjacency list. Following is an example:
-        Consider a graph as follows:
-        0 <- 0, 1, 2, 3, 4
+        Consider a graph as follows::
-        1 <- 0, 1, 2
-        Given node tags [1, 1, 0, 2, 0], each node's adjacency list
+            0 <- 0, 1, 2, 3, 4
-        will be sorted as follows:
+            1 <- 0, 1, 2
-        0 <- 2, 4, 0, 1, 3
+        Given node tags ``[1, 1, 0, 2, 0]``, each node's adjacency list
-        1 <- 2, 0, 1
+        will be sorted as follows::
-    The function will also returns the starting offsets of the tag
+            0 <- 2, 4, 0, 1, 3
-    segments in a tensor of shape `(N, max_tag+2)`. For node `i`,
+            1 <- 2, 0, 1
-    its in-edges connecting to node tag `j` is stored between
-    `tag_offsets[i][j]` ~ `tag_offsets[i][j+1]`. Since the offsets
+    The function will also return the starting offsets of the tag
+    segments in a tensor of shape :math:`(N, max\_tag+2)`. For a node ``i``,
+    its in-edges connecting to node tag ``j`` are stored between
+    ``tag_offsets[i][j]`` ~ ``tag_offsets[i][j+1]``. Since the offsets
    can be viewed node data, we store it in the
-    `ndata` of the returned graph. Users can specify the
+    ``ndata`` of the returned graph. Users can specify the
-    ndata name by the `tag_pos_name` argument.
+    ndata name by the ``tag_offset_name`` argument.
    Note that the function will not change the edge ID neither
    how the edge features are stored. The input graph must
-    allow CSR format. Graph must be on CPU.
+    allow CSC format. The graph must be on CPU.
    If the input graph is heterogenous, it must have only one edge
    type and two node types (i.e., source and destination node types).
@@ -2844,9 +2848,27 @@ def sort_in_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
    and the tag offsets are stored in the destination node data.
    The sorted graph and the calculated tag offsets are needed by
-    certain operators that consider node tags. See `sample_neighbors_biased`
+    certain operators that consider node tags. See :func:`~dgl.sampling.sample_neighbors_biased`
    for an example.
+    Parameters
+    ------------
+    g : DGLGraph
+        The input graph.
+    tag : Tensor
+        Integer tensor of shape :math:`(N,)`, :math:`N` being the number of (source) nodes.
+    tag_offset_name : str
+        The name of the node feature to store tag offsets.
+    Returns
+    -------
+    g_sorted : DGLGraph
+        A new graph whose CSC matrix is sorted. The node/edge features of the
+        input graph are shallow-copied over.
+        - ``g_sorted.ndata[tag_offset_name]`` : Tensor of shape :math:`(N, max\_tag + 2)`.
+        - If ``g`` is heterogeneous, the tag offsets are stored in ``g_sorted.dstdata`` instead.
    Examples
    -----------
@@ -2855,7 +2877,7 @@ def sort_in_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
    (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
     array([0, 1, 2, 3, 4, 0, 1, 2], dtype=int32)))
    >>> tag = torch.IntTensor([1,1,0,2,0])
-    >>> g_sorted = dgl.transform.sort_in_edges(g, tag)
+    >>> g_sorted = dgl.sort_csc_by_tag(g, tag)
    >>> g_sorted.adjacency_matrix(scipy_fmt='csr', transpose=True).nonzero()
    (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
     array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
@@ -2866,22 +2888,9 @@ def sort_in_edges(g, tag, tag_offset_name='_TAG_OFFSET'):
            [0, 0, 0, 0],
            [0, 0, 0, 0]])
-    Parameters
+    See Also
-    ------------
+    --------
-    g : DGLGraph
+    dgl.sampling.sample_neighbors_biased
-        The input graph.
-    tag : Tensor
-        Integer tensor of shape `(N,)`, `N` being the number of (source) nodes.
-    tag_offset_name : str
-        The name of the node feature to store tag offsets.
-    Returns
-    -------
-    g_sorted : DGLGraph
-        A new graph whose out edges are sorted. The node/edge features of the
-        input graph is shallow-copied over.
-        - `g_sorted.ndata[tag_offset_name]` : Tensor of shape `(N, max_tag + 2)`. If
-        `g` is heterogeneous, get from `g_sorted.dstdata`.
    """
    if len(g.etypes) > 1:
        raise DGLError("Only support homograph and bipartite graph")

--- a/tests/compute/test_sampling.py
+++ b/tests/compute/test_sampling.py
@@ -617,7 +617,7 @@ def test_sample_neighbors_biased_homogeneous():
    tag = F.tensor(np.random.choice(4, 100))
    bias = F.tensor([0, 0.1, 10, 10], dtype=F.float32)
    # inedge / without replacement
-    g_sorted = dgl.sort_in_edges(g, tag)
+    g_sorted = dgl.sort_csc_by_tag(g, tag)
    for _ in range(5):
        subg = dgl.sampling.sample_neighbors_biased(g_sorted, g.nodes(), 5, bias, replace=False)
        check_num(subg.edges()[0], tag)
@@ -631,7 +631,7 @@ def test_sample_neighbors_biased_homogeneous():
        check_num(subg.edges()[0], tag)
    # outedge / without replacement
-    g_sorted = dgl.sort_out_edges(g, tag)
+    g_sorted = dgl.sort_csr_by_tag(g, tag)
    for _ in range(5):
        subg = dgl.sampling.sample_neighbors_biased(g_sorted, g.nodes(), 5, bias, edge_dir='out', replace=False)
        check_num(subg.edges()[1], tag)
@@ -661,7 +661,7 @@ def test_sample_neighbors_biased_bipartite():
    # inedge / without replacement
    tag = F.tensor(np.random.choice(4, 100))
-    g_sorted = dgl.sort_in_edges(g, tag)
+    g_sorted = dgl.sort_csc_by_tag(g, tag)
    for _ in range(5):
        subg = dgl.sampling.sample_neighbors_biased(g_sorted, g.dstnodes(), 5, bias, replace=False)
        check_num(subg.edges()[0], tag)
@@ -676,7 +676,7 @@ def test_sample_neighbors_biased_bipartite():
    # outedge / without replacement
    tag = F.tensor(np.random.choice(4, num_dst))
-    g_sorted = dgl.sort_out_edges(g, tag)
+    g_sorted = dgl.sort_csr_by_tag(g, tag)
    for _ in range(5):
        subg = dgl.sampling.sample_neighbors_biased(g_sorted, g.srcnodes(), 5, bias, edge_dir='out', replace=False)
        check_num(subg.edges()[1], tag)

--- a/tests/compute/test_sort.py
+++ b/tests/compute/test_sort.py
@@ -55,13 +55,13 @@ def test_sort_with_tag(idtype):
    g = create_test_heterograph(num_nodes, num_adj, idtype=idtype)
    tag = F.tensor(np.random.choice(num_tags, g.number_of_nodes()))
-    new_g = dgl.sort_out_edges(g, tag)
+    new_g = dgl.sort_csr_by_tag(g, tag)
    old_csr = g.adjacency_matrix(scipy_fmt='csr')
    new_csr = new_g.adjacency_matrix(scipy_fmt='csr')
    assert(check_sort(new_csr, tag, new_g.ndata["_TAG_OFFSET"]))
    assert(not check_sort(old_csr, tag))  # Check the original csr is not modified.
-    new_g = dgl.sort_in_edges(g, tag)
+    new_g = dgl.sort_csc_by_tag(g, tag)
    old_csc = g.adjacency_matrix(transpose=True, scipy_fmt='csr')
    new_csc = new_g.adjacency_matrix(transpose=True, scipy_fmt='csr')
    assert(check_sort(new_csc, tag, new_g.ndata["_TAG_OFFSET"]))
@@ -76,13 +76,13 @@ def test_sort_with_tag_bipartite(idtype):
    utag = F.tensor(np.random.choice(num_tags, g.number_of_nodes('_U')))
    vtag = F.tensor(np.random.choice(num_tags, g.number_of_nodes('_V')))
-    new_g = dgl.sort_out_edges(g, vtag)
+    new_g = dgl.sort_csr_by_tag(g, vtag)
    old_csr = g.adjacency_matrix(scipy_fmt='csr')
    new_csr = new_g.adjacency_matrix(scipy_fmt='csr')
    assert(check_sort(new_csr, vtag, new_g.nodes['_U'].data['_TAG_OFFSET']))
    assert(not check_sort(old_csr, vtag))
-    new_g = dgl.sort_in_edges(g, utag)
+    new_g = dgl.sort_csc_by_tag(g, utag)
    old_csc = g.adjacency_matrix(transpose=True, scipy_fmt='csr')
    new_csc = new_g.adjacency_matrix(transpose=True, scipy_fmt='csr')
    assert(check_sort(new_csc, utag, new_g.nodes['_V'].data['_TAG_OFFSET']))