[Doc] dgl.sampling docstring fixes (#1928)

823eb5be · Quan (Andy) Gan · GitHub · 451ed6d8 · 823eb5be · 823eb5be
Unverified Commit 823eb5be authored Aug 04, 2020 by Quan (Andy) Gan Committed by GitHub Aug 04, 2020
5 changed files
--- a/python/dgl/sampling/__init__.py
+++ b/python/dgl/sampling/__init__.py
-"""Sampler modules."""
+"""Sampling operators.
+This module contains the implementations of various sampling operators.
+"""
 from .randomwalks import *
 from .pinsage import *
 from .neighbor import *

--- a/python/dgl/sampling/neighbor.py
+++ b/python/dgl/sampling/neighbor.py
@@ -15,45 +15,84 @@ __all__ = [
    'MultiLayerNeighborSampler']
 def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
-    """Sample from the neighbors of the given nodes and return the induced subgraph.
+    """Sample neighboring edges of the given nodes and return the induced subgraph.
-    When sampling with replacement, the sampled subgraph could have parallel edges.
+    For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges
+    will be randomly chosen.  The graph returned will then contain all the nodes in the
-    For sampling without replace, if fanout > the number of neighbors, all the
+    original graph, but only the sampled edges.
-    neighbors are sampled.
    Node/edge features are not preserved. The original IDs of
    the sampled edges are stored as the `dgl.EID` feature in the returned graph.
    Parameters
    ----------
-    g : DGLHeteroGraph
+    g : DGLGraph
-        Full graph structure.
+        The graph.
    nodes : tensor or dict
-        Node ids to sample neighbors from. The allowed types
+        Node IDs to sample neighbors from.
-        are dictionary of node types to node id tensors, or simply node id tensor if
-        the given graph g has only one type of nodes.
+        This argument can take a single ID tensor or a dictionary of node types and ID tensors.
+        If a single tensor is given, the graph must only have one type of nodes.
    fanout : int or dict[etype, int]
-        The number of sampled neighbors for each node on each edge type. Provide a dict
+        The number of edges to be sampled for each node on each edge type.
-        to specify different fanout values for each edge type.
+        This argument can take a single int or a dictionary of edge types and ints.
+        If a single int is given, DGL will sample this number of edges for each node for
+        every edge type.
-        If -1 is given, select all the neighbors.  ``prob`` and ``replace`` will be
+        If -1 is given for a single edge type, all the neighboring edges with that edge
-        ignored in this case.
+        type will be selected.
    edge_dir : str, optional
-        Edge direction ('in' or 'out'). If is 'in', sample from in edges. Otherwise,
+        Determines whether to sample inbound or outbound edges.
-        sample from out edges.
+        Can take either ``in`` for inbound edges or ``out`` for outbound edges.
    prob : str, optional
-        Feature name used as the probabilities associated with each neighbor of a node.
+        Feature name used as the (unnormalized) probabilities associated with each
-        Its shape should be compatible with a scalar edge feature tensor.
+        neighboring edge of a node.  The feature must have only one element for each
+        edge.
+        The features must be non-negative floats, and the sum of the features of
+        inbound/outbound edges for every node must be positive (though they don't have
+        to sum up to one).  Otherwise, the result will be undefined.
    replace : bool, optional
        If True, sample with replacement.
    Returns
    -------
-    DGLHeteroGraph
+    DGLGraph
-        A sampled subgraph containing only the sampled neighbor edges from
+        A sampled subgraph containing only the sampled neighboring edges.
-        ``nodes``. The sampled subgraph has the same metagraph as the original
-        one.
+    Examples
+    --------
+    Assume that you have the following graph
+    >>> g = dgl.graph(([0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]))
+    And the weights
+    >>> g.edata['prob'] = torch.FloatTensor([0., 1., 0., 1., 0., 1.])
+    To sample one inbound edge for node 0 and node 1:
+    >>> sg = dgl.sampling.sample_neighbors(g, [0, 1], 1)
+    >>> sg.edges(order='eid')
+    (tensor([1, 0]), tensor([0, 1]))
+    >>> sg.edata[dgl.EID]
+    tensor([2, 0])
+    To sample one inbound edge for node 0 and node 1 with probability in edge feature
+    ``prob``:
+    >>> sg = dgl.sampling.sample_neighbors(g, [0, 1], 1, prob='prob')
+    >>> sg.edges(order='eid')
+    (tensor([2, 1]), tensor([0, 1]))
+    With ``fanout`` greater than the number of actual neighbors and without replacement,
+    DGL will take all neighbors instead:
+    >>> sg = dgl.sampling.sample_neighbors(g, [0, 1], 3)
+    >>> sg.edges(order='eid')
+    (tensor([1, 2, 0, 1]), tensor([0, 0, 1, 1]))
    """
    if not isinstance(nodes, dict):
        if len(g.ntypes) > 1:
@@ -97,40 +136,60 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
    return ret
 def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False):
-    """Select the neighbors with k-largest weights on the connecting edges for each given node.
+    """Select the neighboring edges with k-largest (or k-smallest) weights of the given
+    nodes and return the induced subgraph.
-    If k > the number of neighbors, all the neighbors are sampled.
+    For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges
+    with the largest (or smallest when ``ascending == True``) weights will be chosen.
+    The graph returned will then contain all the nodes in the original graph, but only
+    the sampled edges.
    Node/edge features are not preserved. The original IDs of
    the sampled edges are stored as the `dgl.EID` feature in the returned graph.
    Parameters
    ----------
-    g : DGLHeteroGraph
+    g : DGLGraph
-        Full graph structure.
+        The graph.
    k : int or dict[etype, int]
-        The K value.
+        The number of edges to be selected for each node on each edge type.
+        This argument can take a single int or a dictionary of edge types and ints.
+        If a single int is given, DGL will select this number of edges for each node for
+        every edge type.
-        If -1 is given, select all the neighbors.
+        If -1 is given for a single edge type, all the neighboring edges with that edge
+        type will be selected.
    weight : str
-        Feature name of the weights associated with each edge. Its shape should be
+        Feature name of the weights associated with each edge.  The feature should have only
-        compatible with a scalar edge feature tensor.
+        one element for each edge.  The feature can be either int32/64 or float32/64.
    nodes : tensor or dict, optional
-        Node ids to sample neighbors from. The allowed types
+        Node IDs to sample neighbors from.
-        are dictionary of node types to node id tensors, or simply node id
-        tensor if the given graph g has only one type of nodes.
+        This argument can take a single ID tensor or a dictionary of node types and ID tensors.
+        If a single tensor is given, the graph must only have one type of nodes.
+        If None, DGL will select the edges for all nodes.
    edge_dir : str, optional
-        Edge direction ('in' or 'out'). If is 'in', sample from in edges.
+        Determines whether to sample inbound or outbound edges.
-        Otherwise, sample from out edges.
+        Can take either ``in`` for inbound edges or ``out`` for outbound edges.
    ascending : bool, optional
-        If true, elements are sorted by ascending order, equivalent to find
+        If True, DGL will return edges with k-smallest weights instead of
-        the K smallest values. Otherwise, find K largest values.
+        k-largest weights.
    Returns
    -------
-    DGLHeteroGraph
+    DGLGraph
-        A sampled subgraph by top k criterion. The sampled subgraph has the same
+        A sampled subgraph containing only the sampled neighboring edges.
-        metagraph as the original one.
+    Examples
+    --------
+    >>> g = dgl.graph(([0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]))
+    >>> g.edata['weight'] = torch.FloatTensor([0, 1, 0, 1, 0, 1])
+    >>> sg = dgl.sampling.select_topk(g, 1, 'weight')
+    >>> sg.edges(order='eid')
+    (tensor([2, 1, 0]), tensor([0, 1, 2]))
    """
    # Rectify nodes to a dictionary
    if nodes is None:

--- a/python/dgl/sampling/pinsage.py
+++ b/python/dgl/sampling/pinsage.py
@@ -11,72 +11,71 @@ from ..base import EID
 class RandomWalkNeighborSampler(object):
-    """PinSAGE-like sampler extended to any heterographs, given a metapath.
+    """PinSage-like neighbor sampler extended to any heterogeneous graphs.
-    Given a heterogeneous graph, this neighbor sampler would generate a homogeneous
+    Given a heterogeneous graph and a list of nodes, this callable will generate a homogeneous
-    graph where the neighbors of each node are the most commonly visited nodes of the
+    graph where the neighbors of each given node are the most commonly visited nodes of the
-    same type by random walk with restarts.  The random walk with restarts are based
+    same type by multiple random walks starting from that given node.  Each random walk consists
-    on a given metapath, which should have the same beginning and ending node type.
+    of multiple metapath-based traversals, with a probability of termination after each traversal.
-    The homogeneous graph also has a feature that stores the number of visits to
+    The edges of the returned homogeneous graph will connect to the given nodes from their most
-    the corresponding neighbors from the seed nodes.
+    commonly visited nodes, with a feature indicating the number of visits.
-    This is a generalization of PinSAGE sampler which only works on bidirectional
+    The metapath must have the same beginning and ending node type to make the algorithm work.
-    bipartite graphs.
+    This is a generalization of the PinSAGE sampler, which only works on bidirectional
+    bipartite graphs.
    Parameters
    ----------
-    G : DGLHeteroGraph
+    G : DGLGraph
-        The heterogeneous graph.
+        The graph.
-    random_walk_length : int
+    num_traversals : int
-        The maximum number of steps of random walk with restarts.
+        The maximum number of metapath-based traversals for a single random walk.
-        Note that here we consider a full traversal of the given metapath as a single
-        random walk "step" (i.e. a single step may consist of multiple hops).
        Usually considered a hyperparameter.
-    random_walk_restart_prob : int
+    termination_prob : float
-        Restart probability of random walk with restarts.
+        Termination probability after each metapath-based traversal.
-        Note that the random walks only would halt after a full traversal of a metapath.
-        It will never halt in the middle of a metapath.
        Usually considered a hyperparameter.
    num_random_walks : int
-        Number of random walks to try for each seed node.
+        Number of random walks to try for each given node.
        Usually considered a hyperparameter.
    num_neighbors : int
-        Number of neighbors to select for each seed.
+        Number of neighbors (or most commonly visited nodes) to select for each given node.
    metapath : list[str] or list[tuple[str, str, str]], optional
        The metapath.
-        If not given, assumes that the graph is homogeneous.
+        If not given, DGL assumes that the graph is homogeneous and the metapath consists
+        of one step over the single edge type.
    weight_column : str, default "weights"
-        The weight of each neighbor, stored as an edge feature.
+        The name of the edge feature to be stored on the returned graph with the number of
+        visits.
    Inputs
    ------
    seed_nodes : Tensor
-        A tensor of seed node IDs of node type ``ntype``.
+        A tensor of given node IDs of node type ``ntype`` to generate neighbors from.  The
+        node type ``ntype`` is the beginning and ending node type of the given metapath.
    Outputs
    -------
-    g : DGLHeteroGraph
+    g : DGLGraph
-        A homogeneous graph constructed by selecting neighbors for each seed node according
+        A homogeneous graph constructed by selecting neighbors for each given node according
-        to PinSAGE algorithm.
+        to the algorithm above.
    Examples
    --------
    See examples in :any:`PinSAGESampler`.
    """
-    def __init__(self, G, random_walk_length, random_walk_restart_prob,
+    def __init__(self, G, num_traversals, termination_prob,
                 num_random_walks, num_neighbors, metapath=None, weight_column='weights'):
        self.G = G
        self.weight_column = weight_column
        self.num_random_walks = num_random_walks
        self.num_neighbors = num_neighbors
-        self.random_walk_length = random_walk_length
+        self.num_traversals = num_traversals
        if metapath is None:
            if len(G.ntypes) > 1 or len(G.etypes) > 1:
@@ -90,9 +89,9 @@ class RandomWalkNeighborSampler(object):
        self.metapath_hops = len(metapath)
        self.metapath = metapath
-        self.full_metapath = metapath * random_walk_length
+        self.full_metapath = metapath * num_traversals
-        restart_prob = np.zeros(self.metapath_hops * random_walk_length)
+        restart_prob = np.zeros(self.metapath_hops * num_traversals)
-        restart_prob[self.metapath_hops::self.metapath_hops] = random_walk_restart_prob
+        restart_prob[self.metapath_hops::self.metapath_hops] = termination_prob
        self.restart_prob = F.zerocopy_from_numpy(restart_prob)
    # pylint: disable=no-member
@@ -101,7 +100,7 @@ class RandomWalkNeighborSampler(object):
        paths, _ = random_walk(
            self.G, seed_nodes, metapath=self.full_metapath, restart_prob=self.restart_prob)
        src = F.reshape(paths[:, self.metapath_hops::self.metapath_hops], (-1,))
-        dst = F.repeat(paths[:, 0], self.random_walk_length, 0)
+        dst = F.repeat(paths[:, 0], self.num_traversals, 0)
        src_mask = (src != -1)
        src = F.boolean_mask(src, src_mask)
@@ -120,60 +119,60 @@ class RandomWalkNeighborSampler(object):
 class PinSAGESampler(RandomWalkNeighborSampler):
-    """PinSAGE neighbor sampler.
+    """PinSAGE-like neighbor sampler.
-    Given a bidirectional bipartite graph, PinSAGE neighbor sampler would generate
+    This callable works on a bidirectional bipartite graph with edge types
-    a homogeneous graph where the neighbors of each node are the most commonly visited
+    ``(ntype, fwtype, other_type)`` and ``(other_type, bwtype, ntype)`` (where ``ntype``,
-    nodes of the same type by random walk with restarts.
+    ``fwtype``, ``bwtype`` and ``other_type`` could be arbitrary type names).  It will generate
+    a homogeneous graph of node type ``ntype`` where the neighbors of each given node are the
+    most commonly visited nodes of the same type by multiple random walks starting from that
+    given node.  Each random walk consists of multiple metapath-based traversals, with a
+    probability of termination after each traversal.  The metapath is always ``[fwtype, bwtype]``,
+    walking from node type ``ntype`` to node type ``other_type`` then back to ``ntype``.
+    The edges of the returned homogeneous graph will connect to the given nodes from their most
+    commonly visited nodes, with a feature indicating the number of visits.
    Parameters
    ----------
-    G : DGLHeteroGraph
+    G : DGLGraph
        The bidirectional bipartite graph.
        The graph should only have two node types: ``ntype`` and ``other_type``.
        The graph should only have two edge types, one connecting from ``ntype`` to
        ``other_type``, and another connecting from ``other_type`` to ``ntype``.
-        PinSAGE works on a bidirectional bipartite graph where for each edge
-        going from node u to node v, there exists an edge going from node v to node u.
    ntype : str
        The node type for which the graph would be constructed.
    other_type : str
        The other node type.
-    random_walk_length : int
+    num_traversals : int
-        The maximum number of steps of random walk with restarts.
+        The maximum number of metapath-based traversals for a single random walk.
-        Note that here we consider traversing from ``ntype`` to ``other_type`` then back
-        to ``ntype`` as a single step (i.e. a single step consists of two hops).
        Usually considered a hyperparameter.
-    random_walk_restart_prob : int
+    termination_prob : float
-        Restart probability of random walk with restarts.
+        Termination probability after each metapath-based traversal.
-        Note that the random walks only would halt on node type ``ntype``, and would
-        never halt on ``other_type``.
        Usually considered a hyperparameter.
    num_random_walks : int
-        Number of random walks to try for each seed node.
+        Number of random walks to try for each given node.
        Usually considered a hyperparameter.
    num_neighbors : int
-        Number of neighbors to select for each seed.
+        Number of neighbors (or most commonly visited nodes) to select for each given node.
    weight_column : str, default "weights"
-        The weight of each neighbor, stored as an edge feature.
+        The name of the edge feature to be stored on the returned graph with the number of
+        visits.
    Inputs
    ------
    seed_nodes : Tensor
-        A tensor of seed node IDs of node type ``ntype``.
+        A tensor of given node IDs of node type ``ntype`` to generate neighbors from.
    Outputs
    -------
    g : DGLGraph
-        A homogeneous graph constructed by selecting neighbors for each seed node according
+        A homogeneous graph constructed by selecting neighbors for each given node according
-        to PinSAGE algorithm.
+        to the PinSage algorithm.
    Examples
    --------
@@ -184,7 +183,7 @@ class PinSAGESampler(RandomWalkNeighborSampler):
    ...     ('A', 'AB', 'B'): g,
    ...     ('B', 'BA', 'A'): g.T})
-    Then we create a PinSAGE neighbor sampler that samples a graph of node type "A".  Each
+    Then we create a PinSage neighbor sampler that samples a graph of node type "A".  Each
    node would have (a maximum of) 10 neighbors.
    >>> sampler = dgl.sampling.PinSAGESampler(G, 'A', 'B', 3, 0.5, 200, 10)
@@ -202,18 +201,19 @@ class PinSAGESampler(RandomWalkNeighborSampler):
             2, 2, 2, 2, 2, 2]))
    For an end-to-end example of PinSAGE model, including sampling on multiple layers
-    and computing with the sampled graphs, please refer to [TODO]
+    and computing with the sampled graphs, please refer to our PinSage example
+    in ``examples/pytorch/pinsage``.
    References
    ----------
    Graph Convolutional Neural Networks for Web-Scale Recommender Systems
        Ying et al., 2018, https://arxiv.org/abs/1806.01973
    """
-    def __init__(self, G, ntype, other_type, random_walk_length, random_walk_restart_prob,
+    def __init__(self, G, ntype, other_type, num_traversals, termination_prob,
                 num_random_walks, num_neighbors, weight_column='weights'):
        metagraph = G.metagraph()
        fw_etype = list(metagraph[ntype][other_type])[0]
        bw_etype = list(metagraph[other_type][ntype])[0]
-        super().__init__(G, random_walk_length,
+        super().__init__(G, num_traversals,
-                         random_walk_restart_prob, num_random_walks, num_neighbors,
+                         termination_prob, num_random_walks, num_neighbors,
                         metapath=[fw_etype, bw_etype], weight_column=weight_column)
--- a/python/dgl/sampling/randomwalks.py
+++ b/python/dgl/sampling/randomwalks.py
@@ -12,19 +12,18 @@ __all__ = [
    'pack_traces']
 def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob=None):
-    """Generate random walk traces from an array of seed nodes (or starting nodes),
+    """Generate random walk traces from an array of starting nodes based on the given metapath.
-    based on the given metapath.
-    For a single seed node, ``num_traces`` traces would be generated.  A trace would
+    For a single starting node, ``num_traces`` traces would be generated.  A trace would
-    1. Start from the given seed and set ``t`` to 0.
+    1. Start from the given node and set ``t`` to 0.
    2. Pick and traverse along edge type ``metapath[t]`` from the current node.
    3. If no edge can be found, halt.  Otherwise, increment ``t`` and go to step 2.
    The returned traces all have length ``len(metapath) + 1``, where the first node
-    is the seed node itself.
+    is the starting node itself.
-    If a random walk stops in advance, the trace is padded with -1 to have the same
+    If a random walk stops in advance, DGL pads the trace with -1 to have the same
    length.
    Parameters
@@ -35,34 +34,47 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
        Node ID tensor from which the random walk traces start.
    metapath : list[str or tuple of str], optional
        Metapath, specified as a list of edge types.
-        If omitted, we assume that ``g`` only has one node & edge type.  In this
+        Mutually exclusive with ``length``.
+        If omitted, DGL assumes that ``g`` only has one node & edge type.  In this
        case, the argument ``length`` specifies the length of random walk traces.
    length : int, optional
        Length of random walks.
-        Affects only when ``metapath`` is omitted.
+        Mutually exclusive with ``metapath``.
+        Only used when ``metapath`` is None.
    prob : str, optional
        The name of the edge feature tensor on the graph storing the (unnormalized)
        probabilities associated with each edge for choosing the next node.
-        The feature tensor must be non-negative.
-        If omitted, we assume the neighbors are picked uniformly.
+        The feature tensor must be non-negative and the sum of the probabilities
+        must be positive for the outbound edges of all nodes (although they don't have
+        to sum up to one).  The result will be undefined otherwise.
+        If omitted, DGL assumes that the neighbors are picked uniformly.
    restart_prob : float or Tensor, optional
-        Probability to stop at each step.
+        Probability to terminate the current trace before each transition.
        If a tensor is given, ``restart_prob`` should have the same length as ``metapath``.
    Returns
    -------
    traces : Tensor
-        A 2-dimensional node ID tensor with shape (num_seeds, len(metapath) + 1).
+        A 2-dimensional node ID tensor with shape ``(num_seeds, len(metapath) + 1)``.
    types : Tensor
-        A 1-dimensional node type ID tensor with shape (len(metapath) + 1).
+        A 1-dimensional node type ID tensor with shape ``(len(metapath) + 1,)``.
        The type IDs match the ones in the original graph ``g``.
    Examples
    --------
    The following creates a homogeneous graph:
    >>> g1 = dgl.graph([(0, 1), (1, 2), (1, 3), (2, 0), (3, 0)], 'user', 'follow')
    Normal random walk:
    >>> dgl.sampling.random_walk(g1, [0, 1, 2, 0], length=4)
    (tensor([[0, 1, 2, 0, 1],
             [1, 3, 0, 1, 3],
@@ -74,6 +86,7 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
    in every path.  In this case, it is returning all 0 (``user``).
    Random walk with restart:
    >>> dgl.sampling.random_walk(g1, [0, 1, 2, 0], length=4, restart_prob=0.5)
    (tensor([[ 0, -1, -1, -1, -1],
             [ 1,  3,  0, -1, -1],
@@ -81,6 +94,7 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
             [ 0, -1, -1, -1, -1]]), tensor([0, 0, 0, 0, 0]))
    Non-uniform random walk:
    >>> g1.edata['p'] = torch.FloatTensor([1, 0, 1, 1, 1])     # disallow going from 1 to 2
    >>> dgl.sampling.random_walk(g1, [0, 1, 2, 0], length=4, prob='p')
    (tensor([[0, 1, 3, 0, 1],
@@ -89,6 +103,7 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
             [0, 1, 3, 0, 1]]), tensor([0, 0, 0, 0, 0]))
    Metapath-based random walk:
    >>> g2 = dgl.heterograph({
    ...     ('user', 'follow', 'user'): [(0, 1), (1, 2), (1, 3), (2, 0), (3, 0)],
    ...     ('user', 'view', 'item'): [(0, 0), (0, 1), (1, 1), (2, 2), (3, 2), (3, 1)],
@@ -102,6 +117,7 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
    Metapath-based random walk, with restarts only on items (i.e. after traversing a "view"
    relationship):
    >>> dgl.sampling.random_walk(
    ...     g2, [0, 1, 2, 0], metapath=['follow', 'view', 'viewed-by'] * 2,
    ...     restart_prob=torch.FloatTensor([0, 0.5, 0, 0, 0.5, 0]))
@@ -211,6 +227,7 @@ def pack_traces(traces, types):
    The third and fourth tensors indicate the length and the offset of each path.  With these
    tensors it is easy to obtain the i-th random walk path with:
    >>> vids = concat_vids.split(lengths.tolist())
    >>> vtypes = concat_vtypes.split(lengths.tolist())
    >>> vids[1], vtypes[1]

--- a/src/random/cpu/choice.cc
+++ b/src/random/cpu/choice.cc
@@ -82,7 +82,7 @@ void RandomEngine::UniformChoice(IdxType num, IdxType population, IdxType* out,
      for (IdxType i = 0; i < num; ++i)
        out[i] = i;
      for (IdxType i = num; i < population; ++i) {
-        const IdxType j = RandInt(i);
+        const IdxType j = RandInt(i + 1);
        if (j < num)
          out[j] = i;
      }