[Doc] API Doc update for mxnet and tf, remove some degree check (#2028)

* mx tf relconv * use method instead of private attr * src and dst have different fc for gat * update edgeconv * change sage and sgconv * no degree check on gin * add remainding API doc * fix pylint * infer fc_src and fc_dst, only one tensor for block * fix pytest

[Doc] API Doc update for mxnet and tf, remove some degree check (#2028)
* mx tf relconv * use method instead of private attr * src and dst have different fc for gat * update edgeconv * change sage and sgconv * no degree check on gin * add remainding API doc * fix pylint * infer fc_src and fc_dst, only one tensor for block * fix pytest
b1e69105 · Tianjun Xiao · GitHub · 4f1da61b · b1e69105 · b1e69105
Unverified Commit b1e69105 authored Aug 17, 2020 by Tianjun Xiao Committed by GitHub Aug 17, 2020
20 changed files
--- a/examples/pytorch/ogb/cluster-gat/main.py
+++ b/examples/pytorch/ogb/cluster-gat/main.py
@@ -100,9 +100,9 @@ class GAT(nn.Module):
                h = x[input_nodes].to(device)
                h_dst = h[:block.number_of_dst_nodes()].to(device)
                if l < self.n_layers - 1:
-                   h = layer(block, (h, h_dst)).flatten(1)
+                   h = layer(block, h).flatten(1)
                else:
-                    h = layer(block, (h, h_dst))
+                    h = layer(block, h)
                    h = h.mean(1)
                    h = h.log_softmax(dim=-1)


--- a/python/dgl/nn/mxnet/conv/agnnconv.py
+++ b/python/dgl/nn/mxnet/conv/agnnconv.py
@@ -6,11 +6,16 @@ from mxnet.gluon import nn
 from .... import function as fn
 from ....ops import edge_softmax
 from ..utils import normalize
+from ....base import DGLError
 from ....utils import expand_as_pair


 class AGNNConv(nn.Block):
-    r"""Attention-based Graph Neural Network layer from paper `Attention-based
+    r"""
+
+    Description
+    -----------
+    Attention-based Graph Neural Network layer from paper `Attention-based
    Graph Neural Network for Semi-Supervised Learning
    <https://arxiv.org/abs/1803.03735>`__.

@@ -22,25 +27,91 @@ class AGNNConv(nn.Block):
    .. math::
        P_{ij} = \mathrm{softmax}_i ( \beta \cdot \cos(h_i^l, h_j^l))

+    where :math:`\beta` is a single scalar parameter.
+
    Parameters
    ----------
    init_beta : float, optional
-        The :math:`\beta` in the formula.
+        The :math:`\beta` in the formula, a single scalar parameter.
    learn_beta : bool, optional
        If True, :math:`\beta` will be learnable parameter.
+    allow_zero_in_degree : bool, optional
+        If there are 0-in-degree nodes in the graph, output for those nodes will be invalid
+        since no message will be passed to those nodes. This is harmful for some applications
+        causing silent performance regression. This module will raise a DGLError if it detects
+        0-in-degree nodes in input graph. By setting ``True``, it will suppress the check
+        and let the users handle it by themselves. Default: ``False``.
+
+    Notes
+    -----
+    Zero in-degree nodes will lead to invalid output value. This is because no message
+    will be passed to those nodes, the aggregation function will be applied on empty input.
+    A common practice to avoid this is to add a self-loop for each node in the graph if
+    it is homogeneous, which can be achieved by:
+
+    >>> g = ... # a DGLGraph
+    >>> g = dgl.add_self_loop(g)
+
+    Calling ``add_self_loop`` will not work for some graphs, for example, heterogeneous graph
+    since the edge type can not be decided for self_loop edges. Set ``allow_zero_in_degree``
+    to ``True`` for those cases to unblock the code and handle zero-in-degree nodes manually.
+    A common practice to handle this is to filter out the nodes with zero in-degree when using
+    the output of the conv layer.
+
+    Example
+    -------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from dgl.nn import AGNNConv
+    >>>
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> g = dgl.add_self_loop(g)
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = AGNNConv()
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat)
+    >>> res
+    [[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
+    [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
+    [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
+    [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
+    [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
+    [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
+    <NDArray 6x10 @cpu(0)>
    """
    def __init__(self,
                 init_beta=1.,
-                 learn_beta=True):
+                 learn_beta=True,
+                 allow_zero_in_degree=False):
        super(AGNNConv, self).__init__()
+        self._allow_zero_in_degree = allow_zero_in_degree
        with self.name_scope():
            self.beta = self.params.get('beta',
                                        shape=(1,),
                                        grad_req='write' if learn_beta else 'null',
                                        init=mx.init.Constant(init_beta))

+    def set_allow_zero_in_degree(self, set_value):
+        r"""
+
+        Description
+        -----------
+        Set allow_zero_in_degree flag.
+
+        Parameters
+        ----------
+        set_value : bool
+            The value to be set to the flag.
+        """
+        self._allow_zero_in_degree = set_value
+
    def forward(self, graph, feat):
-        r"""Compute AGNN Layer.
+        r"""
+
+        Description
+        -----------
+        Compute AGNN layer.

        Parameters
        ----------
@@ -50,7 +121,7 @@ class AGNNConv(nn.Block):
            The input feature of shape :math:`(N, *)` :math:`N` is the
            number of nodes, and :math:`*` could be of any shape.
            If a pair of mxnet.NDArray is given, the pair must contain two tensors of shape
-            :math:`(N_{in}, *)` and :math:`(N_{out}, *})`, the the :math:`*` in the later
+            :math:`(N_{in}, *)` and :math:`(N_{out}, *)`, the :math:`*` in the latter
            tensor must equal the previous one.

        Returns
@@ -58,8 +129,27 @@ class AGNNConv(nn.Block):
        mxnet.NDArray
            The output feature of shape :math:`(N, *)` where :math:`*`
            should be the same as input shape.
+
+        Raises
+        ------
+        DGLError
+            If there are 0-in-degree nodes in the input graph, it will raise DGLError
+            since no message will be passed to those nodes. This will cause invalid output.
+            The error can be ignored by setting ``allow_zero_in_degree`` parameter to ``True``.
        """
        with graph.local_scope():
+            if not self._allow_zero_in_degree:
+                if graph.in_degrees().min() == 0:
+                    raise DGLError('There are 0-in-degree nodes in the graph, '
+                                   'output for those nodes will be invalid. '
+                                   'This is harmful for some applications, '
+                                   'causing silent performance regression. '
+                                   'Adding self-loop on the input graph by '
+                                   'calling `g = dgl.add_self_loop(g)` will resolve '
+                                   'the issue. Setting ``allow_zero_in_degree`` '
+                                   'to be `True` when constructing this module will '
+                                   'suppress the check and let the code run.')
+
            feat_src, feat_dst = expand_as_pair(feat, graph)
            graph.srcdata['h'] = feat_src
            graph.srcdata['norm_h'] = normalize(feat_src, p=2, axis=-1)

--- a/python/dgl/nn/mxnet/conv/appnpconv.py
+++ b/python/dgl/nn/mxnet/conv/appnpconv.py
@@ -7,25 +7,58 @@ from mxnet.gluon import nn
 from .... import function as fn

 class APPNPConv(nn.Block):
-    r"""Approximate Personalized Propagation of Neural Predictions
+    r"""
+
+    Description
+    -----------
+    Approximate Personalized Propagation of Neural Predictions
    layer from paper `Predict then Propagate: Graph Neural Networks
    meet Personalized PageRank <https://arxiv.org/pdf/1810.05997.pdf>`__.

    .. math::
-        H^{0} & = X
+        H^{0} &= X
+
+        H^{l+1} &= (1-\alpha)\left(\tilde{D}^{-1/2}
+        \tilde{A} \tilde{D}^{-1/2} H^{l}\right) + \alpha H^{0}

-        H^{t+1} & = (1-\alpha)\left(\hat{D}^{-1/2}
-        \hat{A} \hat{D}^{-1/2} H^{t}\right) + \alpha H^{0}
+    where :math:`\tilde{A}` is :math:`A` + :math:`I`.

    Parameters
    ----------
    k : int
-        Number of iterations :math:`K`.
+        The number of iterations :math:`K`.
    alpha : float
        The teleport probability :math:`\alpha`.
    edge_drop : float, optional
-        Dropout rate on edges that controls the
+        The dropout rate on edges that controls the
        messages received by each node. Default: ``0``.
+
+    Example
+    -------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from dgl.nn import APPNPConv
+    >>>
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = APPNPConv(k=3, alpha=0.5)
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat)
+    >>> res
+    [[1.         1.         1.         1.         1.         1.
+    1.         1.         1.         1.        ]
+    [1.         1.         1.         1.         1.         1.
+    1.         1.         1.         1.        ]
+    [1.         1.         1.         1.         1.         1.
+    1.         1.         1.         1.        ]
+    [1.0303301  1.0303301  1.0303301  1.0303301  1.0303301  1.0303301
+    1.0303301  1.0303301  1.0303301  1.0303301 ]
+    [0.86427665 0.86427665 0.86427665 0.86427665 0.86427665 0.86427665
+    0.86427665 0.86427665 0.86427665 0.86427665]
+    [0.5        0.5        0.5        0.5        0.5        0.5
+    0.5        0.5        0.5        0.5       ]]
+    <NDArray 6x10 @cpu(0)>
    """
    def __init__(self,
                 k,
@@ -38,14 +71,18 @@ class APPNPConv(nn.Block):
            self.edge_drop = nn.Dropout(edge_drop)

    def forward(self, graph, feat):
-        r"""Compute APPNP layer.
+        r"""
+
+        Description
+        -----------
+        Compute APPNP layer.

        Parameters
        ----------
        graph : DGLGraph
            The graph.
        feat : mx.NDArray
-            The input feature of shape :math:`(N, *)` :math:`N` is the
+            The input feature of shape :math:`(N, *)`. :math:`N` is the
            number of nodes, and :math:`*` could be of any shape.

        Returns

--- a/python/dgl/nn/mxnet/conv/chebconv.py
+++ b/python/dgl/nn/mxnet/conv/chebconv.py
@@ -9,7 +9,11 @@ from .... import laplacian_lambda_max, broadcast_nodes, function as fn


 class ChebConv(nn.Block):
-    r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
+    r"""
+
+    Description
+    -----------
+    Chebyshev Spectral Graph Convolution layer from paper `Convolutional
    Neural Networks on Graphs with Fast Localized Spectral Filtering
    <https://arxiv.org/pdf/1606.09375.pdf>`__.

@@ -18,22 +22,52 @@ class ChebConv(nn.Block):

        Z^{0, l} &= H^{l}

-        Z^{1, l} &= \hat{L} \cdot H^{l}
+        Z^{1, l} &= \tilde{L} \cdot H^{l}
+
+        Z^{k, l} &= 2 \cdot \tilde{L} \cdot Z^{k-1, l} - Z^{k-2, l}
+
+        \tilde{L} &= 2\left(I - \tilde{D}^{-1/2} \tilde{A} \tilde{D}^{-1/2}\right)/\lambda_{max} - I

-        Z^{k, l} &= 2 \cdot \hat{L} \cdot Z^{k-1, l} - Z^{k-2, l}
+    where :math:`\tilde{A}` is :math:`A` + :math:`I`, :math:`W` is learnable weight.

-        \hat{L} &= 2\left(I - \hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2}\right)/\lambda_{max} - I

    Parameters
    ----------
    in_feats: int
-        Number of input features.
+        Dimension of input features; i.e., the number of dimensions of :math:`h_i^{(l)}`.
    out_feats: int
-        Number of output features.
+        Dimension of output features :math:`h_i^{(l+1)}`.
    k : int
-        Chebyshev filter size.
+        Chebyshev filter size :math:`K`.
+    activation : function, optional
+        Activation function. Default: ``ReLU``.
    bias : bool, optional
        If True, adds a learnable bias to the output. Default: ``True``.
+
+    Note
+    ----
+    ChebConv only supports DGLGraph as input for now. Heterograph input will report an error. To be fixed.
+
+    Example
+    -------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from dgl.nn import ChebConv
+    >>>
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = ChebConv(10, 2, 2)
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat)
+    >>> res
+    [[ 0.832592   -0.738757  ]
+    [ 0.832592   -0.738757  ]
+    [ 0.832592   -0.738757  ]
+    [ 0.43377423 -1.0455742 ]
+    [ 1.1145986  -0.5218046 ]
+    [ 1.7954229   0.00196505]]
+    <NDArray 6x2 @cpu(0)>
    """
    def __init__(self,
                 in_feats,
@@ -59,7 +93,11 @@ class ChebConv(nn.Block):
                self.bias = None

    def forward(self, graph, feat, lambda_max=None):
-        r"""Compute ChebNet layer.
+        r"""
+
+        Description
+        -----------
+        Compute ChebNet layer.

        Parameters
        ----------
@@ -68,7 +106,7 @@ class ChebConv(nn.Block):
        feat : mxnet.NDArray
            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
            is size of input feature, :math:`N` is the number of nodes.
-        lambda_max : list or mxnet.NDArray or None, optional.
+        lambda_max : list or tensor or None, optional.
            A list(tensor) with length :math:`B`, stores the largest eigenvalue
            of the normalized laplacian of each individual graph in ``graph``,
            where :math:`B` is the batch size of the input graph. Default: None.

--- a/python/dgl/nn/mxnet/conv/densechebconv.py
+++ b/python/dgl/nn/mxnet/conv/densechebconv.py
@@ -7,7 +7,11 @@ from mxnet.gluon import nn


 class DenseChebConv(nn.Block):
-    r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
+    r"""
+
+    Description
+    -----------
+    Chebyshev Spectral Graph Convolution layer from paper `Convolutional
    Neural Networks on Graphs with Fast Localized Spectral Filtering
    <https://arxiv.org/pdf/1606.09375.pdf>`__.

@@ -16,17 +20,19 @@ class DenseChebConv(nn.Block):
    Parameters
    ----------
    in_feats: int
-        Number of input features.
+        Dimension of input features :math:`h_i^{(l)}`.
    out_feats: int
-        Number of output features.
+        Dimension of output features :math:`h_i^{(l+1)}`.
    k : int
        Chebyshev filter size.
+    activation : function, optional
+        Activation function, default is ReLU.
    bias : bool, optional
        If True, adds a learnable bias to the output. Default: ``True``.

    See also
    --------
-    ChebConv
+    `ChebConv <https://docs.dgl.ai/api/python/nn.pytorch.html#chebconv>`__
    """
    def __init__(self,
                 in_feats,
@@ -51,7 +57,11 @@ class DenseChebConv(nn.Block):
                self.bias = None

    def forward(self, adj, feat, lambda_max=None):
-        r"""Compute (Dense) Chebyshev Spectral Graph Convolution layer.
+        r"""
+
+        Description
+        -----------
+        Compute (Dense) Chebyshev Spectral Graph Convolution layer.

        Parameters
        ----------

--- a/python/dgl/nn/mxnet/conv/densegraphconv.py
+++ b/python/dgl/nn/mxnet/conv/densegraphconv.py
@@ -7,7 +7,11 @@ from mxnet.gluon import nn


 class DenseGraphConv(nn.Block):
-    """Graph Convolutional Network layer where the graph structure
+    """
+
+    Description
+    -----------
+    Graph Convolutional Network layer where the graph structure
    is given by an adjacency matrix.
    We recommend user to use this module when applying graph convolution on
    dense graphs.
@@ -15,23 +19,29 @@ class DenseGraphConv(nn.Block):
    Parameters
    ----------
    in_feats : int
-        Input feature size.
+        Input feature size; i.e., the number of dimensions of :math:`h_j^{(l)}`.
    out_feats : int
-        Output feature size.
+        Output feature size; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
    norm : str, optional
        How to apply the normalizer. If is `'right'`, divide the aggregated messages
        by each node's in-degrees, which is equivalent to averaging the received messages.
        If is `'none'`, no normalization is applied. Default is `'both'`,
        where the :math:`c_{ij}` in the paper is applied.
-    bias : bool
+    bias : bool, optional
        If True, adds a learnable bias to the output. Default: ``True``.
    activation : callable activation function/layer or None, optional
        If not None, applies an activation function to the updated node features.
        Default: ``None``.

+    Notes
+    -----
+    Zero in-degree nodes will lead to all-zero output. A common practice
+    to avoid this is to add a self-loop for each node in the graph,
+    which can be achieved by setting the diagonal of the adjacency matrix to be 1.
+
    See also
    --------
-    GraphConv
+    `GraphConv <https://docs.dgl.ai/api/python/nn.pytorch.html#graphconv>`__
    """
    def __init__(self,
                 in_feats,
@@ -54,7 +64,11 @@ class DenseGraphConv(nn.Block):
            self._activation = activation

    def forward(self, adj, feat):
-        r"""Compute (Dense) Graph Convolution layer.
+        r"""
+
+        Description
+        -----------
+        Compute (Dense) Graph Convolution layer.

        Parameters
        ----------
@@ -65,7 +79,7 @@ class DenseGraphConv(nn.Block):
            graph, ``adj`` should be of shape :math:`(N, N)`. In both cases,
            a row represents a destination node while a column represents a source
            node.
-        feat : torch.Tensor
+        feat : mxnet.NDArray
            The input feature.

        Returns

--- a/python/dgl/nn/mxnet/conv/densesageconv.py
+++ b/python/dgl/nn/mxnet/conv/densesageconv.py
@@ -8,18 +8,22 @@ from ....utils import check_eq_shape


 class DenseSAGEConv(nn.Block):
-    """GraphSAGE layer where the graph structure is given by an
+    """
+
+    Description
+    -----------
+    GraphSAGE layer where the graph structure is given by an
    adjacency matrix.
-    We recommend to use this module when applying GraphSAGE on dense graphs.
+    We recommend to use this module when applying GraphSAGE on dense graphs.

    Note that we only support gcn aggregator in DenseSAGEConv.

    Parameters
    ----------
    in_feats : int
-        Input feature size.
+        Input feature size; i.e., the number of dimensions of :math:`h_i^{(l)}`.
    out_feats : int
-        Output feature size.
+        Output feature size; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
    feat_drop : float, optional
        Dropout rate on features. Default: 0.
    bias : bool
@@ -32,7 +36,7 @@ class DenseSAGEConv(nn.Block):

    See also
    --------
-    SAGEConv
+    `SAGEConv <https://docs.dgl.ai/api/python/nn.pytorch.html#sageconv>`__
    """
    def __init__(self,
                 in_feats,
@@ -52,7 +56,11 @@ class DenseSAGEConv(nn.Block):
                               weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)))

    def forward(self, adj, feat):
-        r"""Compute (Dense) Graph SAGE layer.
+        r"""
+
+        Description
+        -----------
+        Compute (Dense) Graph SAGE layer.

        Parameters
        ----------
@@ -64,11 +72,10 @@ class DenseSAGEConv(nn.Block):
            a row represents a destination node while a column represents a source
            node.
        feat : mxnet.NDArray or a pair of mxnet.NDArray
-            If a mxnet.NDArray is given, the input feature of shape :math:`(N, D_{in})`
-            where :math:`D_{in}` is size of input feature, :math:`N` is the number of
-            nodes.
-            If a pair of torch.Tensor is given, the pair must contain two tensors of
-            shape :math:`(N_{in}, D_{in})` and :math:`(N_{out}, D_{in})`.
+            If a mxnet.NDArray is given, the input feature of shape :math:`(N, D_{in})` where
+            :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
+            If a pair of mxnet.NDArray is given, the pair must contain two tensors of shape
+            :math:`(N_{in}, D_{in})` and :math:`(N_{out}, D_{in})`.

        Returns
        -------

--- a/python/dgl/nn/mxnet/conv/edgeconv.py
+++ b/python/dgl/nn/mxnet/conv/edgeconv.py
@@ -4,36 +4,106 @@ import mxnet as mx
 from mxnet.gluon import nn

 from .... import function as fn
+from ....base import DGLError
 from ....utils import expand_as_pair


 class EdgeConv(nn.Block):
-    r"""EdgeConv layer.
+    r"""
+
+    Description
+    -----------
+    EdgeConv layer.

    Introduced in "`Dynamic Graph CNN for Learning on Point Clouds
    <https://arxiv.org/pdf/1801.07829>`__".  Can be described as follows:

    .. math::
-       x_i^{(l+1)} = \max_{j \in \mathcal{N}(i)} \mathrm{ReLU}(
-       \Theta \cdot (x_j^{(l)} - x_i^{(l)}) + \Phi \cdot x_i^{(l)})
+       h_i^{(l+1)} = \max_{j \in \mathcal{N}(i)} \mathrm{ReLU}(
+       \Theta \cdot (h_j^{(l)} - h_i^{(l)}) + \Phi \cdot h_i^{(l)})

    where :math:`\mathcal{N}(i)` is the neighbor of :math:`i`.
+    :math:`\Theta` and :math:`\Phi` are linear layers.

    Parameters
    ----------
    in_feat : int
-        Input feature size.
+        Input feature size; i.e., the number of dimensions of :math:`h_j^{(l)}`.
    out_feat : int
-        Output feature size.
+        Output feature size; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
    batch_norm : bool
-        Whether to include batch normalization on messages.
+        Whether to include batch normalization on messages. Default: ``False``.
+    allow_zero_in_degree : bool, optional
+        If there are 0-in-degree nodes in the graph, output for those nodes will be invalid
+        since no message will be passed to those nodes. This is harmful for some applications
+        causing silent performance regression. This module will raise a DGLError if it detects
+        0-in-degree nodes in input graph. By setting ``True``, it will suppress the check
+        and let the users handle it by themselves. Default: ``False``.
+
+    Notes
+    -----
+    Zero in-degree nodes will lead to invalid output value. This is because no message
+    will be passed to those nodes, the aggregation function will be applied on empty input.
+    A common practice to avoid this is to add a self-loop for each node in the graph if
+    it is homogeneous, which can be achieved by:
+
+    >>> g = ... # a DGLGraph
+    >>> g = dgl.add_self_loop(g)
+
+    Calling ``add_self_loop`` will not work for some graphs, for example, heterogeneous graph
+    since the edge type can not be decided for self_loop edges. Set ``allow_zero_in_degree``
+    to ``True`` for those cases to unblock the code and handle zero-in-degree nodes manually.
+    A common practice to handle this is to filter out the nodes with zero in-degree when using
+    the output of the conv layer.
+
+    Examples
+    --------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from mxnet import gluon
+    >>> from dgl.nn import EdgeConv
+    >>>
+    >>> # Case 1: Homogeneous graph
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> g = dgl.add_self_loop(g)
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = EdgeConv(10, 2)
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat)
+    >>> res
+    [[1.0517545 0.8091326]
+    [1.0517545 0.8091326]
+    [1.0517545 0.8091326]
+    [1.0517545 0.8091326]
+    [1.0517545 0.8091326]
+    [1.0517545 0.8091326]]
+    <NDArray 6x2 @cpu(0)>
+
+    >>> # Case 2: Unidirectional bipartite graph
+    >>> u = [0, 1, 0, 0, 1]
+    >>> v = [0, 1, 2, 3, 2]
+    >>> g = dgl.bipartite((u, v))
+    >>> u_fea = mx.nd.random.randn(2, 5)
+    >>> v_fea = mx.nd.random.randn(4, 5)
+    >>> conv = EdgeConv(5, 2, 3)
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, (u_fea, v_fea))
+    >>> res
+    [[-3.4617817   0.84700686]
+    [ 1.3170856  -1.5731761 ]
+    [-2.0761423   0.56653017]
+    [-1.015364    0.78919804]]
+    <NDArray 4x2 @cpu(0)>
    """
    def __init__(self,
                 in_feat,
                 out_feat,
-                 batch_norm=False):
+                 batch_norm=False,
+                 allow_zero_in_degree=False):
        super(EdgeConv, self).__init__()
        self.batch_norm = batch_norm
+        self._allow_zero_in_degree = allow_zero_in_degree

        with self.name_scope():
            self.theta = nn.Dense(out_feat, in_units=in_feat,
@@ -51,26 +121,64 @@ class EdgeConv(nn.Block):
        phi_x = self.phi(edges.src['x'])
        return {'e': theta_x + phi_x}

+    def set_allow_zero_in_degree(self, set_value):
+        r"""
+
+        Description
+        -----------
+        Set allow_zero_in_degree flag.
+
+        Parameters
+        ----------
+        set_value : bool
+            The value to be set to the flag.
+        """
+        self._allow_zero_in_degree = set_value
+
    def forward(self, g, h):
-        r"""Forward computation
+        """
+
+        Description
+        -----------
+        Forward computation

        Parameters
        ----------
        g : DGLGraph
            The graph.
-        h : mxnet.NDArray
+        h : mxnet.NDArray or pair of mxnet.NDArray
            :math:`(N, D)` where :math:`N` is the number of nodes and
            :math:`D` is the number of feature dimensions.

-            If a pair of tensors is given, the graph must be a uni-bipartite graph
+            If a pair of mxnet.NDArray is given, the graph must be a uni-bipartite graph
            with only one edge type, and the two tensors must have the same
            dimensionality on all except the first axis.
+
        Returns
        -------
        mxnet.NDArray
            New node features.
+
+        Raises
+        ------
+        DGLError
+            If there are 0-in-degree nodes in the input graph, it will raise DGLError
+            since no message will be passed to those nodes. This will cause invalid output.
+            The error can be ignored by setting ``allow_zero_in_degree`` parameter to ``True``.
        """
        with g.local_scope():
+            if not self._allow_zero_in_degree:
+                if g.in_degrees().min() == 0:
+                    raise DGLError('There are 0-in-degree nodes in the graph, '
+                                   'output for those nodes will be invalid. '
+                                   'This is harmful for some applications, '
+                                   'causing silent performance regression. '
+                                   'Adding self-loop on the input graph by '
+                                   'calling `g = dgl.add_self_loop(g)` will resolve '
+                                   'the issue. Setting ``allow_zero_in_degree`` '
+                                   'to be `True` when constructing this module will '
+                                   'suppress the check and let the code run.')
+
            h_src, h_dst = expand_as_pair(h, g)
            g.srcdata['x'] = h_src
            g.dstdata['x'] = h_dst

--- a/python/dgl/nn/mxnet/conv/gatconv.py
+++ b/python/dgl/nn/mxnet/conv/gatconv.py
@@ -6,12 +6,17 @@ from mxnet.gluon import nn
 from mxnet.gluon.contrib.nn import Identity

 from .... import function as fn
+from ....base import DGLError
 from ....ops import edge_softmax
 from ....utils import expand_as_pair

 #pylint: enable=W0235
 class GATConv(nn.Block):
-    r"""Apply `Graph Attention Network <https://arxiv.org/pdf/1710.10903.pdf>`__
+    r"""
+
+    Description
+    -----------
+    Apply `Graph Attention Network <https://arxiv.org/pdf/1710.10903.pdf>`__
    over an input signal.

    .. math::
@@ -21,29 +26,117 @@ class GATConv(nn.Block):
    node :math:`j`:

    .. math::
-        \alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l})
+        \alpha_{ij}^{l} &= \mathrm{softmax_i} (e_{ij}^{l})

-        e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right)
+        e_{ij}^{l} &= \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right)

    Parameters
    ----------
-    in_feats : int
-        Number of input features.
+    in_feats : int, or pair of ints
+        Input feature size; i.e., the number of dimensions of :math:`h_i^{(l)}`.
+        GATConv can be applied on homogeneous graph and unidirectional
+        `bipartite graph <https://docs.dgl.ai/generated/dgl.bipartite.html?highlight=bipartite>`__.
+        If the layer is to be applied to a unidirectional bipartite graph, ``in_feats``
+        specifies the input feature size on both the source and destination nodes.  If
+        a scalar is given, the source and destination node feature size would take the
+        same value.
    out_feats : int
-        Output feature size.
+        Output feature size; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
    num_heads : int
        Number of heads in Multi-Head Attention.
    feat_drop : float, optional
-        Dropout rate on feature, defaults: ``0``.
+        Dropout rate on feature. Defaults: ``0``.
    attn_drop : float, optional
-        Dropout rate on attention weight, defaults: ``0``.
+        Dropout rate on attention weight. Defaults: ``0``.
    negative_slope : float, optional
-        LeakyReLU angle of negative slope.
+        LeakyReLU angle of negative slope. Defaults: ``0.2``.
    residual : bool, optional
-        If True, use residual connection.
+        If True, use residual connection. Defaults: ``False``.
    activation : callable activation function/layer or None, optional.
        If not None, applies an activation function to the updated node features.
        Default: ``None``.
+    allow_zero_in_degree : bool, optional
+        If there are 0-in-degree nodes in the graph, output for those nodes will be invalid
+        since no message will be passed to those nodes. This is harmful for some applications
+        causing silent performance regression. This module will raise a DGLError if it detects
+        0-in-degree nodes in input graph. By setting ``True``, it will suppress the check
+        and let the users handle it by themselves. Defaults: ``False``.
+
+    Notes
+    -----
+    Zero in-degree nodes will lead to invalid output value. This is because no message
+    will be passed to those nodes, the aggregation function will be applied on empty input.
+    A common practice to avoid this is to add a self-loop for each node in the graph if
+    it is homogeneous, which can be achieved by:
+
+    >>> g = ... # a DGLGraph
+    >>> g = dgl.add_self_loop(g)
+
+    Calling ``add_self_loop`` will not work for some graphs, for example, heterogeneous graph
+    since the edge type can not be decided for self_loop edges. Set ``allow_zero_in_degree``
+    to ``True`` for those cases to unblock the code and handle zero-in-degree nodes manually.
+    A common practice to handle this is to filter out the nodes with zero in-degree when using
+    the output of the conv layer.
+
+    Examples
+    --------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from mxnet import gluon
+    >>> from dgl.nn import GATConv
+    >>>
+    >>> # Case 1: Homogeneous graph
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> g = dgl.add_self_loop(g)
+    >>> feat = mx.nd.ones((6, 10))
+    >>> gatconv = GATConv(10, 2, num_heads=3)
+    >>> gatconv.initialize(ctx=mx.cpu(0))
+    >>> res = gatconv(g, feat)
+    >>> res
+    [[[ 0.32368395 -0.10501936]
+    [ 1.0839728   0.92690575]
+    [-0.54581136 -0.84279203]]
+    [[ 0.32368395 -0.10501936]
+    [ 1.0839728   0.92690575]
+    [-0.54581136 -0.84279203]]
+    [[ 0.32368395 -0.10501936]
+    [ 1.0839728   0.92690575]
+    [-0.54581136 -0.84279203]]
+    [[ 0.32368395 -0.10501937]
+    [ 1.0839728   0.9269058 ]
+    [-0.5458114  -0.8427921 ]]
+    [[ 0.32368395 -0.10501936]
+    [ 1.0839728   0.92690575]
+    [-0.54581136 -0.84279203]]
+    [[ 0.32368395 -0.10501936]
+    [ 1.0839728   0.92690575]
+    [-0.54581136 -0.84279203]]]
+    <NDArray 6x3x2 @cpu(0)>
+
+    >>> # Case 2: Unidirectional bipartite graph
+    >>> u = [0, 1, 0, 0, 1]
+    >>> v = [0, 1, 2, 3, 2]
+    >>> g = dgl.bipartite((u, v))
+    >>> u_feat = mx.nd.random.randn(2, 5)
+    >>> v_feat = mx.nd.random.randn(4, 10)
+    >>> gatconv = GATConv((5,10), 2, 3)
+    >>> gatconv.initialize(ctx=mx.cpu(0))
+    >>> res = gatconv(g, (u_feat, v_feat))
+    >>> res
+    [[[-1.01624     1.8138596 ]
+    [ 1.2322129  -0.8410206 ]
+    [-1.9325689   1.3824553 ]]
+    [[ 0.9915016  -1.6564168 ]
+    [-0.32610354  0.42505783]
+    [ 1.5278397  -0.92114615]]
+    [[-0.32592064  0.62067866]
+    [ 0.6162219  -0.3405491 ]
+    [-1.356375    0.9988818 ]]
+    [[-1.01624     1.8138596 ]
+    [ 1.2322129  -0.8410206 ]
+    [-1.9325689   1.3824553 ]]]
+    <NDArray 4x3x2 @cpu(0)>
    """
    def __init__(self,
                 in_feats,
@@ -53,16 +146,26 @@ class GATConv(nn.Block):
                 attn_drop=0.,
                 negative_slope=0.2,
                 residual=False,
-                 activation=None):
+                 activation=None,
+                 allow_zero_in_degree=False):
        super(GATConv, self).__init__()
        self._num_heads = num_heads
        self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats)
        self._in_feats = in_feats
        self._out_feats = out_feats
+        self._allow_zero_in_degree = allow_zero_in_degree
        with self.name_scope():
-            self.fc = nn.Dense(out_feats * num_heads, use_bias=False,
-                               weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)),
-                               in_units=in_feats)
+            if isinstance(in_feats, tuple):
+                self.fc_src = nn.Dense(out_feats * num_heads, use_bias=False,
+                                       weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)),
+                                       in_units=self._in_src_feats)
+                self.fc_dst = nn.Dense(out_feats * num_heads, use_bias=False,
+                                       weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)),
+                                       in_units=self._in_dst_feats)
+            else:
+                self.fc = nn.Dense(out_feats * num_heads, use_bias=False,
+                                   weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)),
+                                   in_units=in_feats)
            self.attn_l = self.params.get('attn_l',
                                          shape=(1, num_heads, out_feats),
                                          init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
@@ -84,33 +187,71 @@ class GATConv(nn.Block):
                self.res_fc = None
            self.activation = activation

+    def set_allow_zero_in_degree(self, set_value):
+        r"""
+
+        Description
+        -----------
+        Set allow_zero_in_degree flag.
+
+        Parameters
+        ----------
+        set_value : bool
+            The value to be set to the flag.
+        """
+        self._allow_zero_in_degree = set_value
+
    def forward(self, graph, feat):
-        r"""Compute graph attention network layer.
+        r"""
+
+        Description
+        -----------
+        Compute graph attention network layer.

        Parameters
        ----------
        graph : DGLGraph
            The graph.
-        feat : mxnet.NDArray
-            If a mxnet.NDArray is given, the input feature of shape :math:`(N, D_{in})`
-            where :math:`D_{in}` is size of input feature, :math:`N` is the number of
-            nodes.
-            If a pair of mxnet.NDArray is given, the pair must contain two tensors of
-            shape :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`.
+        feat : mxnet.NDArray or pair of mxnet.NDArray
+            If a mxnet.NDArray is given, the input feature of shape :math:`(N, D_{in})` where
+            :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
+            If a pair of mxnet.NDArray is given, the pair must contain two tensors of shape
+            :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`.

        Returns
        -------
        mxnet.NDArray
            The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
            is the number of heads, and :math:`D_{out}` is size of output feature.
+
+        Raises
+        ------
+        DGLError
+            If there are 0-in-degree nodes in the input graph, it will raise DGLError
+            since no message will be passed to those nodes. This will cause invalid output.
+            The error can be ignored by setting ``allow_zero_in_degree`` parameter to ``True``.
        """
        with graph.local_scope():
+            if not self._allow_zero_in_degree:
+                if graph.in_degrees().min() == 0:
+                    raise DGLError('There are 0-in-degree nodes in the graph, '
+                                   'output for those nodes will be invalid. '
+                                   'This is harmful for some applications, '
+                                   'causing silent performance regression. '
+                                   'Adding self-loop on the input graph by '
+                                   'calling `g = dgl.add_self_loop(g)` will resolve '
+                                   'the issue. Setting ``allow_zero_in_degree`` '
+                                   'to be `True` when constructing this module will '
+                                   'suppress the check and let the code run.')
+
            if isinstance(feat, tuple):
                h_src = self.feat_drop(feat[0])
                h_dst = self.feat_drop(feat[1])
-                feat_src = self.fc(h_src).reshape(
+                if not hasattr(self, 'fc_src'):
+                    self.fc_src, self.fc_dst = self.fc, self.fc
+                feat_src = self.fc_src(h_src).reshape(
                    -1, self._num_heads, self._out_feats)
-                feat_dst = self.fc(h_dst).reshape(
+                feat_dst = self.fc_dst(h_dst).reshape(
                    -1, self._num_heads, self._out_feats)
            else:
                h_src = h_dst = self.feat_drop(feat)

--- a/python/dgl/nn/mxnet/conv/ginconv.py
+++ b/python/dgl/nn/mxnet/conv/ginconv.py
@@ -8,7 +8,11 @@ from ....utils import expand_as_pair


 class GINConv(nn.Block):
-    r"""Graph Isomorphism Network layer from paper `How Powerful are Graph
+    r"""
+
+    Description
+    -----------
+    Graph Isomorphism Network layer from paper `How Powerful are Graph
    Neural Networks? <https://arxiv.org/pdf/1810.00826.pdf>`__.

    .. math::
@@ -26,7 +30,37 @@ class GINConv(nn.Block):
    init_eps : float, optional
        Initial :math:`\epsilon` value, default: ``0``.
    learn_eps : bool, optional
-        If True, :math:`\epsilon` will be a learnable parameter.
+        If True, :math:`\epsilon` will be a learnable parameter. Default: ``False``.
+
+    Examples
+    --------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from mxnet import gluon
+    >>> from dgl.nn import GINConv
+    >>>
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> feat = mx.nd.ones((6, 10))
+    >>> lin = gluon.nn.Dense(10)
+    >>> lin.initialize(ctx=mx.cpu(0))
+    >>> conv = GINConv(lin, 'max')
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat)
+    >>> res
+    [[ 0.44832918 -0.05283341  0.20823681  0.16020004  0.37311912 -0.03372726
+    -0.05716725 -0.20730163  0.14121324  0.46083626]
+    [ 0.44832918 -0.05283341  0.20823681  0.16020004  0.37311912 -0.03372726
+    -0.05716725 -0.20730163  0.14121324  0.46083626]
+    [ 0.44832918 -0.05283341  0.20823681  0.16020004  0.37311912 -0.03372726
+    -0.05716725 -0.20730163  0.14121324  0.46083626]
+    [ 0.44832918 -0.05283341  0.20823681  0.16020004  0.37311912 -0.03372726
+    -0.05716725 -0.20730163  0.14121324  0.46083626]
+    [ 0.44832918 -0.05283341  0.20823681  0.16020004  0.37311912 -0.03372726
+    -0.05716725 -0.20730163  0.14121324  0.46083626]
+    [ 0.22416459 -0.0264167   0.10411841  0.08010002  0.18655956 -0.01686363
+    -0.02858362 -0.10365082  0.07060662  0.23041813]]
+    <NDArray 6x10 @cpu(0)>
    """
    def __init__(self,
                 apply_func,
@@ -51,18 +85,21 @@ class GINConv(nn.Block):
                                       init=mx.init.Constant(init_eps))

    def forward(self, graph, feat):
-        r"""Compute Graph Isomorphism Network layer.
+        r"""
+
+        Description
+        -----------
+        Compute Graph Isomorphism Network layer.

        Parameters
        ----------
        graph : DGLGraph
            The graph.
        feat : mxnet.NDArray or a pair of mxnet.NDArray
-            If a mxnet.NDArray is given, the input feature of shape :math:`(N, D_{in})`
-            where :math:`D_{in}` is size of input feature, :math:`N` is the number of
-            nodes.
-            If a pair of mxnet.NDArray is given, the pair must contain two tensors of
-            shape :math:`(N_{in}, D_{in})` and :math:`(N_{out}, D_{in})`.
+            If a mxnet.NDArray is given, the input feature of shape :math:`(N, D_{in})` where
+            :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
+            If a pair of mxnet.NDArray is given, the pair must contain two tensors of shape
+            :math:`(N_{in}, D_{in})` and :math:`(N_{out}, D_{in})`.
            If ``apply_func`` is not None, :math:`D_{in}` should
            fit the input dimensionality requirement of ``apply_func``.


--- a/python/dgl/nn/mxnet/conv/gmmconv.py
+++ b/python/dgl/nn/mxnet/conv/gmmconv.py
@@ -7,33 +7,39 @@ from mxnet.gluon import nn
 from mxnet.gluon.contrib.nn import Identity

 from .... import function as fn
+from ....base import DGLError
 from ....utils import expand_as_pair


 class GMMConv(nn.Block):
-    r"""The Gaussian Mixture Model Convolution layer from `Geometric Deep
+    r"""
+
+    Description
+    -----------
+    The Gaussian Mixture Model Convolution layer from `Geometric Deep
    Learning on Graphs and Manifolds using Mixture Model CNNs
    <http://openaccess.thecvf.com/content_cvpr_2017/papers/Monti_Geometric_Deep_Learning_CVPR_2017_paper.pdf>`__.

    .. math::
-        h_i^{l+1} & = \mathrm{aggregate}\left(\left\{\frac{1}{K}
+        u_{ij} &= f(x_i, x_j), x_j \in \mathcal{N}(i)
+
+        w_k(u) &= \exp\left(-\frac{1}{2}(u-\mu_k)^T \Sigma_k^{-1} (u - \mu_k)\right)
+
+        h_i^{l+1} &= \mathrm{aggregate}\left(\left\{\frac{1}{K}
         \sum_{k}^{K} w_k(u_{ij}), \forall j\in \mathcal{N}(i)\right\}\right)

-        w_k(u) & = \exp\left(-\frac{1}{2}(u-\mu_k)^T \Sigma_k^{-1} (u - \mu_k)\right)
+    where :math:`u` denotes the pseudo-coordinates between a vertex and one of its neighbors,
+    computed using function :math:`f`, :math:`\Sigma_k^{-1}` and :math:`\mu_k` are
+    learnable parameters representing the covariance matrix and mean vector of a Gaussian kernel.

    Parameters
    ----------
-    in_feats : int, or pair of ints
-        Number of input features.
-
-        If the layer is to be applied on a unidirectional bipartite graph, ``in_feats``
-        specifies the input feature size on both the source and destination nodes.  If
-        a scalar is given, the source and destination node feature size would take the
-        same value.
+    in_feats : int, or pair of ints
+        Number of input features; i.e., the number of dimensions of :math:`x_i`.
    out_feats : int
-        Number of output features.
+        Number of output features; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
    dim : int
-        Dimensionality of pseudo-coordinte.
+        Dimensionality of pseudo-coordinate; i.e., the number of dimensions of :math:`u_{ij}`.
    n_kernels : int
        Number of kernels :math:`K`.
    aggregator_type : str
@@ -42,6 +48,69 @@ class GMMConv(nn.Block):
        If True, use residual connection inside this layer. Default: ``False``.
    bias : bool
        If True, adds a learnable bias to the output. Default: ``True``.
+    allow_zero_in_degree : bool, optional
+        If there are 0-in-degree nodes in the graph, output for those nodes will be invalid
+        since no message will be passed to those nodes. This is harmful for some applications
+        causing silent performance regression. This module will raise a DGLError if it detects
+        0-in-degree nodes in input graph. By setting ``True``, it will suppress the check
+        and let the users handle it by themselves. Default: ``False``.
+
+    Notes
+    -----
+    Zero in-degree nodes will lead to invalid output value. This is because no message
+    will be passed to those nodes, so the aggregation function will be applied on empty input.
+    A common practice to avoid this is to add a self-loop for each node in the graph if
+    it is homogeneous, which can be achieved by:
+
+    >>> g = ... # a DGLGraph
+    >>> g = dgl.add_self_loop(g)
+
+    Calling ``add_self_loop`` will not work for some graphs, for example, heterogeneous graphs,
+    since the edge type cannot be decided for self-loop edges. Set ``allow_zero_in_degree``
+    to ``True`` for those cases to unblock the code and handle zero-in-degree nodes manually.
+    A common practice to handle this is to filter out the nodes with zero-in-degree when the
+    output is used after conv.
+
+    Examples
+    --------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from dgl.nn import GMMConv
+    >>>
+    >>> # Case 1: Homogeneous graph
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> g = dgl.add_self_loop(g)
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = GMMConv(10, 2, 3, 2, 'mean')
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> pseudo = mx.nd.ones((12, 3))
+    >>> res = conv(g, feat, pseudo)
+    >>> res
+    [[-0.05083769 -0.1567954 ]
+    [-0.05083769 -0.1567954 ]
+    [-0.05083769 -0.1567954 ]
+    [-0.05083769 -0.1567954 ]
+    [-0.05083769 -0.1567954 ]
+    [-0.05083769 -0.1567954 ]]
+    <NDArray 6x2 @cpu(0)>
+
+    >>> # Case 2: Unidirectional bipartite graph
+    >>> u = [0, 1, 0, 0, 1]
+    >>> v = [0, 1, 2, 3, 2]
+    >>> g = dgl.bipartite((u, v))
+    >>> u_fea = mx.nd.random.randn(2, 5)
+    >>> v_fea = mx.nd.random.randn(4, 10)
+    >>> pseudo = mx.nd.ones((5, 3))
+    >>> conv = GMMConv((5, 10), 2, 3, 2, 'mean')
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, (u_fea, v_fea), pseudo)
+    >>> res
+    [[-0.1005067  -0.09494358]
+    [-0.0023314  -0.07597432]
+    [-0.05141905 -0.08545895]
+    [-0.1005067  -0.09494358]]
+    <NDArray 4x2 @cpu(0)>
    """
    def __init__(self,
                 in_feats,
@@ -50,13 +119,15 @@ class GMMConv(nn.Block):
                 n_kernels,
                 aggregator_type='sum',
                 residual=False,
-                 bias=True):
+                 bias=True,
+                 allow_zero_in_degree=False):
        super(GMMConv, self).__init__()

        self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats)
        self._out_feats = out_feats
        self._dim = dim
        self._n_kernels = n_kernels
+        self._allow_zero_in_degree = allow_zero_in_degree
        if aggregator_type == 'sum':
            self._reducer = fn.sum
        elif aggregator_type == 'mean':
@@ -92,8 +163,26 @@ class GMMConv(nn.Block):
            else:
                self.bias = None

+    def set_allow_zero_in_degree(self, set_value):
+        r"""
+
+        Description
+        -----------
+        Set allow_zero_in_degree flag.
+
+        Parameters
+        ----------
+        set_value : bool
+            The value to be set to the flag.
+        """
+        self._allow_zero_in_degree = set_value
+
    def forward(self, graph, feat, pseudo):
-        """Compute Gaussian Mixture Model Convolution layer.
+        """
+
+        Description
+        -----------
+        Compute Gaussian Mixture Model Convolution layer.

        Parameters
        ----------
@@ -114,7 +203,26 @@ class GMMConv(nn.Block):
        mxnet.NDArray
            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
            is the output feature size.
+
+        Raises
+        ------
+        DGLError
+            If there are 0-in-degree nodes in the input graph, it will raise DGLError
+            since no message will be passed to those nodes. This will cause invalid output.
+            The error can be ignored by setting ``allow_zero_in_degree`` parameter to ``True``.
        """
+        if not self._allow_zero_in_degree:
+            if graph.in_degrees().min() == 0:
+                raise DGLError('There are 0-in-degree nodes in the graph, '
+                               'output for those nodes will be invalid. '
+                               'This is harmful for some applications, '
+                               'causing silent performance regression. '
+                               'Adding self-loop on the input graph by '
+                               'calling `g = dgl.add_self_loop(g)` will resolve '
+                               'the issue. Setting ``allow_zero_in_degree`` '
+                               'to be `True` when constructing this module will '
+                               'suppress the check and let the code run.')
+
        feat_src, feat_dst = expand_as_pair(feat, graph)
        with graph.local_scope():
            graph.srcdata['h'] = self.fc(feat_src).reshape(

--- a/python/dgl/nn/mxnet/conv/graphconv.py
+++ b/python/dgl/nn/mxnet/conv/graphconv.py
@@ -159,6 +159,20 @@ class GraphConv(gluon.Block):

        self._activation = activation

+    def set_allow_zero_in_degree(self, set_value):
+        r"""
+
+        Description
+        -----------
+        Set allow_zero_in_degree flag.
+
+        Parameters
+        ----------
+        set_value : bool
+            The value to be set to the flag.
+        """
+        self._allow_zero_in_degree = set_value
+
    def forward(self, graph, feat, weight=None):
        r"""

@@ -204,7 +218,7 @@ class GraphConv(gluon.Block):
        """
        with graph.local_scope():
            if not self._allow_zero_in_degree:
-                if (graph.in_degrees() == 0).asnumpy().any():
+                if graph.in_degrees().min() == 0:
                    raise DGLError('There are 0-in-degree nodes in the graph, '
                                   'output for those nodes will be invalid. '
                                   'This is harmful for some applications, '

--- a/python/dgl/nn/mxnet/conv/nnconv.py
+++ b/python/dgl/nn/mxnet/conv/nnconv.py
@@ -9,24 +9,32 @@ from ....utils import expand_as_pair


 class NNConv(nn.Block):
-    r"""Graph Convolution layer introduced in `Neural Message Passing
+    r"""
+
+    Description
+    -----------
+    Graph Convolution layer introduced in `Neural Message Passing
    for Quantum Chemistry <https://arxiv.org/pdf/1704.01212.pdf>`__.

    .. math::
        h_{i}^{l+1} = h_{i}^{l} + \mathrm{aggregate}\left(\left\{
        f_\Theta (e_{ij}) \cdot h_j^{l}, j\in \mathcal{N}(i) \right\}\right)

+    where :math:`e_{ij}` is the edge feature, :math:`f_\Theta` is a function
+    with learnable parameters.
+
    Parameters
    ----------
-    in_feats : int or pair of ints
-        Input feature size.
-
+    in_feats : int or pair of ints
+        Input feature size; i.e., the number of dimensions of :math:`h_j^{(l)}`.
+        NNConv can be applied on homogeneous graph and unidirectional
+        `bipartite graph <https://docs.dgl.ai/generated/dgl.bipartite.html?highlight=bipartite>`__.
        If the layer is to be applied on a unidirectional bipartite graph, ``in_feats``
        specifies the input feature size on both the source and destination nodes.  If
        a scalar is given, the source and destination node feature size would take the
        same value.
    out_feats : int
-        Output feature size.
+        Output feature size; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
    edge_func : callable activation function/layer
        Maps each edge feature to a vector of shape
        ``(in_feats * out_feats)`` as weight to compute
@@ -38,6 +46,52 @@ class NNConv(nn.Block):
        If True, use residual connection. Default: ``False``.
    bias : bool, optional
        If True, adds a learnable bias to the output. Default: ``True``.
+
+    Examples
+    --------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from mxnet import gluon
+    >>> from dgl.nn import NNConv
+    >>>
+    >>> # Case 1: Homogeneous graph
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> g = dgl.add_self_loop(g)
+    >>> feat = mx.nd.ones((6, 10))
+    >>> lin = gluon.nn.Dense(20)
+    >>> lin.initialize(ctx=mx.cpu(0))
+    >>> def edge_func(efeat):
+    ...     return lin(efeat)
+    >>> efeat = mx.nd.ones((12, 5))
+    >>> conv = NNConv(10, 2, edge_func, 'mean')
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat, efeat)
+    >>> res
+    [[0.39946803 0.32098457]
+    [0.39946803 0.32098457]
+    [0.39946803 0.32098457]
+    [0.39946803 0.32098457]
+    [0.39946803 0.32098457]
+    [0.39946803 0.32098457]]
+    <NDArray 6x2 @cpu(0)>
+
+    >>> # Case 2: Unidirectional bipartite graph
+    >>> u = [0, 1, 0, 0, 1]
+    >>> v = [0, 1, 2, 3, 2]
+    >>> g = dgl.bipartite((u, v))
+    >>> u_feat = mx.nd.random.randn(2, 10)
+    >>> v_feat = mx.nd.random.randn(4, 10)
+    >>> conv = NNConv(10, 2, edge_func, 'mean')
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> efeat = mx.nd.ones((5, 5))
+    >>> res = conv(g, (u_feat, v_feat), efeat)
+    >>> res
+    [[ 0.24425688  0.3238042 ]
+    [-0.11651017 -0.01738572]
+    [ 0.06387337  0.15320925]
+    [ 0.24425688  0.3238042 ]]
+    <NDArray 4x2 @cpu(0)>
    """
    def __init__(self,
                 in_feats,

--- a/python/dgl/nn/mxnet/conv/relgraphconv.py
+++ b/python/dgl/nn/mxnet/conv/relgraphconv.py
@@ -11,7 +11,11 @@ from .. import utils


 class RelGraphConv(gluon.Block):
-    r"""Relational graph convolution layer.
+    r"""
+
+    Description
+    -----------
+    Relational graph convolution layer.

    Relational graph convolution is introduced in "`Modeling Relational Data with Graph
    Convolutional Networks <https://arxiv.org/abs/1703.06103>`__"
@@ -19,8 +23,8 @@ class RelGraphConv(gluon.Block):

    .. math::

-      h_i^{(l+1)} = \sigma(\sum_{r\in\mathcal{R}}
-      \sum_{j\in\mathcal{N}^r(i)}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}+W_0^{(l)}h_i^{(l)})
+       h_i^{(l+1)} = \sigma(\sum_{r\in\mathcal{R}}
+       \sum_{j\in\mathcal{N}^r(i)}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}+W_0^{(l)}h_i^{(l)})

    where :math:`\mathcal{N}^r(i)` is the neighbor set of node :math:`i` w.r.t. relation
    :math:`r`. :math:`c_{i,r}` is the normalizer equal
@@ -31,38 +35,73 @@ class RelGraphConv(gluon.Block):

    .. math::

-      W_r^{(l)} = \sum_{b=1}^B a_{rb}^{(l)}V_b^{(l)}
+       W_r^{(l)} = \sum_{b=1}^B a_{rb}^{(l)}V_b^{(l)}

-    where :math:`B` is the number of bases.
+    where :math:`B` is the number of bases, :math:`V_b^{(l)}` are linearly combined
+    with coefficients :math:`a_{rb}^{(l)}`.

    The block-diagonal-decomposition regularization decomposes :math:`W_r` into :math:`B`
    number of block diagonal matrices. We refer :math:`B` as the number of bases.

+    The block regularization decomposes :math:`W_r` by:
+
+    .. math::
+
+       W_r^{(l)} = \oplus_{b=1}^B Q_{rb}^{(l)}
+
+    where :math:`B` is the number of bases, :math:`Q_{rb}^{(l)}` are block
+    bases with shape :math:`\mathbb{R}^{(d^{(l+1)}/B)\times(d^{(l)}/B)}`.
+
    Parameters
    ----------
    in_feat : int
-        Input feature size.
+        Input feature size; i.e, the number of dimensions of :math:`h_j^{(l)}`.
    out_feat : int
-        Output feature size.
+        Output feature size; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
    num_rels : int
-        Number of relations.
+        Number of relations.
    regularizer : str
-        Which weight regularizer to use "basis" or "bdd"
+        Which weight regularizer to use "basis" or "bdd".
+        "basis" is short for basis-decomposition.
+        "bdd" is short for block-diagonal-decomposition.
    num_bases : int, optional
-        Number of bases. If is none, use number of relations. Default: None.
+        Number of bases. If is none, use number of relations. Default: ``None``.
    bias : bool, optional
-        True if bias is added. Default: True
+        True if bias is added. Default: ``True``.
    activation : callable, optional
-        Activation function. Default: None
+        Activation function. Default: ``None``.
    self_loop : bool, optional
-        True to include self loop message. Default: False
+        True to include self loop message. Default: ``True``.
    low_mem : bool, optional
-        Use low-memory implementation. MXNet currently does not support this.
-        Default: False.
+        True to use low memory implementation of relation message passing function.
+        This option trades speed for memory consumption, and will slow down the forward/backward.
+        Turn it on when you encounter OOM problem during training or evaluation. Default: ``False``.
    dropout : float, optional
-        Dropout rate. Default: 0.0
+        Dropout rate. Default: ``0.0``
    layer_norm: float, optional
-        Add layer norm. Default: False
+        Add layer norm. Default: ``False``
+
+    Examples
+    --------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from mxnet import gluon
+    >>> from dgl.nn import RelGraphConv
+    >>>
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = RelGraphConv(10, 2, 3, regularizer='basis', num_bases=2)
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> etype = mx.nd.array(np.array([0,1,2,0,1,2]).astype(np.int64))
+    >>> res = conv(g, feat, etype)
+    [[ 0.561324    0.33745846]
+    [ 0.61585337  0.09992217]
+    [ 0.561324    0.33745846]
+    [-0.01557937  0.01227859]
+    [ 0.61585337  0.09992217]
+    [ 0.056508   -0.00307822]]
+    <NDArray 6x2 @cpu(0)>
    """
    def __init__(self,
                 in_feat,
@@ -72,7 +111,7 @@ class RelGraphConv(gluon.Block):
                 num_bases=None,
                 bias=True,
                 activation=None,
-                 self_loop=False,
+                 self_loop=True,
                 low_mem=False,
                 dropout=0.0,
                 layer_norm=False):
@@ -164,21 +203,26 @@ class RelGraphConv(gluon.Block):
        return {'msg': msg}

    def forward(self, g, x, etypes, norm=None):
-        r"""Forward computation
+        """
+        Description
+        -----------
+
+        Forward computation

        Parameters
        ----------
        g : DGLGraph
            The graph.
-        x : mx.ndarray.NDArray
+        x : mx.ndarray.NDArray
            Input node features. Could be either
-              - :math:`(|V|, D)` dense tensor
-              - :math:`(|V|,)` int64 vector, representing the categorical values of each
-                node. We then treat the input feature as an one-hot encoding feature.
+
+                * :math:`(|V|, D)` dense tensor
+                * :math:`(|V|,)` int64 vector, representing the categorical values of each
+                  node. It then treats the input feature as a one-hot encoding feature.
        etypes : mx.ndarray.NDArray
            Edge type tensor. Shape: :math:`(|E|,)`
        norm : mx.ndarray.NDArray
-            Optional edge normalizer tensor. Shape: :math:`(|E|, 1)`
+            Optional edge normalizer tensor. Shape: :math:`(|E|, 1)`.

        Returns
        -------

--- a/python/dgl/nn/mxnet/conv/sageconv.py
+++ b/python/dgl/nn/mxnet/conv/sageconv.py
@@ -9,24 +9,30 @@ from .... import function as fn
 from ....utils import expand_as_pair, check_eq_shape

 class SAGEConv(nn.Block):
-    r"""GraphSAGE layer from paper `Inductive Representation Learning on
+    r"""
+
+    Description
+    -----------
+    GraphSAGE layer from paper `Inductive Representation Learning on
    Large Graphs <https://arxiv.org/pdf/1706.02216.pdf>`__.

    .. math::
-        h_{\mathcal{N}(i)}^{(l+1)} & = \mathrm{aggregate}
+        h_{\mathcal{N}(i)}^{(l+1)} &= \mathrm{aggregate}
        \left(\{h_{j}^{l}, \forall j \in \mathcal{N}(i) \}\right)

-        h_{i}^{(l+1)} & = \sigma \left(W \cdot \mathrm{concat}
-        (h_{i}^{l}, h_{\mathcal{N}(i)}^{l+1} + b) \right)
+        h_{i}^{(l+1)} &= \sigma \left(W \cdot \mathrm{concat}
+        (h_{i}^{l}, h_{\mathcal{N}(i)}^{l+1}) \right)

-        h_{i}^{(l+1)} & = \mathrm{norm}(h_{i}^{l})
+        h_{i}^{(l+1)} &= \mathrm{norm}(h_{i}^{l})

    Parameters
    ----------
-    in_feats : int
-        Input feature size.
+    in_feats : int, or pair of ints
+        Input feature size; i.e, the number of dimensions of :math:`h_i^{(l)}`.

-        If the layer is to be applied on a unidirectional bipartite graph, ``in_feats``
+        SAGEConv can be applied on homogeneous graph and unidirectional
+        `bipartite graph <https://docs.dgl.ai/generated/dgl.bipartite.html?highlight=bipartite>`__.
+        If the layer applies on a unidirectional bipartite graph, ``in_feats``
        specifies the input feature size on both the source and destination nodes.  If
        a scalar is given, the source and destination node feature size would take the
        same value.
@@ -34,7 +40,7 @@ class SAGEConv(nn.Block):
        If aggregator type is ``gcn``, the feature size of source and destination nodes
        are required to be the same.
    out_feats : int
-        Output feature size.
+        Output feature size; i.e, the number of dimensions of :math:`h_i^{(l+1)}`.
    feat_drop : float
        Dropout rate on features, default: ``0``.
    aggregator_type : str
@@ -46,6 +52,45 @@ class SAGEConv(nn.Block):
    activation : callable activation function/layer or None, optional
        If not None, applies an activation function to the updated node features.
        Default: ``None``.
+
+    Examples
+    --------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from dgl.nn import SAGEConv
+    >>>
+    >>> # Case 1: Homogeneous graph
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> g = dgl.add_self_loop(g)
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = SAGEConv(10, 2, 'pool')
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat)
+    >>> res
+    [[ 0.32144994 -0.8729614 ]
+    [ 0.32144994 -0.8729614 ]
+    [ 0.32144994 -0.8729614 ]
+    [ 0.32144994 -0.8729614 ]
+    [ 0.32144994 -0.8729614 ]
+    [ 0.32144994 -0.8729614 ]]
+    <NDArray 6x2 @cpu(0)>
+
+    >>> # Case 2: Unidirectional bipartite graph
+    >>> u = [0, 1, 0, 0, 1]
+    >>> v = [0, 1, 2, 3, 2]
+    >>> g = dgl.bipartite((u, v))
+    >>> u_fea = mx.nd.random.randn(2, 5)
+    >>> v_fea = mx.nd.random.randn(4, 10)
+    >>> conv = SAGEConv((5, 10), 2, 'pool')
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, (u_fea, v_fea))
+    >>> res
+    [[-0.60524774  0.7196473 ]
+    [ 0.8832787  -0.5928619 ]
+    [-1.8245722   1.159798  ]
+    [-1.0509381   2.2239418 ]]
+    <NDArray 4x2 @cpu(0)>
    """
    def __init__(self,
                 in_feats,

--- a/python/dgl/nn/mxnet/conv/sgconv.py
+++ b/python/dgl/nn/mxnet/conv/sgconv.py
@@ -6,35 +6,87 @@ from mxnet import nd
 from mxnet.gluon import nn

 from .... import function as fn
+from ....base import DGLError


 class SGConv(nn.Block):
-    r"""Simplifying Graph Convolution layer from paper `Simplifying Graph
+    r"""
+
+    Description
+    -----------
+    Simplifying Graph Convolution layer from paper `Simplifying Graph
    Convolutional Networks <https://arxiv.org/pdf/1902.07153.pdf>`__.

    .. math::
-        H^{l+1} = (\hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2})^K H^{l} \Theta^{l}
+        H^{K} = (\tilde{D}^{-1/2} \tilde{A} \tilde{D}^{-1/2})^K X \Theta
+
+    where :math:`\tilde{A}` is :math:`A` + :math:`I`.
+    Thus the graph input is expected to have self-loop edges added.

    Parameters
    ----------
    in_feats : int
-        Number of input features.
+        Number of input features; i.e., the number of dimensions of :math:`X`.
    out_feats : int
-        Number of output features.
+        Number of output features; i.e., the number of dimensions of :math:`H^{K}`.
    k : int
        Number of hops :math:`K`. Default: ``1``.
    cached : bool
        If True, the module would cache

        .. math::
-            (\hat{D}^{-\frac{1}{2}}\hat{A}\hat{D}^{-\frac{1}{2}})^K X\Theta
+            (\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}})^K X\Theta

        at the first forward call. This parameter should only be set to
        ``True`` in Transductive Learning setting.
    bias : bool
        If True, adds a learnable bias to the output. Default: ``True``.
    norm : callable activation function/layer or None, optional
-        If not None, applies normalization to the updated node features.
+        If not None, applies normalization to the updated node features. Default: ``None``.
+    allow_zero_in_degree : bool, optional
+        If there are 0-in-degree nodes in the graph, output for those nodes will be invalid
+        since no message will be passed to those nodes. This is harmful for some applications
+        causing silent performance regression. This module will raise a DGLError if it detects
+        0-in-degree nodes in input graph. By setting ``True``, it will suppress the check
+        and let the users handle it by themselves. Default: ``False``.
+
+    Notes
+    -----
+    Zero in-degree nodes will lead to invalid output value. This is because no message
+    will be passed to those nodes, so the aggregation function will be applied on empty input.
+    A common practice to avoid this is to add a self-loop for each node in the graph if
+    it is homogeneous, which can be achieved by:
+
+    >>> g = ... # a DGLGraph
+    >>> g = dgl.add_self_loop(g)
+
+    Calling ``add_self_loop`` will not work for some graphs, for example, heterogeneous graphs,
+    since the edge type cannot be decided for self-loop edges. Set ``allow_zero_in_degree``
+    to ``True`` for those cases to unblock the code and handle zero-in-degree nodes manually.
+    A common practice to handle this is to filter out the nodes with zero in-degree when using
+    the output of the conv layer.
+
+    Example
+    -------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from dgl.nn import SGConv
+    >>>
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> g = dgl.add_self_loop(g)
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = SGConv(10, 2, k=2, cached=True)
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat)
+    >>> res
+    [[ 2.264404   -0.26684892]
+    [ 2.264404   -0.26684892]
+    [ 2.264404   -0.26684892]
+    [ 3.2273252  -0.3803246 ]
+    [ 2.247593   -0.2648679 ]
+    [ 2.2644043  -0.26684904]]
+    <NDArray 6x2 @cpu(0)>
    """
    def __init__(self,
                 in_feats,
@@ -42,18 +94,38 @@ class SGConv(nn.Block):
                 k=1,
                 cached=False,
                 bias=True,
-                 norm=None):
+                 norm=None,
+                 allow_zero_in_degree=False):
        super(SGConv, self).__init__()
        self._cached = cached
        self._cached_h = None
        self._k = k
+        self._allow_zero_in_degree = allow_zero_in_degree
        with self.name_scope():
            self.norm = norm
            self.fc = nn.Dense(out_feats, in_units=in_feats, use_bias=bias,
                               weight_initializer=mx.init.Xavier())

+    def set_allow_zero_in_degree(self, set_value):
+        r"""
+
+        Description
+        -----------
+        Set allow_zero_in_degree flag.
+
+        Parameters
+        ----------
+        set_value : bool
+            The value to be set to the flag.
+        """
+        self._allow_zero_in_degree = set_value
+
    def forward(self, graph, feat):
-        r"""Compute Simplifying Graph Convolution layer.
+        r"""
+
+        Description
+        -----------
+        Compute Simplifying Graph Convolution layer.

        Parameters
        ----------
@@ -69,12 +141,31 @@ class SGConv(nn.Block):
            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
            is size of output feature.

+        Raises
+        ------
+        DGLError
+            If there are 0-in-degree nodes in the input graph, it will raise DGLError
+            since no message will be passed to those nodes. This will cause invalid output.
+            The error can be ignored by setting ``allow_zero_in_degree`` parameter to ``True``.
+
        Notes
        -----
-        If ``cache`` is se to True, ``feat`` and ``graph`` should not change during
+        If ``cache`` is set to True, ``feat`` and ``graph`` should not change during
        training, or you will get wrong results.
        """
        with graph.local_scope():
+            if not self._allow_zero_in_degree:
+                if graph.in_degrees().min() == 0:
+                    raise DGLError('There are 0-in-degree nodes in the graph, '
+                                   'output for those nodes will be invalid. '
+                                   'This is harmful for some applications, '
+                                   'causing silent performance regression. '
+                                   'Adding self-loop on the input graph by '
+                                   'calling `g = dgl.add_self_loop(g)` will resolve '
+                                   'the issue. Setting ``allow_zero_in_degree`` '
+                                   'to be `True` when constructing this module will '
+                                   'suppress the check and let the code run.')
+
            if self._cached_h is not None:
                feat = self._cached_h
            else:

--- a/python/dgl/nn/mxnet/conv/tagconv.py
+++ b/python/dgl/nn/mxnet/conv/tagconv.py
@@ -9,23 +9,28 @@ from .... import function as fn


 class TAGConv(gluon.Block):
-    r"""Apply Topology Adaptive Graph Convolutional Network
+    r"""
+
+    Description
+    -----------
+    Topology Adaptive Graph Convolutional layer from paper `Topology
+    Adaptive Graph Convolutional Networks <https://arxiv.org/pdf/1710.10370.pdf>`__.

    .. math::
-        \mathbf{X}^{\prime} = \sum_{k=0}^K \mathbf{D}^{-1/2} \mathbf{A}
-        \mathbf{D}^{-1/2}\mathbf{X} \mathbf{\Theta}_{k},
+        H^{K} = {\sum}_{k=0}^K (D^{-1/2} A D^{-1/2})^{k} X {\Theta}_{k},

-    where :math:`\mathbf{A}` denotes the adjacency matrix and
-    :math:`D_{ii} = \sum_{j=0} A_{ij}` its diagonal degree matrix.
+    where :math:`A` denotes the adjacency matrix,
+    :math:`D_{ii} = \sum_{j=0} A_{ij}` its diagonal degree matrix, and
+    :math:`{\Theta}_{k}` denotes the linear weights to sum the results of different hops together.

    Parameters
    ----------
    in_feats : int
-        Number of input features.
+        Input feature size; i.e., the number of dimensions of :math:`X`.
    out_feats : int
-        Number of output features.
+        Output feature size; i.e., the number of dimensions of :math:`H^{K}`.
    k: int, optional
-        Number of hops :math: `k`. (default: 2)
+        Number of hops :math:`K`. Default: ``2``.
    bias: bool, optional
        If True, adds a learnable bias to the output. Default: ``True``.
    activation: callable activation function/layer or None, optional
@@ -34,10 +39,30 @@ class TAGConv(gluon.Block):

    Attributes
    ----------
-    lin : mxnet.gluon.parameter.Parameter
-        The learnable weight tensor.
-    bias : mxnet.gluon.parameter.Parameter
-        The learnable bias tensor.
+    lin : mxnet.gluon.parameter.Parameter
+        The learnable weight tensor.
+
+    Example
+    -------
+    >>> import dgl
+    >>> import numpy as np
+    >>> import mxnet as mx
+    >>> from mxnet import gluon
+    >>> from dgl.nn import TAGConv
+    >>>
+    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
+    >>> feat = mx.nd.ones((6, 10))
+    >>> conv = TAGConv(10, 2, k=2)
+    >>> conv.initialize(ctx=mx.cpu(0))
+    >>> res = conv(g, feat)
+    >>> res
+    [[-0.86147034  0.10089529]
+    [-0.86147034  0.10089529]
+    [-0.86147034  0.10089529]
+    [-0.9707841   0.0360311 ]
+    [-0.6716844   0.02247889]
+    [ 0.32964635 -0.7669234 ]]
+    <NDArray 6x2 @cpu(0)>
    """
    def __init__(self,
                 in_feats,
@@ -60,7 +85,11 @@ class TAGConv(gluon.Block):
                                          init=mx.init.Zero())

    def forward(self, graph, feat):
-        r"""Compute graph convolution
+        r"""
+
+        Description
+        -----------
+        Compute topology adaptive graph convolution.

        Parameters
        ----------

--- a/python/dgl/nn/mxnet/hetero.py
+++ b/python/dgl/nn/mxnet/hetero.py
@@ -110,8 +110,9 @@ class HeteroGraphConv(nn.Block):
            # Do not break if graph has 0-in-degree nodes.
            # Because there is no general rule to add self-loop for heterograph.
            for _, v in self.mods.items():
-                if hasattr(v, '_allow_zero_in_degree'):
-                    v._allow_zero_in_degree = True
+                set_allow_zero_in_degree_fn = getattr(v, 'set_allow_zero_in_degree', None)
+                if callable(set_allow_zero_in_degree_fn):
+                    set_allow_zero_in_degree_fn(True)
            if isinstance(aggregate, str):
                self.agg_fn = get_aggregate_fn(aggregate)
            else:

--- a/python/dgl/nn/pytorch/conv/agnnconv.py
+++ b/python/dgl/nn/pytorch/conv/agnnconv.py
@@ -90,6 +90,20 @@ class AGNNConv(nn.Module):
        else:
            self.register_buffer('beta', th.Tensor([init_beta]))

+    def set_allow_zero_in_degree(self, set_value):
+        r"""
+
+        Description
+        -----------
+        Set allow_zero_in_degree flag.
+
+        Parameters
+        ----------
+        set_value : bool
+            The value to be set to the flag.
+        """
+        self._allow_zero_in_degree = set_value
+
    def forward(self, graph, feat):
        r"""


--- a/python/dgl/nn/pytorch/conv/edgeconv.py
+++ b/python/dgl/nn/pytorch/conv/edgeconv.py
@@ -112,6 +112,20 @@ class EdgeConv(nn.Module):
        phi_x = self.phi(edges.src['x'])
        return {'e': theta_x + phi_x}

+    def set_allow_zero_in_degree(self, set_value):
+        r"""
+
+        Description
+        -----------
+        Set allow_zero_in_degree flag.
+
+        Parameters
+        ----------
+        set_value : bool
+            The value to be set to the flag.
+        """
+        self._allow_zero_in_degree = set_value
+
    def forward(self, g, feat):
        """