Unverified Commit 69f5869f authored by Tianjun Xiao, committed by GitHub

[Doc] NN doc refactor Conv Layers (#1672)



* go through gcn, relgcn

* fix tagconv formula

* fix doc in sageconv

* fix sgconv doc

* replace hat with tilde

* more comments on gmmconv

* fix agnnconv chebconv doc

* modify nnconv doc

* remove &

* add nn conv examples

* Rebase master

* More merge conflicts

* check homo

* add back self loop for some convs, check homo in tranform

* add example for denseconv

* add example and doc for dotgat and cfconv

* check in-degree for graphconv

* add language fix

* gconv address all comments

* another round of change based on api template

* change agnn

* go through agnn, appnp, atomic, cf, cheb, dense, gat, sage modules

* finish pytorch part of nn conv

* mxnet graphconv done

* tensorflow graphconv works

* add new modules into doc

* add comments to not split code

* refine doc

* resr

* more comments

* more fix

* finish conv and dense conv part api

* pylint fix

* fix pylink

* fix pylint

* more fix

* fix

* fix test fail because zere in degree

* fix test fail

* sage is not update for mxnet tf
Co-authored-by: Ubuntu <ubuntu@ip-172-31-0-81.us-east-2.compute.internal>
parent 98c1117d
......@@ -5,28 +5,35 @@ from torch import nn
from torch.nn import functional as F
from .... import function as fn
from ....base import DGLError
from ....utils import expand_as_pair, check_eq_shape
class SAGEConv(nn.Module):
r"""GraphSAGE layer from paper `Inductive Representation Learning on
r"""
Description
-----------
GraphSAGE layer from paper `Inductive Representation Learning on
Large Graphs <https://arxiv.org/pdf/1706.02216.pdf>`__.
.. math::
h_{\mathcal{N}(i)}^{(l+1)} & = \mathrm{aggregate}
h_{\mathcal{N}(i)}^{(l+1)} &= \mathrm{aggregate}
\left(\{h_{j}^{l}, \forall j \in \mathcal{N}(i) \}\right)
h_{i}^{(l+1)} & = \sigma \left(W \cdot \mathrm{concat}
(h_{i}^{l}, h_{\mathcal{N}(i)}^{l+1} + b) \right)
h_{i}^{(l+1)} &= \sigma \left(W \cdot \mathrm{concat}
(h_{i}^{l}, h_{\mathcal{N}(i)}^{l+1}) \right)
h_{i}^{(l+1)} & = \mathrm{norm}(h_{i}^{l})
h_{i}^{(l+1)} &= \mathrm{norm}(h_{i}^{l})
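Read literally, the equations say: aggregate the neighbour features first, then transform the
concatenation of the node's own feature with the aggregated one. A rough editorial sketch
(not part of this patch) with the ``mean`` aggregator and made-up toy shapes:

>>> import torch as th
>>> from torch import nn
>>> h_i = th.ones(1, 10)                     # h_i^{(l)}
>>> h_neigh = th.ones(3, 10)                 # {h_j : j in N(i)}
>>> agg = h_neigh.mean(dim=0, keepdim=True)  # h_{N(i)}^{(l+1)} with the 'mean' aggregator
>>> W = nn.Linear(2 * 10, 2)                 # W acts on the concatenation
>>> h_i_new = th.relu(W(th.cat([h_i, agg], dim=1)))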
Parameters
----------
in_feats : int, or pair of ints
Input feature size.
Input feature size; i.e., the number of dimensions of :math:`h_i^{(l)}`.
If the layer is to be applied on a unidirectional bipartite graph, ``in_feats``
SAGEConv can be applied on homogeneous graphs and unidirectional
`bipartite graphs <https://docs.dgl.ai/generated/dgl.bipartite.html?highlight=bipartite>`__.
If the layer is applied on a unidirectional bipartite graph, ``in_feats``
specifies the input feature size on both the source and destination nodes. If
a scalar is given, the source and destination node feature size would take the
same value.
......@@ -34,7 +41,7 @@ class SAGEConv(nn.Module):
If aggregator type is ``gcn``, the feature size of source and destination nodes
are required to be the same.
out_feats : int
Output feature size.
Output feature size; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
feat_drop : float
Dropout rate on features, default: ``0``.
aggregator_type : str
......@@ -46,6 +53,63 @@ class SAGEConv(nn.Module):
activation : callable activation function/layer or None, optional
If not None, applies an activation function to the updated node features.
Default: ``None``.
allow_zero_in_degree : bool, optional
If there are 0-in-degree nodes in the graph, output for those nodes will be invalid
since no message will be passed to those nodes. This is harmful for some applications,
causing silent performance regression. This module will raise a DGLError if it detects
0-in-degree nodes in the input graph. By setting ``True``, it will suppress the check
and let the users handle it by themselves. Default: ``False``.
Notes
-----
Zero in-degree nodes will lead to invalid output values. This is because no message
will be passed to those nodes, so the aggregation function will be applied on empty input.
A common practice to avoid this is to add a self-loop for each node in the graph if
it is homogeneous, which can be achieved by:
>>> g = ... # a DGLGraph
>>> g = dgl.add_self_loop(g)
Calling ``add_self_loop`` will not work for some graphs, for example, heterogeneous graphs,
since the edge type can not be decided for self-loop edges. Set ``allow_zero_in_degree``
to ``True`` for those cases to unblock the code and handle zero-in-degree nodes manually.
A common practice to handle this is to filter out the nodes with zero in-degree when using
the output after the conv.
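As a rough editorial sketch of that last point (not part of this patch): with
``allow_zero_in_degree=True`` one can mask out the output rows of nodes that have no incoming
edges; the graph, feature size, and ``'mean'`` aggregator below are made up for illustration.

>>> import dgl
>>> import torch as th
>>> from dgl.nn import SAGEConv
>>> g = dgl.graph(([0, 1, 2], [1, 2, 3]))    # node 0 has no incoming edge
>>> feat = th.ones(g.number_of_nodes(), 10)
>>> conv = SAGEConv(10, 2, 'mean', allow_zero_in_degree=True)
>>> out = conv(g, feat)
>>> mask = g.in_degrees() > 0                # keep only nodes that received messages
>>> valid_out = out[mask]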
Examples
--------
>>> import dgl
>>> import numpy as np
>>> import torch as th
>>> from dgl.nn import SAGEConv
>>> # Case 1: Homogeneous graph
>>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
>>> g = dgl.add_self_loop(g)
>>> feat = th.ones(6, 10)
>>> conv = SAGEConv(10, 2, 'pool')
>>> res = conv(g, feat)
>>> res
tensor([[-1.0888, -2.1099],
[-1.0888, -2.1099],
[-1.0888, -2.1099],
[-1.0888, -2.1099],
[-1.0888, -2.1099],
[-1.0888, -2.1099]], grad_fn=<AddBackward0>)
>>> # Case 2: Unidirectional bipartite graph
>>> u = [0, 1, 0, 0, 1]
>>> v = [0, 1, 2, 3, 2]
>>> g = dgl.bipartite((u, v))
>>> u_fea = th.rand(2, 5)
>>> v_fea = th.rand(4, 10)
>>> conv = SAGEConv((5, 10), 2, 'mean')
>>> res = conv(g, (u_fea, v_fea))
>>> res
tensor([[ 0.3163, 3.1166],
[ 0.3866, 2.5398],
[ 0.5873, 1.6597],
[-0.2502, 2.8068]], grad_fn=<AddBackward0>)
"""
def __init__(self,
in_feats,
......@@ -54,7 +118,8 @@ class SAGEConv(nn.Module):
feat_drop=0.,
bias=True,
norm=None,
activation=None):
activation=None,
allow_zero_in_degree=False):
super(SAGEConv, self).__init__()
self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats)
......@@ -63,6 +128,7 @@ class SAGEConv(nn.Module):
self.norm = norm
self.feat_drop = nn.Dropout(feat_drop)
self.activation = activation
self._allow_zero_in_degree = allow_zero_in_degree
# aggregator type: mean/pool/lstm/gcn
if aggregator_type == 'pool':
self.fc_pool = nn.Linear(self._in_src_feats, self._in_src_feats)
......@@ -74,7 +140,17 @@ class SAGEConv(nn.Module):
self.reset_parameters()
def reset_parameters(self):
"""Reinitialize learnable parameters."""
r"""
Description
-----------
Reinitialize learnable parameters.
Notes
-----
The linear weights :math:`W^{(l)}` are initialized using Glorot uniform initialization.
The LSTM module uses Xavier initialization for its weights.
"""
gain = nn.init.calculate_gain('relu')
if self._aggre_type == 'pool':
nn.init.xavier_uniform_(self.fc_pool.weight, gain=gain)
......@@ -97,7 +173,11 @@ class SAGEConv(nn.Module):
return {'neigh': rst.squeeze(0)}
def forward(self, graph, feat):
r"""Compute GraphSAGE layer.
r"""
Description
-----------
Compute GraphSAGE layer.
Parameters
----------
......@@ -117,6 +197,18 @@ class SAGEConv(nn.Module):
is size of output feature.
"""
with graph.local_scope():
if not self._allow_zero_in_degree:
if (graph.in_degrees() == 0).any():
raise DGLError('There are 0-in-degree nodes in the graph, '
'output for those nodes will be invalid. '
'This is harmful for some applications, '
'causing silent performance regression. '
'Adding self-loop on the input graph by '
'calling `g = dgl.add_self_loop(g)` will resolve '
'the issue. Setting ``allow_zero_in_degree`` '
'to be `True` when constructing this module will '
'suppress the check and let the code run.')
if isinstance(feat, tuple):
feat_src = self.feat_drop(feat[0])
feat_dst = self.feat_drop(feat[1])
......
......@@ -4,35 +4,84 @@ import torch as th
from torch import nn
from .... import function as fn
from ....base import DGLError
class SGConv(nn.Module):
r"""Simplifying Graph Convolution layer from paper `Simplifying Graph
r"""
Description
-----------
Simplifying Graph Convolution layer from paper `Simplifying Graph
Convolutional Networks <https://arxiv.org/pdf/1902.07153.pdf>`__.
.. math::
H^{l+1} = (\hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2})^K H^{l} \Theta^{l}
H^{K} = (\tilde{D}^{-1/2} \tilde{A} \tilde{D}^{-1/2})^K X \Theta
where :math:`\tilde{A} = A + I`, i.e. the adjacency matrix with self-loops added,
and :math:`\tilde{D}` is its diagonal degree matrix.
Thus the graph input is expected to have self-loop edges added.
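For intuition, the propagation :math:`(\tilde{D}^{-1/2}\tilde{A}\tilde{D}^{-1/2})^K X` can be
written out directly with NumPy on a toy graph. This is an editorial sketch, not part of the
patch; the adjacency matrix and features are made up, and ``SGConv`` additionally applies the
linear weight :math:`\Theta` afterwards.

>>> import numpy as np
>>> A_tilde = np.array([[1., 1., 0.],
...                     [1., 1., 1.],
...                     [0., 1., 1.]])       # A + I for a 3-node path graph
>>> d = A_tilde.sum(axis=1)
>>> D_inv_sqrt = np.diag(1.0 / np.sqrt(d))
>>> S = D_inv_sqrt @ A_tilde @ D_inv_sqrt    # normalized adjacency
>>> X = np.ones((3, 4))                      # toy node features
>>> H = np.linalg.matrix_power(S, 2) @ X     # K = 2 hops of propagation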
Parameters
----------
in_feats : int
Number of input features.
Number of input features; i.e., the number of dimensions of :math:`X`.
out_feats : int
Number of output features.
Number of output features; i.e., the number of dimensions of :math:`H^{K}`.
k : int
Number of hops :math:`K`. Default: ``1``.
cached : bool
If True, the module would cache
.. math::
(\hat{D}^{-\frac{1}{2}}\hat{A}\hat{D}^{-\frac{1}{2}})^K X\Theta
(\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}})^K X\Theta
at the first forward call. This parameter should only be set to
``True`` in a transductive learning setting.
bias : bool
If True, adds a learnable bias to the output. Default: ``True``.
norm : callable activation function/layer or None, optional
If not None, applies normalization to the updated node features.
If not None, applies normalization to the updated node features. Default: ``None``.
allow_zero_in_degree : bool, optional
If there are 0-in-degree nodes in the graph, output for those nodes will be invalid
since no message will be passed to those nodes. This is harmful for some applications,
causing silent performance regression. This module will raise a DGLError if it detects
0-in-degree nodes in the input graph. By setting ``True``, it will suppress the check
and let the users handle it by themselves. Default: ``False``.
Notes
-----
Zero in-degree nodes will lead to invalid output values. This is because no message
will be passed to those nodes, so the aggregation function will be applied on empty input.
A common practice to avoid this is to add a self-loop for each node in the graph if
it is homogeneous, which can be achieved by:
>>> g = ... # a DGLGraph
>>> g = dgl.add_self_loop(g)
Calling ``add_self_loop`` will not work for some graphs, for example, heterogeneous graphs,
since the edge type can not be decided for self-loop edges. Set ``allow_zero_in_degree``
to ``True`` for those cases to unblock the code and handle zero-in-degree nodes manually.
A common practice to handle this is to filter out the nodes with zero in-degree when using
the output after the conv.
Example
-------
>>> import dgl
>>> import numpy as np
>>> import torch as th
>>> from dgl.nn import SGConv
>>>
>>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
>>> g = dgl.add_self_loop(g)
>>> feat = th.ones(6, 10)
>>> conv = SGConv(10, 2, k=2, cached=True)
>>> res = conv(g, feat)
>>> res
tensor([[-1.9441, -0.9343],
[-1.9441, -0.9343],
[-1.9441, -0.9343],
[-2.7709, -1.3316],
[-1.9297, -0.9273],
[-1.9441, -0.9343]], grad_fn=<AddmmBackward>)
"""
def __init__(self,
in_feats,
......@@ -40,23 +89,39 @@ class SGConv(nn.Module):
k=1,
cached=False,
bias=True,
norm=None):
norm=None,
allow_zero_in_degree=False):
super(SGConv, self).__init__()
self.fc = nn.Linear(in_feats, out_feats, bias=bias)
self._cached = cached
self._cached_h = None
self._k = k
self.norm = norm
self._allow_zero_in_degree = allow_zero_in_degree
self.reset_parameters()
def reset_parameters(self):
"""Reinitialize learnable parameters."""
r"""
Description
-----------
Reinitialize learnable parameters.
Notes
-----
The model parameters are initialized using Xavier uniform initialization
and the bias is initialized to zero.
"""
nn.init.xavier_uniform_(self.fc.weight)
if self.fc.bias is not None:
nn.init.zeros_(self.fc.bias)
def forward(self, graph, feat):
r"""Compute Simplifying Graph Convolution layer.
r"""
Description
-----------
Compute Simplifying Graph Convolution layer.
Parameters
----------
......@@ -72,12 +137,31 @@ class SGConv(nn.Module):
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
Raises
------
DGLError
If there are 0-in-degree nodes in the input graph, it will raise a DGLError
since no message will be passed to those nodes. This will cause invalid output.
The error can be ignored by setting the ``allow_zero_in_degree`` parameter to ``True``.
Notes
-----
If ``cache`` is se to True, ``feat`` and ``graph`` should not change during
If ``cache`` is set to True, ``feat`` and ``graph`` should not change during
training, or you will get wrong results.
"""
with graph.local_scope():
if not self._allow_zero_in_degree:
if (graph.in_degrees() == 0).any():
raise DGLError('There are 0-in-degree nodes in the graph, '
'output for those nodes will be invalid. '
'This is harmful for some applications, '
'causing silent performance regression. '
'Adding self-loop on the input graph by '
'calling `g = dgl.add_self_loop(g)` will resolve '
'the issue. Setting ``allow_zero_in_degree`` '
'to be `True` when constructing this module will '
'suppress the check and let the code run.')
if self._cached_h is not None:
feat = self._cached_h
else:
......
......@@ -7,24 +7,28 @@ from .... import function as fn
class TAGConv(nn.Module):
r"""Topology Adaptive Graph Convolutional layer from paper `Topology
r"""
Description
-----------
Topology Adaptive Graph Convolutional layer from paper `Topology
Adaptive Graph Convolutional Networks <https://arxiv.org/pdf/1710.10370.pdf>`__.
.. math::
\mathbf{X}^{\prime} = \sum_{k=0}^K \mathbf{D}^{-1/2} \mathbf{A}
\mathbf{D}^{-1/2}\mathbf{X} \mathbf{\Theta}_{k},
H^{K} = {\sum}_{k=0}^K (D^{-1/2} A D^{-1/2})^{k} X {\Theta}_{k},
where :math:`\mathbf{A}` denotes the adjacency matrix and
:math:`D_{ii} = \sum_{j=0} A_{ij}` its diagonal degree matrix.
where :math:`A` denotes the adjacency matrix,
:math:`D_{ii} = \sum_{j=0} A_{ij}` its diagonal degree matrix,
and :math:`{\Theta}_{k}` denotes the linear weights that sum the results of different hops together.
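Unlike SGConv, which applies a single weight after :math:`K` propagation steps, TAGConv keeps
one weight :math:`\Theta_k` per hop and sums the hop outputs. A rough editorial NumPy sketch of
the formula with made-up shapes (not the module's actual implementation):

>>> import numpy as np
>>> A = np.array([[0., 1., 0.],
...               [1., 0., 1.],
...               [0., 1., 0.]])             # toy adjacency
>>> D_inv_sqrt = np.diag(1.0 / np.sqrt(A.sum(axis=1)))
>>> S = D_inv_sqrt @ A @ D_inv_sqrt          # D^{-1/2} A D^{-1/2}
>>> X = np.ones((3, 4))                      # toy node features
>>> rng = np.random.default_rng(0)
>>> thetas = [rng.standard_normal((4, 2)) for _ in range(3)]  # Theta_0..Theta_2 for K = 2
>>> H = sum(np.linalg.matrix_power(S, k) @ X @ thetas[k] for k in range(3))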
Parameters
----------
in_feats : int
Input feature size.
Input feature size; i.e., the number of dimensions of :math:`X`.
out_feats : int
Output feature size.
Output feature size; i.e., the number of dimensions of :math:`H^{K}`.
k: int, optional
Number of hops :math: `k`. (default: 2)
Number of hops :math:`K`. Default: ``2``.
bias: bool, optional
If True, adds a learnable bias to the output. Default: ``True``.
activation: callable activation function/layer or None, optional
......@@ -35,13 +39,33 @@ class TAGConv(nn.Module):
----------
lin : torch.Module
The learnable linear module.
Example
-------
>>> import dgl
>>> import numpy as np
>>> import torch as th
>>> from dgl.nn import TAGConv
>>>
>>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
>>> feat = th.ones(6, 10)
>>> conv = TAGConv(10, 2, k=2)
>>> res = conv(g, feat)
>>> res
tensor([[ 0.5490, -1.6373],
[ 0.5490, -1.6373],
[ 0.5490, -1.6373],
[ 0.5513, -1.8208],
[ 0.5215, -1.6044],
[ 0.3304, -1.9927]], grad_fn=<AddmmBackward>)
"""
def __init__(self,
in_feats,
out_feats,
k=2,
bias=True,
activation=None):
activation=None,
):
super(TAGConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
......@@ -52,12 +76,25 @@ class TAGConv(nn.Module):
self.reset_parameters()
def reset_parameters(self):
"""Reinitialize learnable parameters."""
r"""
Description
-----------
Reinitialize learnable parameters.
Notes
-----
The model parameters are initialized using Glorot uniform initialization.
"""
gain = nn.init.calculate_gain('relu')
nn.init.xavier_normal_(self.lin.weight, gain=gain)
def forward(self, graph, feat):
r"""Compute topology adaptive graph convolution.
r"""
Description
-----------
Compute topology adaptive graph convolution.
Parameters
----------
......
......@@ -5,46 +5,34 @@ from tensorflow.keras import layers
import numpy as np
from .... import function as fn
from ....base import DGLError
from ....utils import expand_as_pair
# pylint: disable=W0235
class GraphConv(layers.Layer):
r"""Apply graph convolution over an input signal.
r"""
Graph convolution is introduced in `GCN <https://arxiv.org/abs/1609.02907>`__
and can be described as below:
Description
-----------
Graph convolution was introduced in `GCN <https://arxiv.org/abs/1609.02907>`__
and is mathematically defined as follows:
.. math::
h_i^{(l+1)} = \sigma(b^{(l)} + \sum_{j\in\mathcal{N}(i)}\frac{1}{c_{ij}}h_j^{(l)}W^{(l)})
where :math:`\mathcal{N}(i)` is the neighbor set of node :math:`i`. :math:`c_{ij}` is equal
to the product of the square root of node degrees:
:math:`\sqrt{|\mathcal{N}(i)|}\sqrt{|\mathcal{N}(j)|}`. :math:`\sigma` is an activation
function.
The model parameters are initialized as in the
`original implementation <https://github.com/tkipf/gcn/blob/master/gcn/layers.py>`__ where
the weight :math:`W^{(l)}` is initialized using Glorot uniform initialization
and the bias is initialized to be zero.
Notes
-----
Zero in degree nodes could lead to invalid normalizer. A common practice
to avoid this is to add a self-loop for each node in the graph, which
can be achieved by:
>>> g = ... # some DGLGraph
>>> g.add_edges(g.nodes(), g.nodes())
where :math:`\mathcal{N}(i)` is the set of neighbors of node :math:`i`,
:math:`c_{ij}` is the product of the square roots of node degrees
(i.e., :math:`c_{ij} = \sqrt{|\mathcal{N}(i)|}\sqrt{|\mathcal{N}(j)|}`),
and :math:`\sigma` is an activation function.
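As an editorial illustration of the normalizer (toy NumPy values, not the module's code):
with ``norm='both'`` each message is scaled by :math:`1/c_{ij}`, while ``norm='right'``
(described under Parameters below) simply averages the incoming messages.

>>> import numpy as np
>>> A = np.array([[0., 1., 0.],
...               [1., 0., 1.],
...               [0., 1., 0.]])             # toy undirected adjacency
>>> deg = A.sum(axis=1)
>>> H = np.ones((3, 4))                      # toy input features
>>> W = np.full((4, 2), 0.5)                 # toy weight
>>> right = (A @ H / deg[:, None]) @ W       # norm='right': mean of neighbour features
>>> D_inv_sqrt = np.diag(1.0 / np.sqrt(deg))
>>> both = (D_inv_sqrt @ A @ D_inv_sqrt @ H) @ W   # norm='both': symmetric normalization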
Parameters
----------
in_feats : int
Input feature size.
Input feature size; i.e., the number of dimensions of :math:`h_j^{(l)}`.
out_feats : int
Output feature size.
Output feature size; i.e., the number of dimensions of :math:`h_i^{(l+1)}`.
norm : str, optional
How to apply the normalizer. If it is `'right'`, divide the aggregated messages
by each node's in-degrees, which is equivalent to averaging the received messages.
......@@ -55,25 +43,99 @@ class GraphConv(layers.Layer):
without a weight matrix.
bias : bool, optional
If True, adds a learnable bias to the output. Default: ``True``.
activation: callable activation function/layer or None, optional
activation : callable activation function/layer or None, optional
If not None, applies an activation function to the updated node features.
Default: ``None``.
allow_zero_in_degree : bool, optional
If there are 0-in-degree nodes in the graph, output for those nodes will be invalid
since no message will be passed to those nodes. This is harmful for some applications,
causing silent performance regression. This module will raise a DGLError if it detects
0-in-degree nodes in the input graph. By setting ``True``, it will suppress the check
and let the users handle it by themselves. Default: ``False``.
Attributes
----------
weight : tf.Tensor
weight : tf.Tensor
The learnable weight tensor.
bias : tf.Tensor
bias : tf.Tensor
The learnable bias tensor.
"""
Notes
-----
Zero in-degree nodes will lead to invalid output values. This is because no message
will be passed to those nodes, so the aggregation function will be applied on empty input.
A common practice to avoid this is to add a self-loop for each node in the graph if
it is homogeneous, which can be achieved by:
>>> g = ... # a DGLGraph
>>> g = dgl.add_self_loop(g)
Calling ``add_self_loop`` will not work for some graphs, for example, heterogeneous graphs,
since the edge type can not be decided for self-loop edges. Set ``allow_zero_in_degree``
to ``True`` for those cases to unblock the code and handle zero-in-degree nodes manually.
A common practice to handle this is to filter out the nodes with zero in-degree when using
the output after the conv.
Examples
--------
>>> import dgl
>>> import numpy as np
>>> import tensorflow as tf
>>> from dgl.nn import GraphConv
>>> # Case 1: Homogeneous graph
>>> with tf.device("CPU:0"):
... g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
... g = dgl.add_self_loop(g)
... feat = tf.ones((6, 10))
... conv = GraphConv(10, 2, norm='both', weight=True, bias=True)
... res = conv(g, feat)
>>> print(res)
<tf.Tensor: shape=(6, 2), dtype=float32, numpy=
array([[ 0.6208475 , -0.4896223 ],
[ 0.68356586, -0.5390842 ],
[ 0.6208475 , -0.4896223 ],
[ 0.7859846 , -0.61985517],
[ 0.8251371 , -0.65073216],
[ 0.48335412, -0.38119012]], dtype=float32)>
>>> # allow_zero_in_degree example
>>> with tf.device("CPU:0"):
... g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3]))
... conv = GraphConv(10, 2, norm='both', weight=True, bias=True, allow_zero_in_degree=True)
... res = conv(g, feat)
>>> print(res)
<tf.Tensor: shape=(6, 2), dtype=float32, numpy=
array([[ 0.6208475 , -0.4896223 ],
[ 0.68356586, -0.5390842 ],
[ 0.6208475 , -0.4896223 ],
[ 0.7859846 , -0.61985517],
[ 0.8251371 , -0.65073216],
[ 0., 0.]], dtype=float32)>
>>> # Case 2: Unidirectional bipartite graph
>>> u = [0, 1, 0, 0, 1]
>>> v = [0, 1, 2, 3, 2]
>>> with tf.device("CPU:0"):
... g = dgl.bipartite((u, v))
... u_fea = tf.random.uniform((2, 5))
... v_fea = tf.random.uniform((4, 5))
... conv = GraphConv(5, 2, norm='both', weight=True, bias=True)
... res = conv(g, (u_fea, v_fea))
>>> res
<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 1.3607183, -0.1636453],
[ 1.6665325, -0.2004239],
[ 2.1405895, -0.2574358],
[ 1.3607183, -0.1636453]], dtype=float32)>
"""
def __init__(self,
in_feats,
out_feats,
norm='both',
weight=True,
bias=True,
activation=None):
activation=None,
allow_zero_in_degree=False):
super(GraphConv, self).__init__()
if norm not in ('none', 'both', 'right'):
raise DGLError('Invalid norm value. Must be either "none", "both" or "right".'
......@@ -81,6 +143,7 @@ class GraphConv(layers.Layer):
self._in_feats = in_feats
self._out_feats = out_feats
self._norm = norm
self._allow_zero_in_degree = allow_zero_in_degree
if weight:
xinit = tf.keras.initializers.glorot_uniform()
......@@ -99,37 +162,59 @@ class GraphConv(layers.Layer):
self._activation = activation
def call(self, graph, feat, weight=None):
r"""Compute graph convolution.
r"""
Notes
-----
* Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
dimensions, :math:`N` is the number of nodes.
* Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
the same shape as the input.
* Weight shape: :math:`(\text{in_feats}, \text{out_feats})`.
Description
-----------
Compute graph convolution.
Parameters
----------
graph : DGLGraph
The graph.
feat : tf.Tensor or pair of tf.Tensor
If a single tensor is given, the input feature of shape :math:`(N, D_{in})` where
:math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
If a pair of tensors are given, the pair must contain two tensors of shape
:math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`.
Note that in the special case of graph convolutional networks, if a pair of
tensors is given, the latter element will not participate in computation.
feat : tf.Tensor or pair of tf.Tensor
If a tf.Tensor is given, it represents the input feature of shape
:math:`(N, D_{in})`
where :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
If a pair of tf.Tensor is given, which is the case for a bipartite graph, the pair
must contain two tensors of shape :math:`(N_{in}, D_{in_{src}})` and
:math:`(N_{out}, D_{in_{dst}})`.
weight : tf.Tensor, optional
Optional external weight tensor.
Returns
-------
tf.Tensor
tf.Tensor
The output feature
Raises
------
DGLError
If there are 0-in-degree nodes in the input graph, it will raise a DGLError
since no message will be passed to those nodes. This will cause invalid output.
The error can be ignored by setting the ``allow_zero_in_degree`` parameter to ``True``.
Notes
-----
* Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
dimensions, :math:`N` is the number of nodes.
* Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
the same shape as the input.
* Weight shape: :math:`(\text{in_feats}, \text{out_feats})`.
"""
with graph.local_scope():
if not self._allow_zero_in_degree:
if tf.math.count_nonzero(graph.in_degrees() == 0) > 0:
raise DGLError('There are 0-in-degree nodes in the graph, '
'output for those nodes will be invalid. '
'This is harmful for some applications, '
'causing silent performance regression. '
'Adding self-loop on the input graph by '
'calling `g = dgl.add_self_loop(g)` will resolve '
'the issue. Setting ``allow_zero_in_degree`` '
'to be `True` when constructing this module will '
'suppress the check and let the code run.')
feat_src, feat_dst = expand_as_pair(feat, graph)
if self._norm == 'both':
......
......@@ -314,7 +314,7 @@ def test_dense_cheb_conv():
@parametrize_dtype
@pytest.mark.parametrize('norm_type', ['both', 'right', 'none'])
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite'], exclude=['zero-degree']))
def test_dense_graph_conv(idtype, g, norm_type):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -692,9 +692,9 @@ def test_hetero_conv(agg, idtype):
('store', 'sells', 'game'): [(0, 0), (0, 3), (1, 1), (1, 2)]},
idtype=idtype, device=F.ctx())
conv = nn.HeteroGraphConv({
'follows': nn.GraphConv(2, 3),
'plays': nn.GraphConv(2, 4),
'sells': nn.GraphConv(3, 4)},
'follows': nn.GraphConv(2, 3, allow_zero_in_degree=True),
'plays': nn.GraphConv(2, 4, allow_zero_in_degree=True),
'sells': nn.GraphConv(3, 4, allow_zero_in_degree=True)},
agg)
conv.initialize(ctx=F.ctx())
print(conv)
......
......@@ -306,6 +306,7 @@ def test_rgcn():
rgc_basis_low = nn.RelGraphConv(I, O, R, "basis", B, low_mem=True).to(ctx)
rgc_basis_low.weight = rgc_basis.weight
rgc_basis_low.w_comp = rgc_basis.w_comp
rgc_basis_low.loop_weight = rgc_basis.loop_weight
h = th.randn((100, I)).to(ctx)
r = th.tensor(etype).to(ctx)
h_new = rgc_basis(g, h, r)
......@@ -317,6 +318,7 @@ def test_rgcn():
rgc_bdd = nn.RelGraphConv(I, O, R, "bdd", B).to(ctx)
rgc_bdd_low = nn.RelGraphConv(I, O, R, "bdd", B, low_mem=True).to(ctx)
rgc_bdd_low.weight = rgc_bdd.weight
rgc_bdd_low.loop_weight = rgc_bdd.loop_weight
h = th.randn((100, I)).to(ctx)
r = th.tensor(etype).to(ctx)
h_new = rgc_bdd(g, h, r)
......@@ -332,6 +334,7 @@ def test_rgcn():
rgc_basis_low = nn.RelGraphConv(I, O, R, "basis", B, low_mem=True).to(ctx)
rgc_basis_low.weight = rgc_basis.weight
rgc_basis_low.w_comp = rgc_basis.w_comp
rgc_basis_low.loop_weight = rgc_basis.loop_weight
h = th.randn((100, I)).to(ctx)
r = th.tensor(etype).to(ctx)
h_new = rgc_basis(g, h, r, norm)
......@@ -343,6 +346,7 @@ def test_rgcn():
rgc_bdd = nn.RelGraphConv(I, O, R, "bdd", B).to(ctx)
rgc_bdd_low = nn.RelGraphConv(I, O, R, "bdd", B, low_mem=True).to(ctx)
rgc_bdd_low.weight = rgc_bdd.weight
rgc_bdd_low.loop_weight = rgc_bdd.loop_weight
h = th.randn((100, I)).to(ctx)
r = th.tensor(etype).to(ctx)
h_new = rgc_bdd(g, h, r, norm)
......@@ -356,6 +360,7 @@ def test_rgcn():
rgc_basis_low = nn.RelGraphConv(I, O, R, "basis", B, low_mem=True).to(ctx)
rgc_basis_low.weight = rgc_basis.weight
rgc_basis_low.w_comp = rgc_basis.w_comp
rgc_basis_low.loop_weight = rgc_basis.loop_weight
h = th.randint(0, I, (100,)).to(ctx)
r = th.tensor(etype).to(ctx)
h_new = rgc_basis(g, h, r)
......@@ -365,7 +370,7 @@ def test_rgcn():
assert F.allclose(h_new, h_new_low)
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite'], exclude=['zero-degree']))
def test_gat_conv(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -376,7 +381,7 @@ def test_gat_conv(g, idtype):
assert h.shape == (g.number_of_nodes(), 4, 2)
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['bipartite']))
@pytest.mark.parametrize('g', get_cases(['bipartite'], exclude=['zero-degree']))
def test_gat_conv_bi(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -387,7 +392,7 @@ def test_gat_conv_bi(g, idtype):
assert h.shape == (g.number_of_dst_nodes(), 4, 2)
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite'], exclude=['zero-degree']))
@pytest.mark.parametrize('aggre_type', ['mean', 'pool', 'gcn', 'lstm'])
def test_sage_conv(idtype, g, aggre_type):
g = g.astype(idtype).to(F.ctx())
......@@ -398,7 +403,7 @@ def test_sage_conv(idtype, g, aggre_type):
assert h.shape[-1] == 10
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['bipartite']))
@pytest.mark.parametrize('g', get_cases(['bipartite'], exclude=['zero-degree']))
@pytest.mark.parametrize('aggre_type', ['mean', 'pool', 'gcn', 'lstm'])
def test_sage_conv_bi(idtype, g, aggre_type):
g = g.astype(idtype).to(F.ctx())
......@@ -417,14 +422,14 @@ def test_sage_conv2(idtype):
g = dgl.bipartite([], num_nodes=(5, 3))
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
sage = nn.SAGEConv((3, 3), 2, 'gcn')
sage = nn.SAGEConv((3, 3), 2, 'gcn', allow_zero_in_degree=True)
feat = (F.randn((5, 3)), F.randn((3, 3)))
sage = sage.to(ctx)
h = sage(g, (F.copy_to(feat[0], F.ctx()), F.copy_to(feat[1], F.ctx())))
assert h.shape[-1] == 2
assert h.shape[0] == 3
for aggre_type in ['mean', 'pool', 'lstm']:
sage = nn.SAGEConv((3, 1), 2, aggre_type)
sage = nn.SAGEConv((3, 1), 2, aggre_type, allow_zero_in_degree=True)
feat = (F.randn((5, 3)), F.randn((3, 1)))
sage = sage.to(ctx)
h = sage(g, feat)
......@@ -463,7 +468,7 @@ def test_appnp_conv():
assert h.shape[-1] == 5
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite'], exclude=['zero-degree']))
@pytest.mark.parametrize('aggregator_type', ['mean', 'max', 'sum'])
def test_gin_conv(g, idtype, aggregator_type):
g = g.astype(idtype).to(F.ctx())
......@@ -478,7 +483,7 @@ def test_gin_conv(g, idtype, aggregator_type):
assert h.shape == (g.number_of_nodes(), 12)
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['bipartite']))
@pytest.mark.parametrize('g', get_cases(['bipartite'], exclude=['zero-degree']))
@pytest.mark.parametrize('aggregator_type', ['mean', 'max', 'sum'])
def test_gin_conv_bi(g, idtype, aggregator_type):
g = g.astype(idtype).to(F.ctx())
......@@ -493,7 +498,7 @@ def test_gin_conv_bi(g, idtype, aggregator_type):
assert h.shape == (g.number_of_dst_nodes(), 12)
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite'], exclude=['zero-degree']))
def test_agnn_conv(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -504,7 +509,7 @@ def test_agnn_conv(g, idtype):
assert h.shape == (g.number_of_nodes(), 5)
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['bipartite']))
@pytest.mark.parametrize('g', get_cases(['bipartite'], exclude=['zero-degree']))
def test_agnn_conv_bi(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -529,7 +534,7 @@ def test_gated_graph_conv():
assert h.shape[-1] == 10
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite'], exclude=['zero-degree']))
def test_nn_conv(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -543,7 +548,7 @@ def test_nn_conv(g, idtype):
assert h.shape[-1] == 10
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['bipartite']))
@pytest.mark.parametrize('g', get_cases(['bipartite'], exclude=['zero-degree']))
def test_nn_conv_bi(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -559,7 +564,7 @@ def test_nn_conv_bi(g, idtype):
assert h.shape[-1] == 10
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo']))
@pytest.mark.parametrize('g', get_cases(['homo'], exclude=['zero-degree']))
def test_gmm_conv(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -572,7 +577,7 @@ def test_gmm_conv(g, idtype):
assert h.shape[-1] == 10
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['bipartite', 'block-bipartite']))
@pytest.mark.parametrize('g', get_cases(['bipartite', 'block-bipartite'], exclude=['zero-degree']))
def test_gmm_conv_bi(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -587,7 +592,7 @@ def test_gmm_conv_bi(g, idtype):
@parametrize_dtype
@pytest.mark.parametrize('norm_type', ['both', 'right', 'none'])
@pytest.mark.parametrize('g', get_cases(['homo', 'bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'bipartite'], exclude=['zero-degree']))
def test_dense_graph_conv(norm_type, g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -605,7 +610,7 @@ def test_dense_graph_conv(norm_type, g, idtype):
assert F.allclose(out_conv, out_dense_conv)
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo', 'bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'bipartite'], exclude=['zero-degree']))
def test_dense_sage_conv(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -628,7 +633,7 @@ def test_dense_sage_conv(g, idtype):
assert F.allclose(out_sage, out_dense_sage), g
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite']))
@pytest.mark.parametrize('g', get_cases(['homo', 'block-bipartite'], exclude=['zero-degree']))
def test_edge_conv(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -639,7 +644,7 @@ def test_edge_conv(g, idtype):
assert h1.shape == (g.number_of_nodes(), 2)
@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['bipartite']))
@pytest.mark.parametrize('g', get_cases(['bipartite'], exclude=['zero-degree']))
def test_edge_conv_bi(g, idtype):
g = g.astype(idtype).to(F.ctx())
ctx = F.ctx()
......@@ -770,9 +775,9 @@ def test_hetero_conv(agg, idtype):
('store', 'sells', 'game'): [(0, 0), (0, 3), (1, 1), (1, 2)]},
idtype=idtype, device=F.ctx())
conv = nn.HeteroGraphConv({
'follows': nn.GraphConv(2, 3),
'plays': nn.GraphConv(2, 4),
'sells': nn.GraphConv(3, 4)},
'follows': nn.GraphConv(2, 3, allow_zero_in_degree=True),
'plays': nn.GraphConv(2, 4, allow_zero_in_degree=True),
'sells': nn.GraphConv(3, 4, allow_zero_in_degree=True)},
agg)
conv = conv.to(F.ctx())
uf = F.randn((4, 2))
......@@ -808,9 +813,9 @@ def test_hetero_conv(agg, idtype):
# test with pair input
conv = nn.HeteroGraphConv({
'follows': nn.SAGEConv(2, 3, 'mean'),
'plays': nn.SAGEConv((2, 4), 4, 'mean'),
'sells': nn.SAGEConv(3, 4, 'mean')},
'follows': nn.SAGEConv(2, 3, 'mean', allow_zero_in_degree=True),
'plays': nn.SAGEConv((2, 4), 4, 'mean', allow_zero_in_degree=True),
'sells': nn.SAGEConv(3, 4, 'mean', allow_zero_in_degree=True)},
agg)
conv = conv.to(F.ctx())
......@@ -886,4 +891,4 @@ if __name__ == '__main__':
test_dense_cheb_conv()
test_sequential()
test_atomic_conv()
test_cf_conv()
test_cf_conv()
\ No newline at end of file
......@@ -380,9 +380,9 @@ def test_hetero_conv(agg, idtype):
('store', 'sells', 'game'): [(0, 0), (0, 3), (1, 1), (1, 2)]},
idtype=idtype, device=F.ctx())
conv = nn.HeteroGraphConv({
'follows': nn.GraphConv(2, 3),
'plays': nn.GraphConv(2, 4),
'sells': nn.GraphConv(3, 4)},
'follows': nn.GraphConv(2, 3, allow_zero_in_degree=True),
'plays': nn.GraphConv(2, 4, allow_zero_in_degree=True),
'sells': nn.GraphConv(3, 4, allow_zero_in_degree=True)},
agg)
uf = F.randn((4, 2))
gf = F.randn((4, 4))
......
......@@ -69,9 +69,9 @@ def heterograph0():
@register_case(['batched', 'homo'])
def batched_graph0():
g1 = dgl.graph(([0, 1, 2], [1, 2, 3]))
g2 = dgl.graph(([1, 1], [2, 0]))
g3 = dgl.graph(([0], [1]))
g1 = dgl.add_self_loop(dgl.graph(([0, 1, 2], [1, 2, 3])))
g2 = dgl.add_self_loop(dgl.graph(([1, 1], [2, 0])))
g3 = dgl.add_self_loop(dgl.graph(([0], [1])))
return dgl.batch([g1, g2, g3])
@register_case(['block', 'bipartite', 'block-biparitite'])
......