[Refactor] Break NN modules into files (#859)

* break nn modules into files * break mxnet nn modules * fix lint * fix lint

[Refactor] Break NN modules into files (#859)
* break nn modules into files * break mxnet nn modules * fix lint * fix lint
ddb5d804 · VoVAllen · GitHub · 4cd5c19e · ddb5d804 · ddb5d804
Unverified Commit ddb5d804 authored Sep 16, 2019 by VoVAllen Committed by GitHub Sep 16, 2019
20 changed files
--- a/python/dgl/nn/mxnet/conv/__init__.py
+++ b/python/dgl/nn/mxnet/conv/__init__.py
+"""MXNet modules for graph convolutions."""
+# pylint: disable= no-member, arguments-differ, invalid-name
+
+from .graphconv import GraphConv
+from .relgraphconv import RelGraphConv
+from .tagconv import TAGConv
+
+__all__ = ['GraphConv', 'TAGConv', 'RelGraphConv']
--- a/python/dgl/nn/mxnet/conv/graphconv.py
+++ b/python/dgl/nn/mxnet/conv/graphconv.py
+"""MXNet modules for graph convolutions(GCN)"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import math
+
+import mxnet as mx
+from mxnet import gluon
+
+from .... import function as fn
+
+
+class GraphConv(gluon.Block):
+    r"""Graph convolution layer (GCN) for the MXNet backend.
+
+    The layer implements the propagation rule of
+    `GCN <https://arxiv.org/abs/1609.02907>`__:
+
+    .. math::
+      h_i^{(l+1)} = \sigma(b^{(l)} + \sum_{j\in\mathcal{N}(i)}\frac{1}{c_{ij}}h_j^{(l)}W^{(l)})
+
+    Here :math:`\mathcal{N}(i)` denotes the neighbors of node :math:`i`,
+    :math:`c_{ij} = \sqrt{|\mathcal{N}(i)|}\sqrt{|\mathcal{N}(j)|}` is the
+    symmetric degree normalizer and :math:`\sigma` is an activation function.
+
+    As in the
+    `original implementation <https://github.com/tkipf/gcn/blob/master/gcn/layers.py>`__,
+    the weight :math:`W^{(l)}` uses Glorot (Xavier) initialization and the
+    bias starts at zero.
+
+    Notes
+    -----
+    In-degrees are clipped to a minimum of 1 when the normalizer is computed.
+    For graphs containing zero in-degree nodes, a common practice is to add a
+    self-loop to every node first:
+
+    >>> g = ... # some DGLGraph
+    >>> g.add_edges(g.nodes(), g.nodes())
+
+
+    Parameters
+    ----------
+    in_feats : int
+        Size of each input feature vector.
+    out_feats : int
+        Size of each output feature vector.
+    norm : bool, optional
+        Whether to apply the normalizer :math:`c_{ij}`. Default: ``True``.
+    bias : bool, optional
+        Whether to add a learnable bias to the output. Default: ``True``.
+    activation: callable activation function/layer or None, optional
+        Applied to the updated node features when not None.
+        Default: ``None``.
+
+    Attributes
+    ----------
+    weight : mxnet.gluon.parameter.Parameter
+        Learnable weight of shape ``(in_feats, out_feats)``.
+    bias : mxnet.gluon.parameter.Parameter
+        Learnable bias of shape ``(out_feats,)``, or ``None`` when disabled.
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 norm=True,
+                 bias=True,
+                 activation=None):
+        super(GraphConv, self).__init__()
+        self._in_feats, self._out_feats = in_feats, out_feats
+        self._norm = norm
+
+        with self.name_scope():
+            glorot = mx.init.Xavier(magnitude=math.sqrt(2.0))
+            self.weight = self.params.get(
+                'weight', shape=(in_feats, out_feats), init=glorot)
+            # The bias parameter only exists when requested; otherwise the
+            # attribute is an explicit None that forward() checks for.
+            self.bias = (
+                self.params.get('bias', shape=(out_feats,), init=mx.init.Zero())
+                if bias else None)
+
+        self._activation = activation
+
+    def forward(self, graph, feat):
+        r"""Run one graph convolution over ``feat``.
+
+        Notes
+        -----
+            * Input shape: :math:`(N, *, \text{in_feats})`, with :math:`N` the
+              number of nodes and * any number of extra dimensions.
+            * Output shape: :math:`(N, *, \text{out_feats})`; every dimension
+              except the last matches the input.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : mxnet.NDArray
+            The input feature
+
+        Returns
+        -------
+        mxnet.NDArray
+            The output feature
+        """
+        # Work on a local view so the temporary 'h' field does not leak into
+        # the caller's graph.
+        graph = graph.local_var()
+
+        if self._norm:
+            in_degs = graph.in_degrees().astype('float32')
+            clipped = mx.nd.clip(in_degs, a_min=1, a_max=float("inf"))
+            norm = mx.nd.power(clipped, -0.5)
+            # Append singleton axes so the per-node factor broadcasts over
+            # all remaining feature dimensions.
+            norm = norm.reshape(norm.shape + (1,) * (feat.ndim - 1))
+            norm = norm.as_in_context(feat.context)
+            feat = feat * norm
+
+        # Apply W on whichever side of the aggregation carries fewer features.
+        project_first = self._in_feats > self._out_feats
+        if project_first:
+            feat = mx.nd.dot(feat, self.weight.data(feat.context))
+
+        graph.ndata['h'] = feat
+        graph.update_all(fn.copy_src(src='h', out='m'),
+                         fn.sum(msg='m', out='h'))
+        rst = graph.ndata.pop('h')
+
+        if not project_first:
+            rst = mx.nd.dot(rst, self.weight.data(feat.context))
+
+        if self._norm:
+            rst = rst * norm
+        if self.bias is not None:
+            rst = rst + self.bias.data(rst.context)
+        if self._activation is not None:
+            rst = self._activation(rst)
+        return rst
+
+    def __repr__(self):
+        details = 'in={:d}, out={:d}, normalization={}, activation={}'.format(
+            self._in_feats, self._out_feats,
+            self._norm, self._activation)
+        return 'GraphConv(' + details + '\n)'
--- a/python/dgl/nn/mxnet/conv.py
+++ b/python/dgl/nn/mxnet/conv.py
-"""MXNet modules for graph convolutions."""
+"""MXNet module for RelGraphConv"""
 # pylint: disable= no-member, arguments-differ, invalid-name
 import math
+import numpy as np
+
 import mxnet as mx
 from mxnet import gluon, nd
 from mxnet.gluon import nn
-import numpy as np
-
-from . import utils
-from ... import function as fn
-
-__all__ = ['GraphConv', 'TAGConv', 'RelGraphConv']
-
-class GraphConv(gluon.Block):
-    r"""Apply graph convolution over an input signal.
-
-    Graph convolution is introduced in `GCN <https://arxiv.org/abs/1609.02907>`__
-    and can be described as below:
-
-    .. math::
-      h_i^{(l+1)} = \sigma(b^{(l)} + \sum_{j\in\mathcal{N}(i)}\frac{1}{c_{ij}}h_j^{(l)}W^{(l)})
-
-    where :math:`\mathcal{N}(i)` is the neighbor set of node :math:`i`. :math:`c_{ij}` is equal
-    to the product of the square root of node degrees:
-    :math:`\sqrt{|\mathcal{N}(i)|}\sqrt{|\mathcal{N}(j)|}`. :math:`\sigma` is an activation
-    function.
-
-    The model parameters are initialized as in the
-    `original implementation <https://github.com/tkipf/gcn/blob/master/gcn/layers.py>`__ where
-    the weight :math:`W^{(l)}` is initialized using Glorot uniform initialization
-    and the bias is initialized to be zero.
-
-    Notes
-    -----
-    Zero in degree nodes could lead to invalid normalizer. A common practice
-    to avoid this is to add a self-loop for each node in the graph, which
-    can be achieved by:
-
-    >>> g = ... # some DGLGraph
-    >>> g.add_edges(g.nodes(), g.nodes())
-
-
-    Parameters
-    ----------
-    in_feats : int
-        Number of input features.
-    out_feats : int
-        Number of output features.
-    norm : bool, optional
-        If True, the normalizer :math:`c_{ij}` is applied. Default: ``True``.
-    bias : bool, optional
-        If True, adds a learnable bias to the output. Default: ``True``.
-    activation: callable activation function/layer or None, optional
-        If not None, applies an activation function to the updated node features.
-        Default: ``None``.
-
-    Attributes
-    ----------
-    weight : mxnet.gluon.parameter.Parameter
-        The learnable weight tensor.
-    bias : mxnet.gluon.parameter.Parameter
-        The learnable bias tensor.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 norm=True,
-                 bias=True,
-                 activation=None):
-        super(GraphConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._norm = norm
-
-        with self.name_scope():
-            self.weight = self.params.get('weight', shape=(in_feats, out_feats),
-                                          init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
-            if bias:
-                self.bias = self.params.get('bias', shape=(out_feats,),
-                                            init=mx.init.Zero())
-            else:
-                self.bias = None
-
-        self._activation = activation
-
-    def forward(self, graph, feat):
-        r"""Compute graph convolution.
-
-        Notes
-        -----
-            * Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
-              dimensions, :math:`N` is the number of nodes.
-            * Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
-              the same shape as the input.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : mxnet.NDArray
-            The input feature
-
-        Returns
-        -------
-        mxnet.NDArray
-            The output feature
-        """
-        graph = graph.local_var()
-        if self._norm:
-            degs = graph.in_degrees().astype('float32')
-            norm = mx.nd.power(mx.nd.clip(degs, a_min=1, a_max=float("inf")), -0.5)
-            shp = norm.shape + (1,) * (feat.ndim - 1)
-            norm = norm.reshape(shp).as_in_context(feat.context)
-            feat = feat * norm
-
-        if self._in_feats > self._out_feats:
-            # mult W first to reduce the feature size for aggregation.
-            feat = mx.nd.dot(feat, self.weight.data(feat.context))
-            graph.ndata['h'] = feat
-            graph.update_all(fn.copy_src(src='h', out='m'),
-                             fn.sum(msg='m', out='h'))
-            rst = graph.ndata.pop('h')
-        else:
-            # aggregate first then mult W
-            graph.ndata['h'] = feat
-            graph.update_all(fn.copy_src(src='h', out='m'),
-                             fn.sum(msg='m', out='h'))
-            rst = graph.ndata.pop('h')
-            rst = mx.nd.dot(rst, self.weight.data(feat.context))
-
-        if self._norm:
-            rst = rst * norm
-
-        if self.bias is not None:
-            rst = rst + self.bias.data(rst.context)
-
-        if self._activation is not None:
-            rst = self._activation(rst)
-
-        return rst
-
-    def __repr__(self):
-        summary = 'GraphConv('
-        summary += 'in={:d}, out={:d}, normalization={}, activation={}'.format(
-            self._in_feats, self._out_feats,
-            self._norm, self._activation)
-        summary += '\n)'
-        return summary
-
-class TAGConv(gluon.Block):
-    r"""Apply Topology Adaptive Graph Convolutional Network
-
-    .. math::
-        \mathbf{X}^{\prime} = \sum_{k=0}^K \mathbf{D}^{-1/2} \mathbf{A}
-        \mathbf{D}^{-1/2}\mathbf{X} \mathbf{\Theta}_{k},
-
-    where :math:`\mathbf{A}` denotes the adjacency matrix and
-    :math:`D_{ii} = \sum_{j=0} A_{ij}` its diagonal degree matrix.
-
-    Parameters
-    ----------
-    in_feats : int
-        Number of input features.
-    out_feats : int
-        Number of output features.
-    k: int, optional
-        Number of hops :math: `k`. (default: 2)
-    bias: bool, optional
-        If True, adds a learnable bias to the output. Default: ``True``.
-    activation: callable activation function/layer or None, optional
-        If not None, applies an activation function to the updated node features.
-        Default: ``None``.
-
-    Attributes
-    ----------
-    lin : mxnet.gluon.parameter.Parameter
-        The learnable weight tensor.
-    bias : mxnet.gluon.parameter.Parameter
-        The learnable bias tensor.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 k=2,
-                 bias=True,
-                 activation=None):
-        super(TAGConv, self).__init__()
-        self.out_feats = out_feats
-        self.k = k
-        self.bias = bias
-        self.activation = activation
-        self.in_feats = in_feats
-
-        self.lin = self.params.get(
-            'weight', shape=(self.in_feats * (self.k + 1), self.out_feats),
-            init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
-        if self.bias:
-            self.h_bias = self.params.get('bias', shape=(out_feats,),
-                                          init=mx.init.Zero())
-
-    def forward(self, graph, feat):
-        r"""Compute graph convolution
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : mxnet.NDArray
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-
-        Returns
-        -------
-        mxnet.NDArray
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is size of output feature.
-        """
-        graph = graph.local_var()
-
-        degs = graph.in_degrees().astype('float32')
-        norm = mx.nd.power(mx.nd.clip(degs, a_min=1, a_max=float("inf")), -0.5)
-        shp = norm.shape + (1,) * (feat.ndim - 1)
-        norm = norm.reshape(shp).as_in_context(feat.context)
-
-        rst = feat
-        for _ in range(self.k):
-            rst = rst * norm
-            graph.ndata['h'] = rst
-
-            graph.update_all(fn.copy_src(src='h', out='m'),
-                             fn.sum(msg='m', out='h'))
-            rst = graph.ndata['h']
-            rst = rst * norm
-            feat = mx.nd.concat(feat, rst, dim=-1)
-
-        rst = mx.nd.dot(feat, self.lin.data(feat.context))
-        if self.bias is not None:
-            rst = rst + self.h_bias.data(rst.context)
-
-        if self.activation is not None:
-            rst = self.activation(rst)
+from .... import function as fn
+from .. import utils

-        return rst

 class RelGraphConv(gluon.Block):
    r"""Relational graph convolution layer.

--- a/python/dgl/nn/mxnet/conv/tagconv.py
+++ b/python/dgl/nn/mxnet/conv/tagconv.py
+"""MXNet module for TAGConv"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import math
+
+import mxnet as mx
+from mxnet import gluon
+
+from .... import function as fn
+
+
+class TAGConv(gluon.Block):
+    r"""Apply Topology Adaptive Graph Convolutional Network
+
+    .. math::
+        \mathbf{X}^{\prime} = \sum_{k=0}^K \left(\mathbf{D}^{-1/2} \mathbf{A}
+        \mathbf{D}^{-1/2}\right)^{k} \mathbf{X} \mathbf{\Theta}_{k},
+
+    where :math:`\mathbf{A}` denotes the adjacency matrix and
+    :math:`D_{ii} = \sum_{j=0} A_{ij}` its diagonal degree matrix.
+
+    Parameters
+    ----------
+    in_feats : int
+        Number of input features.
+    out_feats : int
+        Number of output features.
+    k: int, optional
+        Number of hops :math:`k`. (default: 2)
+    bias: bool, optional
+        If True, adds a learnable bias to the output. Default: ``True``.
+    activation: callable activation function/layer or None, optional
+        If not None, applies an activation function to the updated node features.
+        Default: ``None``.
+
+    Attributes
+    ----------
+    lin : mxnet.gluon.parameter.Parameter
+        The learnable weight tensor of shape
+        ``(in_feats * (k + 1), out_feats)``.
+    h_bias : mxnet.gluon.parameter.Parameter or None
+        The learnable bias tensor of shape ``(out_feats,)``; ``None`` when
+        the layer was built with ``bias=False``.
+    bias : bool
+        The ``bias`` flag passed to the constructor.
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 k=2,
+                 bias=True,
+                 activation=None):
+        super(TAGConv, self).__init__()
+        self.out_feats = out_feats
+        self.k = k
+        self.bias = bias
+        self.activation = activation
+        self.in_feats = in_feats
+
+        # One weight matrix covering the concatenation of the input features
+        # and the k propagated feature blocks.
+        self.lin = self.params.get(
+            'weight', shape=(self.in_feats * (self.k + 1), self.out_feats),
+            init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
+        if self.bias:
+            self.h_bias = self.params.get('bias', shape=(out_feats,),
+                                          init=mx.init.Zero())
+        else:
+            # Always define the attribute so forward() can test it directly;
+            # ``self.bias`` is only the boolean flag, not the parameter.
+            self.h_bias = None
+
+    def forward(self, graph, feat):
+        r"""Compute graph convolution
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : mxnet.NDArray
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+
+        Returns
+        -------
+        mxnet.NDArray
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is size of output feature.
+        """
+        # Work on a local view so the temporary 'h' field does not leak into
+        # the caller's graph.
+        graph = graph.local_var()
+
+        # Per-node factor deg^{-1/2}, with in-degrees clipped to at least 1.
+        degs = graph.in_degrees().astype('float32')
+        norm = mx.nd.power(mx.nd.clip(degs, a_min=1, a_max=float("inf")), -0.5)
+        # Append singleton axes so the factor broadcasts over feature dims.
+        shp = norm.shape + (1,) * (feat.ndim - 1)
+        norm = norm.reshape(shp).as_in_context(feat.context)
+
+        rst = feat
+        for _ in range(self.k):
+            # One normalized propagation step applied to the previous hop.
+            rst = rst * norm
+            graph.ndata['h'] = rst
+
+            graph.update_all(fn.copy_src(src='h', out='m'),
+                             fn.sum(msg='m', out='h'))
+            rst = graph.ndata['h']
+            rst = rst * norm
+            # Grow the feature matrix with this hop's result along the last axis.
+            feat = mx.nd.concat(feat, rst, dim=-1)
+
+        rst = mx.nd.dot(feat, self.lin.data(feat.context))
+        # Test the parameter itself: ``self.bias`` is a bool, so the previous
+        # ``self.bias is not None`` check was always true and crashed on the
+        # missing ``h_bias`` when the layer was built with bias=False.
+        if self.h_bias is not None:
+            rst = rst + self.h_bias.data(rst.context)
+
+        if self.activation is not None:
+            rst = self.activation(rst)
+
+        return rst
--- a/python/dgl/nn/pytorch/conv.py
+++ b/python/dgl/nn/pytorch/conv.py
-"""Torch modules for graph convolutions."""
-# pylint: disable= no-member, arguments-differ, invalid-name
-import torch as th
-from torch import nn
-from torch.nn import init
-import torch.nn.functional as F
-
-from . import utils
-from ... import function as fn
-from ...batched_graph import broadcast_nodes
-from ...transform import laplacian_lambda_max
-from .softmax import edge_softmax
-
-__all__ = ['GraphConv', 'GATConv', 'TAGConv', 'RelGraphConv', 'SAGEConv',
-           'SGConv', 'APPNPConv', 'GINConv', 'GatedGraphConv', 'GMMConv',
-           'ChebConv', 'AGNNConv', 'NNConv', 'DenseGraphConv', 'DenseSAGEConv',
-           'DenseChebConv', 'EdgeConv']
-
-# pylint: disable=W0235
-class Identity(nn.Module):
-    """A placeholder identity operator that is argument-insensitive.
-    (Identity has already been supported by PyTorch 1.2, we will directly
-    import torch.nn.Identity in the future)
-    """
-    def __init__(self):
-        super(Identity, self).__init__()
-
-    def forward(self, x):
-        """Return input"""
-        return x
-
-# pylint: enable=W0235
-class GraphConv(nn.Module):
-    r"""Apply graph convolution over an input signal.
-
-    Graph convolution is introduced in `GCN <https://arxiv.org/abs/1609.02907>`__
-    and can be described as below:
-
-    .. math::
-      h_i^{(l+1)} = \sigma(b^{(l)} + \sum_{j\in\mathcal{N}(i)}\frac{1}{c_{ij}}h_j^{(l)}W^{(l)})
-
-    where :math:`\mathcal{N}(i)` is the neighbor set of node :math:`i`. :math:`c_{ij}` is equal
-    to the product of the square root of node degrees:
-    :math:`\sqrt{|\mathcal{N}(i)|}\sqrt{|\mathcal{N}(j)|}`. :math:`\sigma` is an activation
-    function.
-
-    The model parameters are initialized as in the
-    `original implementation <https://github.com/tkipf/gcn/blob/master/gcn/layers.py>`__ where
-    the weight :math:`W^{(l)}` is initialized using Glorot uniform initialization
-    and the bias is initialized to be zero.
-
-    Notes
-    -----
-    Zero in degree nodes could lead to invalid normalizer. A common practice
-    to avoid this is to add a self-loop for each node in the graph, which
-    can be achieved by:
-
-    >>> g = ... # some DGLGraph
-    >>> g.add_edges(g.nodes(), g.nodes())
-
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    norm : bool, optional
-        If True, the normalizer :math:`c_{ij}` is applied. Default: ``True``.
-    bias : bool, optional
-        If True, adds a learnable bias to the output. Default: ``True``.
-    activation: callable activation function/layer or None, optional
-        If not None, applies an activation function to the updated node features.
-        Default: ``None``.
-
-    Attributes
-    ----------
-    weight : torch.Tensor
-        The learnable weight tensor.
-    bias : torch.Tensor
-        The learnable bias tensor.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 norm=True,
-                 bias=True,
-                 activation=None):
-        super(GraphConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._norm = norm
-
-        self.weight = nn.Parameter(th.Tensor(in_feats, out_feats))
-        if bias:
-            self.bias = nn.Parameter(th.Tensor(out_feats))
-        else:
-            self.register_parameter('bias', None)
-        self.reset_parameters()
-
-        self._activation = activation
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters."""
-        init.xavier_uniform_(self.weight)
-        if self.bias is not None:
-            init.zeros_(self.bias)
-
-    def forward(self, graph, feat):
-        r"""Compute graph convolution.
-
-        Notes
-        -----
-        * Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
-          dimensions, :math:`N` is the number of nodes.
-        * Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
-          the same shape as the input.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature
-        """
-        graph = graph.local_var()
-        if self._norm:
-            norm = th.pow(graph.in_degrees().float().clamp(min=1), -0.5)
-            shp = norm.shape + (1,) * (feat.dim() - 1)
-            norm = th.reshape(norm, shp).to(feat.device)
-            feat = feat * norm
-
-        if self._in_feats > self._out_feats:
-            # mult W first to reduce the feature size for aggregation.
-            feat = th.matmul(feat, self.weight)
-            graph.ndata['h'] = feat
-            graph.update_all(fn.copy_src(src='h', out='m'),
-                             fn.sum(msg='m', out='h'))
-            rst = graph.ndata['h']
-        else:
-            # aggregate first then mult W
-            graph.ndata['h'] = feat
-            graph.update_all(fn.copy_src(src='h', out='m'),
-                             fn.sum(msg='m', out='h'))
-            rst = graph.ndata['h']
-            rst = th.matmul(rst, self.weight)
-
-        if self._norm:
-            rst = rst * norm
-
-        if self.bias is not None:
-            rst = rst + self.bias
-
-        if self._activation is not None:
-            rst = self._activation(rst)
-
-        return rst
-
-    def extra_repr(self):
-        """Set the extra representation of the module,
-        which will come into effect when printing the model.
-        """
-        summary = 'in={_in_feats}, out={_out_feats}'
-        summary += ', normalization={_norm}'
-        if '_activation' in self.__dict__:
-            summary += ', activation={_activation}'
-        return summary.format(**self.__dict__)
-
-class GATConv(nn.Module):
-    r"""Apply `Graph Attention Network <https://arxiv.org/pdf/1710.10903.pdf>`__
-    over an input signal.
-
-    .. math::
-        h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} \alpha_{i,j} W^{(l)} h_j^{(l)}
-
-    where :math:`\alpha_{ij}` is the attention score bewteen node :math:`i` and
-    node :math:`j`:
-
-    .. math::
-        \alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l})
-
-        e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right)
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    num_heads : int
-        Number of heads in Multi-Head Attention.
-    feat_drop : float, optional
-        Dropout rate on feature, defaults: ``0``.
-    attn_drop : float, optional
-        Dropout rate on attention weight, defaults: ``0``.
-    negative_slope : float, optional
-        LeakyReLU angle of negative slope.
-    residual : bool, optional
-        If True, use residual connection.
-    activation : callable activation function/layer or None, optional.
-        If not None, applies an activation function to the updated node features.
-        Default: ``None``.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 num_heads,
-                 feat_drop=0.,
-                 attn_drop=0.,
-                 negative_slope=0.2,
-                 residual=False,
-                 activation=None):
-        super(GATConv, self).__init__()
-        self._num_heads = num_heads
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self.fc = nn.Linear(in_feats, out_feats * num_heads, bias=False)
-        self.attn_l = nn.Parameter(th.FloatTensor(size=(1, num_heads, out_feats)))
-        self.attn_r = nn.Parameter(th.FloatTensor(size=(1, num_heads, out_feats)))
-        self.feat_drop = nn.Dropout(feat_drop)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.leaky_relu = nn.LeakyReLU(negative_slope)
-        if residual:
-            if in_feats != out_feats:
-                self.res_fc = nn.Linear(in_feats, num_heads * out_feats, bias=False)
-            else:
-                self.res_fc = Identity()
-        else:
-            self.register_buffer('res_fc', None)
-        self.reset_parameters()
-        self.activation = activation
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters."""
-        gain = nn.init.calculate_gain('relu')
-        nn.init.xavier_normal_(self.fc.weight, gain=gain)
-        nn.init.xavier_normal_(self.attn_l, gain=gain)
-        nn.init.xavier_normal_(self.attn_r, gain=gain)
-        if isinstance(self.res_fc, nn.Linear):
-            nn.init.xavier_normal_(self.res_fc.weight, gain=gain)
-
-    def forward(self, graph, feat):
-        r"""Compute graph attention network layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
-            is the number of heads, and :math:`D_{out}` is size of output feature.
-        """
-        graph = graph.local_var()
-        h = self.feat_drop(feat)
-        feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
-        el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
-        er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
-        graph.ndata.update({'ft': feat, 'el': el, 'er': er})
-        # compute edge attention
-        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
-        e = self.leaky_relu(graph.edata.pop('e'))
-        # compute softmax
-        graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
-        # message passing
-        graph.update_all(fn.u_mul_e('ft', 'a', 'm'),
-                         fn.sum('m', 'ft'))
-        rst = graph.ndata['ft']
-        # residual
-        if self.res_fc is not None:
-            resval = self.res_fc(h).view(h.shape[0], -1, self._out_feats)
-            rst = rst + resval
-        # activation
-        if self.activation:
-            rst = self.activation(rst)
-        return rst
-
-
-class TAGConv(nn.Module):
-    r"""Topology Adaptive Graph Convolutional layer from paper `Topology
-    Adaptive Graph Convolutional Networks <https://arxiv.org/pdf/1710.10370.pdf>`__.
-
-    .. math::
-        \mathbf{X}^{\prime} = \sum_{k=0}^K \mathbf{D}^{-1/2} \mathbf{A}
-        \mathbf{D}^{-1/2}\mathbf{X} \mathbf{\Theta}_{k},
-
-    where :math:`\mathbf{A}` denotes the adjacency matrix and
-    :math:`D_{ii} = \sum_{j=0} A_{ij}` its diagonal degree matrix.
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    k: int, optional
-        Number of hops :math: `k`. (default: 2)
-    bias: bool, optional
-        If True, adds a learnable bias to the output. Default: ``True``.
-    activation: callable activation function/layer or None, optional
-        If not None, applies an activation function to the updated node features.
-        Default: ``None``.
-
-    Attributes
-    ----------
-    lin : torch.Module
-        The learnable linear module.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 k=2,
-                 bias=True,
-                 activation=None):
-        super(TAGConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._k = k
-        self._activation = activation
-        self.lin = nn.Linear(in_feats * (self._k + 1), out_feats, bias=bias)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters."""
-        gain = nn.init.calculate_gain('relu')
-        nn.init.xavier_normal_(self.lin.weight, gain=gain)
-
-    def forward(self, graph, feat):
-        r"""Compute topology adaptive graph convolution.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is size of output feature.
-        """
-        graph = graph.local_var()
-
-        norm = th.pow(graph.in_degrees().float().clamp(min=1), -0.5)
-        shp = norm.shape + (1,) * (feat.dim() - 1)
-        norm = th.reshape(norm, shp).to(feat.device)
-
-        #D-1/2 A D -1/2 X
-        fstack = [feat]
-        for _ in range(self._k):
-
-            rst = fstack[-1] * norm
-            graph.ndata['h'] = rst
-
-            graph.update_all(fn.copy_src(src='h', out='m'),
-                             fn.sum(msg='m', out='h'))
-            rst = graph.ndata['h']
-            rst = rst * norm
-            fstack.append(rst)
-
-        rst = self.lin(th.cat(fstack, dim=-1))
-
-        if self._activation is not None:
-            rst = self._activation(rst)
-
-        return rst
-
-class RelGraphConv(nn.Module):
-    r"""Relational graph convolution layer.
-
-    Relational graph convolution is introduced in "`Modeling Relational Data with Graph
-    Convolutional Networks <https://arxiv.org/abs/1703.06103>`__"
-    and can be described as below:
-
-    .. math::
-
-       h_i^{(l+1)} = \sigma(\sum_{r\in\mathcal{R}}
-       \sum_{j\in\mathcal{N}^r(i)}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}+W_0^{(l)}h_i^{(l)})
-
-    where :math:`\mathcal{N}^r(i)` is the neighbor set of node :math:`i` w.r.t. relation
-    :math:`r`. :math:`c_{i,r}` is the normalizer equal
-    to :math:`|\mathcal{N}^r(i)|`. :math:`\sigma` is an activation function. :math:`W_0`
-    is the self-loop weight.
-
-    The basis regularization decomposes :math:`W_r` by:
-
-    .. math::
-
-       W_r^{(l)} = \sum_{b=1}^B a_{rb}^{(l)}V_b^{(l)}
-
-    where :math:`B` is the number of bases.
-
-    The block-diagonal-decomposition regularization decomposes :math:`W_r` into :math:`B`
-    number of block diagonal matrices. We refer :math:`B` as the number of bases.
-
-    Parameters
-    ----------
-    in_feat : int
-        Input feature size.
-    out_feat : int
-        Output feature size.
-    num_rels : int
-        Number of relations.
-    regularizer : str
-        Which weight regularizer to use "basis" or "bdd"
-    num_bases : int, optional
-        Number of bases. If it is None, not positive, or larger than ``num_rels``,
-        the number of relations is used instead. Default: None.
-    bias : bool, optional
-        True if bias is added. Default: True
-    activation : callable, optional
-        Activation function. Default: None
-    self_loop : bool, optional
-        True to include self loop message. Default: False
-    dropout : float, optional
-        Dropout rate. Default: 0.0
-
-    Raises
-    ------
-    ValueError
-        If ``regularizer`` is neither "basis" nor "bdd", or if "bdd" is selected and
-        ``in_feat`` or ``out_feat`` is not divisible by the effective number of bases.
-    """
-    def __init__(self,
-                 in_feat,
-                 out_feat,
-                 num_rels,
-                 regularizer="basis",
-                 num_bases=None,
-                 bias=True,
-                 activation=None,
-                 self_loop=False,
-                 dropout=0.0):
-        super(RelGraphConv, self).__init__()
-        self.in_feat = in_feat
-        self.out_feat = out_feat
-        self.num_rels = num_rels
-        self.regularizer = regularizer
-        self.num_bases = num_bases
-        # Fall back to one basis per relation for missing or unusable values.
-        # Zero is rejected too: it would give empty weights ("basis") or a
-        # division by zero ("bdd").
-        if self.num_bases is None or self.num_bases > self.num_rels or self.num_bases <= 0:
-            self.num_bases = self.num_rels
-        self.bias = bias
-        self.activation = activation
-        self.self_loop = self_loop
-
-        if regularizer == "basis":
-            # add basis weights
-            self.weight = nn.Parameter(th.Tensor(self.num_bases, self.in_feat, self.out_feat))
-            if self.num_bases < self.num_rels:
-                # linear combination coefficients
-                self.w_comp = nn.Parameter(th.Tensor(self.num_rels, self.num_bases))
-            nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))
-            if self.num_bases < self.num_rels:
-                nn.init.xavier_uniform_(self.w_comp,
-                                        gain=nn.init.calculate_gain('relu'))
-            # message func
-            self.message_func = self.basis_message_func
-        elif regularizer == "bdd":
-            # Validate against the effective number of bases (self.num_bases),
-            # not the raw argument: the raw value may be None or may have been
-            # replaced by num_rels above, and the block sizes below are derived
-            # from self.num_bases.
-            if in_feat % self.num_bases != 0 or out_feat % self.num_bases != 0:
-                raise ValueError('Feature size must be a multiplier of num_bases.')
-            # add block diagonal weights
-            self.submat_in = in_feat // self.num_bases
-            self.submat_out = out_feat // self.num_bases
-
-            # one flattened set of num_bases blocks per relation
-            self.weight = nn.Parameter(th.Tensor(
-                self.num_rels, self.num_bases * self.submat_in * self.submat_out))
-            nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))
-            # message func
-            self.message_func = self.bdd_message_func
-        else:
-            raise ValueError("Regularizer must be either 'basis' or 'bdd'")
-
-        # bias
-        if self.bias:
-            self.h_bias = nn.Parameter(th.Tensor(out_feat))
-            nn.init.zeros_(self.h_bias)
-
-        # weight for self loop
-        if self.self_loop:
-            self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat))
-            nn.init.xavier_uniform_(self.loop_weight,
-                                    gain=nn.init.calculate_gain('relu'))
-
-        self.dropout = nn.Dropout(dropout)
-
-    def basis_message_func(self, edges):
-        """Message function for basis regularizer.
-
-        Reads ``edges.src['h']`` and ``edges.data['type']`` and returns the
-        per-edge message under the key ``'msg'``. If ``edges.data`` contains
-        ``'norm'``, the message is multiplied by it.
-        """
-        if self.num_bases < self.num_rels:
-            # generate all relation weights as linear combinations of the bases
-            weight = self.weight.view(self.num_bases,
-                                      self.in_feat * self.out_feat)
-            weight = th.matmul(self.w_comp, weight).view(
-                self.num_rels, self.in_feat, self.out_feat)
-        else:
-            # one basis per relation: the bases are the relation weights
-            weight = self.weight
-
-        msg = utils.bmm_maybe_select(edges.src['h'], weight, edges.data['type'])
-        if 'norm' in edges.data:
-            msg = msg * edges.data['norm']
-        return {'msg': msg}
-
-    def bdd_message_func(self, edges):
-        """Message function for block-diagonal-decomposition regularizer.
-
-        Reads ``edges.src['h']`` and ``edges.data['type']`` and returns the
-        per-edge message under the key ``'msg'``. If ``edges.data`` contains
-        ``'norm'``, the message is multiplied by it.
-
-        Raises
-        ------
-        TypeError
-            If the source feature is a 1-D int64 tensor (integer ID feature).
-        """
-        if edges.src['h'].dtype == th.int64 and len(edges.src['h'].shape) == 1:
-            raise TypeError('Block decomposition does not allow integer ID feature.')
-        # pick the blocks of each edge's relation: (E * num_bases, sub_in, sub_out)
-        weight = self.weight.index_select(0, edges.data['type']).view(
-            -1, self.submat_in, self.submat_out)
-        # split every source feature into num_bases chunks of size sub_in
-        node = edges.src['h'].view(-1, 1, self.submat_in)
-        msg = th.bmm(node, weight).view(-1, self.out_feat)
-        if 'norm' in edges.data:
-            msg = msg * edges.data['norm']
-        return {'msg': msg}
-
-    def forward(self, g, x, etypes, norm=None):
-        """ Forward computation
-
-        Parameters
-        ----------
-        g : DGLGraph
-            The graph.
-        x : torch.Tensor
-            Input node features. Could be either
-                * :math:`(|V|, D)` dense tensor
-                * :math:`(|V|,)` int64 vector, representing the categorical values of each
-                  node. We then treat the input feature as an one-hot encoding feature.
-        etypes : torch.Tensor
-            Edge type tensor. Shape: :math:`(|E|,)`
-        norm : torch.Tensor
-            Optional edge normalizer tensor. Shape: :math:`(|E|, 1)`
-
-        Returns
-        -------
-        torch.Tensor
-            New node features.
-        """
-        g = g.local_var()
-        g.ndata['h'] = x
-        g.edata['type'] = etypes
-        if norm is not None:
-            g.edata['norm'] = norm
-        if self.self_loop:
-            # computed from the raw input before 'h' is overwritten below
-            loop_message = utils.matmul_maybe_select(x, self.loop_weight)
-        # message passing
-        g.update_all(self.message_func, fn.sum(msg='msg', out='h'))
-        # apply bias and activation
-        node_repr = g.ndata['h']
-        if self.bias:
-            node_repr = node_repr + self.h_bias
-        if self.self_loop:
-            node_repr = node_repr + loop_message
-        if self.activation:
-            node_repr = self.activation(node_repr)
-        node_repr = self.dropout(node_repr)
-        return node_repr
-
-
-class EdgeConv(nn.Module):
-    r"""EdgeConv layer.
-
-    Proposed in "`Dynamic Graph CNN for Learning on Point Clouds
-    <https://arxiv.org/pdf/1801.07829>`__", where it is written as:
-
-    .. math::
-
-       x_i^{(l+1)} = \max_{j \in \mathcal{N}(i)} \mathrm{ReLU}(
-       \Theta \cdot (x_j^{(l)} - x_i^{(l)}) + \Phi \cdot x_i^{(l)})
-
-    with :math:`\mathcal{N}(i)` the neighbor set of :math:`i`.
-
-    NOTE(review): as implemented, :meth:`message` evaluates
-    ``theta(x_dst - x_src) + phi(x_src)`` and no ReLU is applied inside this
-    module -- confirm whether that is the intended reading of the formula.
-
-    Parameters
-    ----------
-    in_feat : int
-        Input feature size.
-    out_feat : int
-        Output feature size.
-    batch_norm : bool
-        Whether to include batch normalization on messages.
-    """
-    def __init__(self, in_feat, out_feat, batch_norm=False):
-        super(EdgeConv, self).__init__()
-        self.batch_norm = batch_norm
-        self.theta = nn.Linear(in_feat, out_feat)
-        self.phi = nn.Linear(in_feat, out_feat)
-        # The normalization layer only exists when it was requested.
-        if batch_norm:
-            self.bn = nn.BatchNorm1d(out_feat)
-
-    def message(self, edges):
-        """Build the per-edge message stored under the key ``'e'``.
-
-        The message is ``theta`` applied to the destination-minus-source
-        feature difference plus ``phi`` applied to the source feature.
-        """
-        x_src = edges.src['x']
-        x_dst = edges.dst['x']
-        return {'e': self.theta(x_dst - x_src) + self.phi(x_src)}
-
-    def forward(self, g, h):
-        """Forward computation
-
-        Parameters
-        ----------
-        g : DGLGraph
-            The graph.
-        h : Tensor
-            :math:`(N, D)` where :math:`N` is the number of nodes and
-            :math:`D` is the number of feature dimensions.
-
-        Returns
-        -------
-        torch.Tensor
-            New node features.
-        """
-        with g.local_scope():
-            g.ndata['x'] = h
-            if self.batch_norm:
-                g.apply_edges(self.message)
-                # The official implementation uses a per-edge batch norm
-                # inside EdgeConv. A global batch norm over all edge messages
-                # is used here instead, for these reasons:
-                #
-                # (1) Batch norm would not work when the point clouds in a
-                #     batch do not all have the same number of points.
-                #
-                # (2) Even with equally sized point clouds, the points may be
-                #     shuffled within the same (type of) object; the official
-                #     implementation *does* shuffle the points of an example
-                #     every epoch. For instance, the first point of an
-                #     airplane point cloud is not always at its nose. The
-                #     per-position statistics batch norm would learn are
-                #     therefore less meaningful than those learned on images.
-                g.edata['e'] = self.bn(g.edata['e'])
-                g.update_all(fn.copy_e('e', 'e'), fn.max('e', 'x'))
-            else:
-                g.update_all(self.message, fn.max('e', 'x'))
-            return g.ndata['x']
-
-
-class SAGEConv(nn.Module):
-    r"""GraphSAGE layer from paper `Inductive Representation Learning on
-    Large Graphs <https://arxiv.org/pdf/1706.02216.pdf>`__.
-
-    .. math::
-        h_{\mathcal{N}(i)}^{(l+1)} & = \mathrm{aggregate}
-        \left(\{h_{j}^{l}, \forall j \in \mathcal{N}(i) \}\right)
-
-        h_{i}^{(l+1)} & = \sigma \left(W \cdot \mathrm{concat}
-        (h_{i}^{l}, h_{\mathcal{N}(i)}^{l+1} + b) \right)
-
-        h_{i}^{(l+1)} & = \mathrm{norm}(h_{i}^{l})
-
-    In this implementation the concatenation is realized as the sum of two
-    linear maps (``fc_self`` on the node itself and ``fc_neigh`` on the
-    aggregated neighborhood); the ``gcn`` aggregator only uses ``fc_neigh``.
-    The activation is applied before the normalization.
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    feat_drop : float
-        Dropout rate on features, default: ``0``.
-    aggregator_type : str
-        Aggregator type to use (``mean``, ``gcn``, ``pool``, ``lstm``).
-    bias : bool
-        If True, adds a learnable bias to the output. Default: ``True``.
-    norm : callable activation function/layer or None, optional
-        If not None, applies normalization to the updated node features.
-    activation : callable activation function/layer or None, optional
-        If not None, applies an activation function to the updated node features.
-        Default: ``None``.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 aggregator_type,
-                 feat_drop=0.,
-                 bias=True,
-                 norm=None,
-                 activation=None):
-        super(SAGEConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._aggre_type = aggregator_type
-        self.norm = norm
-        self.feat_drop = nn.Dropout(feat_drop)
-        self.activation = activation
-        # Sub-modules that only some aggregators need.
-        if aggregator_type == 'pool':
-            self.fc_pool = nn.Linear(in_feats, in_feats)
-        if aggregator_type == 'lstm':
-            self.lstm = nn.LSTM(in_feats, in_feats, batch_first=True)
-        # The gcn aggregator folds the node itself into the neighborhood,
-        # so it has no separate self projection.
-        if aggregator_type != 'gcn':
-            self.fc_self = nn.Linear(in_feats, out_feats, bias=bias)
-        self.fc_neigh = nn.Linear(in_feats, out_feats, bias=bias)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        """Reinitialize the learnable parameters of the sub-modules in use.
-
-        Linear weights get Xavier uniform values with the ReLU gain; the LSTM
-        is reset through its own ``reset_parameters``.
-        """
-        relu_gain = nn.init.calculate_gain('relu')
-        aggre = self._aggre_type
-        if aggre == 'pool':
-            nn.init.xavier_uniform_(self.fc_pool.weight, gain=relu_gain)
-        if aggre == 'lstm':
-            self.lstm.reset_parameters()
-        if aggre != 'gcn':
-            nn.init.xavier_uniform_(self.fc_self.weight, gain=relu_gain)
-        nn.init.xavier_uniform_(self.fc_neigh.weight, gain=relu_gain)
-
-    def _lstm_reducer(self, nodes):
-        """Reduce the mailbox ``'m'`` with the LSTM.
-
-        The final hidden state of the LSTM, started from all-zero states, is
-        returned under the key ``'neigh'``.
-
-        NOTE(zihao): lstm reducer with default schedule (degree bucketing)
-        is slow, we could accelerate this with degree padding in the future.
-        """
-        msgs = nodes.mailbox['m'] # (B, L, D)
-        state_shape = (1, msgs.shape[0], self._in_feats)
-        init_state = (msgs.new_zeros(state_shape), msgs.new_zeros(state_shape))
-        last_hidden = self.lstm(msgs, init_state)[1][0]
-        return {'neigh': last_hidden.squeeze(0)}
-
-    def forward(self, graph, feat):
-        r"""Compute GraphSAGE layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is size of output feature.
-
-        Raises
-        ------
-        KeyError
-            If the aggregator type given at construction is not recognized.
-        """
-        graph = graph.local_var()
-        feat = self.feat_drop(feat)
-        aggre = self._aggre_type
-        if aggre not in ('mean', 'gcn', 'pool', 'lstm'):
-            raise KeyError('Aggregator type {} not recognized.'.format(self._aggre_type))
-
-        # What every node sends to its out-neighbors.
-        if aggre == 'pool':
-            graph.ndata['h'] = F.relu(self.fc_pool(feat))
-        else:
-            graph.ndata['h'] = feat
-
-        # How a node combines what it receives.
-        if aggre == 'mean':
-            reducer = fn.mean('m', 'neigh')
-        elif aggre == 'gcn':
-            reducer = fn.sum('m', 'neigh')
-        elif aggre == 'pool':
-            reducer = fn.max('m', 'neigh')
-        else:
-            reducer = self._lstm_reducer
-        graph.update_all(fn.copy_src('h', 'm'), reducer)
-        h_neigh = graph.ndata['neigh']
-
-        if aggre == 'gcn':
-            # Average over the neighbors plus the node itself:
-            # divide by in-degree + 1. GraphSAGE GCN does not require fc_self.
-            degs = graph.in_degrees().float().to(feat.device)
-            h_neigh = (h_neigh + graph.ndata['h']) / (degs.unsqueeze(-1) + 1)
-            rst = self.fc_neigh(h_neigh)
-        else:
-            rst = self.fc_self(feat) + self.fc_neigh(h_neigh)
-
-        # activation first, then normalization
-        if self.activation is not None:
-            rst = self.activation(rst)
-        if self.norm is not None:
-            rst = self.norm(rst)
-        return rst
-
-
-class GatedGraphConv(nn.Module):
-    r"""Gated Graph Convolution layer from paper `Gated Graph Sequence
-    Neural Networks <https://arxiv.org/pdf/1511.05493.pdf>`__.
-
-    .. math::
-        h_{i}^{0} & = [ x_i \| \mathbf{0} ]
-
-        a_{i}^{t} & = \sum_{j\in\mathcal{N}(i)} W_{e_{ij}} h_{j}^{t}
-
-        h_{i}^{t+1} & = \mathrm{GRU}(a_{i}^{t}, h_{i}^{t})
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    n_steps : int
-        Number of recurrent steps.
-    n_etypes : int
-        Number of edge types.
-    bias : bool
-        If True, adds a learnable bias to the output. Default: ``True``.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 n_steps,
-                 n_etypes,
-                 bias=True):
-        super(GatedGraphConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._n_steps = n_steps
-        # One flattened (out_feats x out_feats) matrix per edge type.
-        self.edge_embed = nn.Embedding(n_etypes, out_feats * out_feats)
-        self.gru = nn.GRUCell(out_feats, out_feats, bias=bias)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters.
-
-        The GRU cell is reset through its own ``reset_parameters``; the edge
-        type embedding gets Xavier normal values with the ReLU gain.
-        """
-        self.gru.reset_parameters()
-        init.xavier_normal_(self.edge_embed.weight,
-                            gain=init.calculate_gain('relu'))
-
-    def forward(self, graph, feat, etypes):
-        """Compute Gated Graph Convolution layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`N`
-            is the number of nodes of the graph and :math:`D_{in}` is the
-            input feature size.
-        etypes : torch.LongTensor
-            The edge type tensor of shape :math:`(E,)` where :math:`E` is
-            the number of edges of the graph.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is the output feature size.
-        """
-        graph = graph.local_var()
-        width = self._out_feats
-        # Right-pad the input with zeros up to the output width.
-        padding = feat.new_zeros((feat.shape[0], width - feat.shape[1]))
-        feat = th.cat([feat, padding], -1)
-        # NOTE(zihao): there is still room to optimize, we may do kernel fusion
-        # for such operations in the future.
-        edge_weight = self.edge_embed(etypes)
-        graph.edata['w'] = edge_weight.view(-1, width, width)
-        for _ in range(self._n_steps):
-            graph.ndata['h'] = feat.unsqueeze(-1) # (N, D, 1)
-            graph.update_all(fn.u_mul_e('h', 'w', 'm'),
-                             fn.sum('m', 'a'))
-            aggregated = graph.ndata.pop('a')
-            feat = self.gru(aggregated.sum(dim=1), feat) # GRU input is (N, D)
-        return feat
-
-
-class GMMConv(nn.Module):
-    r"""The Gaussian Mixture Model Convolution layer from `Geometric Deep
-    Learning on Graphs and Manifolds using Mixture Model CNNs
-    <http://openaccess.thecvf.com/content_cvpr_2017/papers/Monti_Geometric_Deep_Learning_CVPR_2017_paper.pdf>`__.
-
-    .. math::
-        h_i^{l+1} & = \mathrm{aggregate}\left(\left\{\frac{1}{K}
-         \sum_{k}^{K} w_k(u_{ij}), \forall j\in \mathcal{N}(i)\right\}\right)
-
-        w_k(u) & = \exp\left(-\frac{1}{2}(u-\mu_k)^T \Sigma_k^{-1} (u - \mu_k)\right)
-
-    Parameters
-    ----------
-    in_feats : int
-        Number of input features.
-    out_feats : int
-        Number of output features.
-    dim : int
-        Dimensionality of pseudo-coordinate.
-    n_kernels : int
-        Number of kernels :math:`K`.
-    aggregator_type : str
-        Aggregator type (``sum``, ``mean``, ``max``).
-    residual : bool
-        If True, use residual connection inside this layer.
-    bias : bool
-        If True, adds a learnable bias to the output. Default: ``True``.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 dim,
-                 n_kernels,
-                 aggregator_type,
-                 residual=True,
-                 bias=True):
-        super(GMMConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._dim = dim
-        self._n_kernels = n_kernels
-        if aggregator_type not in ('sum', 'mean', 'max'):
-            raise KeyError("Aggregator type {} not recognized.".format(aggregator_type))
-        # The builtin reduce function carries the same name as the type.
-        self._reducer = getattr(fn, aggregator_type)
-
-        # Per-kernel mean and (diagonal) inverse standard deviation.
-        self.mu = nn.Parameter(th.Tensor(n_kernels, dim))
-        self.inv_sigma = nn.Parameter(th.Tensor(n_kernels, dim))
-        # A single projection that yields the features of all kernels.
-        self.fc = nn.Linear(in_feats, n_kernels * out_feats, bias=False)
-        if not residual:
-            self.register_buffer('res_fc', None)
-        elif in_feats != out_feats:
-            self.res_fc = nn.Linear(in_feats, out_feats, bias=False)
-        else:
-            self.res_fc = Identity()
-
-        if bias:
-            self.bias = nn.Parameter(th.Tensor(out_feats))
-        else:
-            self.register_buffer('bias', None)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters.
-
-        Linear weights get Xavier normal values with the ReLU gain, ``mu`` is
-        drawn around 0 and ``inv_sigma`` around 1 (both with standard
-        deviation 0.1), and the bias, when present, is zeroed.
-        """
-        relu_gain = init.calculate_gain('relu')
-        init.xavier_normal_(self.fc.weight, gain=relu_gain)
-        # Only a real linear residual has a weight to initialize.
-        if isinstance(self.res_fc, nn.Linear):
-            init.xavier_normal_(self.res_fc.weight, gain=relu_gain)
-        init.normal_(self.mu.data, 0, 0.1)
-        init.normal_(self.inv_sigma.data, 1, 0.1)
-        if self.bias is not None:
-            init.zeros_(self.bias.data)
-
-    def forward(self, graph, feat, pseudo):
-        """Compute Gaussian Mixture Model Convolution layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`N`
-            is the number of nodes of the graph and :math:`D_{in}` is the
-            input feature size.
-        pseudo : torch.Tensor
-            The pseudo coordinate tensor of shape :math:`(E, D_{u})` where
-            :math:`E` is the number of edges of the graph and :math:`D_{u}`
-            is the dimensionality of pseudo coordinate.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is the output feature size.
-        """
-        graph = graph.local_var()
-        n_kernels, dim = self._n_kernels, self._dim
-        graph.ndata['h'] = self.fc(feat).view(-1, n_kernels, self._out_feats)
-        n_edges = graph.number_of_edges()
-        # Gaussian weight of every (edge, kernel) pair.
-        offset = pseudo.view(n_edges, 1, dim) - self.mu.view(1, n_kernels, dim)
-        exponent = -0.5 * (offset ** 2)
-        exponent = exponent * (self.inv_sigma.view(1, n_kernels, dim) ** 2)
-        graph.edata['w'] = th.exp(exponent.sum(dim=-1, keepdim=True)) # (E, K, 1)
-        graph.update_all(fn.u_mul_e('h', 'w', 'm'), self._reducer('m', 'h'))
-        # Collapse the kernel axis.
-        rst = graph.ndata['h'].sum(1)
-        # residual connection
-        if self.res_fc is not None:
-            rst = rst + self.res_fc(feat)
-        # bias
-        if self.bias is not None:
-            rst = rst + self.bias
-        return rst
-
-
-class GINConv(nn.Module):
-    r"""Graph Isomorphism Network layer from paper `How Powerful are Graph
-    Neural Networks? <https://arxiv.org/pdf/1810.00826.pdf>`__.
-
-    .. math::
-        h_i^{(l+1)} = f_\Theta \left((1 + \epsilon) h_i^{l} +
-        \mathrm{aggregate}\left(\left\{h_j^{l}, j\in\mathcal{N}(i)
-        \right\}\right)\right)
-
-    Parameters
-    ----------
-    apply_func : callable activation function/layer or None
-        If not None, apply this function to the updated node feature,
-        the :math:`f_\Theta` in the formula.
-    aggregator_type : str
-        Aggregator type to use (``sum``, ``max`` or ``mean``).
-    init_eps : float, optional
-        Initial :math:`\epsilon` value, default: ``0``.
-    learn_eps : bool, optional
-        If True, :math:`\epsilon` will be a learnable parameter.
-    """
-    def __init__(self,
-                 apply_func,
-                 aggregator_type,
-                 init_eps=0,
-                 learn_eps=False):
-        super(GINConv, self).__init__()
-        self.apply_func = apply_func
-        if aggregator_type not in ('sum', 'max', 'mean'):
-            raise KeyError('Aggregator type {} not recognized.'.format(aggregator_type))
-        # The builtin reduce function carries the same name as the type.
-        self._reducer = getattr(fn, aggregator_type)
-        # eps is either trained with the model or kept as a fixed buffer.
-        eps_value = th.FloatTensor([init_eps])
-        if learn_eps:
-            self.eps = th.nn.Parameter(eps_value)
-        else:
-            self.register_buffer('eps', eps_value)
-
-    def forward(self, graph, feat):
-        r"""Compute Graph Isomorphism Network layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D)` where :math:`D`
-            could be any positive integer, :math:`N` is the number
-            of nodes. If ``apply_func`` is not None, :math:`D` should
-            fit the input dimensionality requirement of ``apply_func``.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where
-            :math:`D_{out}` is the output dimensionality of ``apply_func``.
-            If ``apply_func`` is None, :math:`D_{out}` should be the same
-            as input dimensionality.
-        """
-        graph = graph.local_var()
-        graph.ndata['h'] = feat
-        graph.update_all(fn.copy_u('h', 'm'), self._reducer('m', 'neigh'))
-        # Weighted self term plus the aggregated neighborhood.
-        combined = (1 + self.eps) * feat + graph.ndata['neigh']
-        if self.apply_func is None:
-            return combined
-        return self.apply_func(combined)
-
-
-class ChebConv(nn.Module):
-    r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
-    Neural Networks on Graphs with Fast Localized Spectral Filtering
-    <https://arxiv.org/pdf/1606.09375.pdf>`__.
-
-    .. math::
-        h_i^{l+1} &= \sum_{k=0}^{K-1} W^{k, l}z_i^{k, l}
-
-        Z^{0, l} &= H^{l}
-
-        Z^{1, l} &= \hat{L} \cdot H^{l}
-
-        Z^{k, l} &= 2 \cdot \hat{L} \cdot Z^{k-1, l} - Z^{k-2, l}
-
-        \hat{L} &= 2\left(I - \hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2}\right)/\lambda_{max} - I
-
-    Parameters
-    ----------
-    in_feats: int
-        Number of input features.
-    out_feats: int
-        Number of output features.
-    k : int
-        Chebyshev filter size.
-    bias : bool, optional
-        If True, adds a learnable bias to the output. Default: ``True``.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 k,
-                 bias=True):
-        super(ChebConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        # one bias-free projection per Chebyshev order; a single shared bias
-        # is added at the end of forward()
-        self.fc = nn.ModuleList([
-            nn.Linear(in_feats, out_feats, bias=False) for _ in range(k)
-        ])
-        self._k = k
-        if bias:
-            self.bias = nn.Parameter(th.Tensor(out_feats))
-        else:
-            self.register_buffer('bias', None)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters.
-
-        The shared bias is zeroed and every linear weight gets Xavier normal
-        values with the ReLU gain.
-        """
-        if self.bias is not None:
-            init.zeros_(self.bias)
-        for module in self.fc.modules():
-            if isinstance(module, nn.Linear):
-                init.xavier_normal_(module.weight, init.calculate_gain('relu'))
-                if module.bias is not None:
-                    init.zeros_(module.bias)
-
-    def forward(self, graph, feat, lambda_max=None):
-        r"""Compute ChebNet layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph or BatchedDGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-        lambda_max : list or tensor or None, optional.
-            A list(tensor) with length :math:`B`, stores the largest eigenvalue
-            of the normalized laplacian of each individual graph in ``graph``,
-            where :math:`B` is the batch size of the input graph. Default: None.
-            If None, this method would compute the list by calling
-            ``dgl.laplacian_lambda_max``.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is size of output feature.
-        """
-        with graph.local_scope():
-            # deg^{-1/2} as an (N, 1) column; degrees are clamped to 1 so that
-            # nodes without incoming edges do not yield an infinite factor
-            norm = th.pow(
-                graph.in_degrees().float().clamp(min=1), -0.5).unsqueeze(-1).to(feat.device)
-            if lambda_max is None:
-                lambda_max = laplacian_lambda_max(graph)
-            if isinstance(lambda_max, list):
-                lambda_max = th.Tensor(lambda_max).to(feat.device)
-            # Bring lambda_max to shape (B, 1) so that it ends up as an (N, 1)
-            # column after broadcasting. The previous check (``dim() < 1``)
-            # never fired for a (B,) tensor, which is exactly what the list
-            # path above produces, leaving a 1-D tensor that does not
-            # broadcast correctly against (N, D) features.
-            if lambda_max.dim() == 0:
-                lambda_max = lambda_max.unsqueeze(0) # scalar to (1,)
-            if lambda_max.dim() == 1:
-                lambda_max = lambda_max.unsqueeze(-1) # (B,) to (B, 1)
-            # broadcast from (B, 1) to (N, 1)
-            lambda_max = broadcast_nodes(graph, lambda_max)
-            # T0(X)
-            Tx_0 = feat
-            rst = self.fc[0](Tx_0)
-            # T1(X)
-            if self._k > 1:
-                graph.ndata['h'] = Tx_0 * norm
-                graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
-                h = graph.ndata.pop('h') * norm
-                # Λ = 2 * (I - D ^ -1/2 A D ^ -1/2) / lambda_max - I
-                #   = - 2(D ^ -1/2 A D ^ -1/2) / lambda_max + (2 / lambda_max - 1) I
-                Tx_1 = -2. * h / lambda_max + Tx_0 * (2. / lambda_max - 1)
-                rst = rst + self.fc[1](Tx_1)
-            # Ti(x), i = 2...k
-            for i in range(2, self._k):
-                graph.ndata['h'] = Tx_1 * norm
-                graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
-                h = graph.ndata.pop('h') * norm
-                # Tx_k = 2 * Λ * Tx_(k-1) - Tx_(k-2)
-                #      = - 4(D ^ -1/2 A D ^ -1/2) / lambda_max Tx_(k-1) +
-                #        (4 / lambda_max - 2) Tx_(k-1) -
-                #        Tx_(k-2)
-                Tx_2 = -4. * h / lambda_max + Tx_1 * (4. / lambda_max - 2) - Tx_0
-                rst = rst + self.fc[i](Tx_2)
-                # slide the two-term recurrence window forward
-                Tx_1, Tx_0 = Tx_2, Tx_1
-            # add bias
-            if self.bias is not None:
-                rst = rst + self.bias
-            return rst
-
-
-class SGConv(nn.Module):
-    r"""Simplifying Graph Convolution layer from paper `Simplifying Graph
-    Convolutional Networks <https://arxiv.org/pdf/1902.07153.pdf>`__.
-
-    .. math::
-        H^{l+1} = (\hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2})^K H^{l} \Theta^{l}
-
-    Parameters
-    ----------
-    in_feats : int
-        Number of input features.
-    out_feats : int
-        Number of output features.
-    k : int
-        Number of hops :math:`K`. Defaults:``1``.
-    cached : bool
-        If True, the module keeps the propagated (and, if ``norm`` is given,
-        normalized) features computed at the first forward call, i.e. the
-        value that is fed into the linear layer, and reuses it in all later
-        calls. This parameter should only be set to ``True`` in Transductive
-        Learning setting.
-    bias : bool
-        If True, adds a learnable bias to the output. Default: ``True``.
-    norm : callable activation function/layer or None, optional
-        If not None, applies normalization to the updated node features.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 k=1,
-                 cached=False,
-                 bias=True,
-                 norm=None):
-        super(SGConv, self).__init__()
-        self.fc = nn.Linear(in_feats, out_feats, bias=bias)
-        self._k = k
-        self.norm = norm
-        # Cache switch and the slot the cached features live in.
-        self._cached = cached
-        self._cached_h = None
-
-    def forward(self, graph, feat):
-        r"""Compute Simplifying Graph Convolution layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is size of output feature.
-
-        Notes
-        -----
-        If ``cached`` is set to True, ``feat`` and ``graph`` should not change
-        during training, or you will get wrong results: once a cached value
-        exists, both arguments are ignored.
-        """
-        graph = graph.local_var()
-        if self._cached_h is not None:
-            # Propagation was done in an earlier call; only project.
-            return self.fc(self._cached_h)
-
-        # Symmetric normalizer deg^{-1/2}, with degrees clamped to 1.
-        degs = graph.in_degrees().float().clamp(min=1)
-        norm = th.pow(degs, -0.5)
-        norm[th.isinf(norm)] = 0
-        norm = norm.to(feat.device).unsqueeze(1)
-        # k rounds of normalized neighborhood summation.
-        for _ in range(self._k):
-            graph.ndata['h'] = feat * norm
-            graph.update_all(fn.copy_u('h', 'm'),
-                             fn.sum('m', 'h'))
-            feat = graph.ndata.pop('h') * norm
-
-        if self.norm is not None:
-            feat = self.norm(feat)
-
-        # Remember the propagated features for later calls.
-        if self._cached:
-            self._cached_h = feat
-        return self.fc(feat)
-
-
-class NNConv(nn.Module):
-    r"""Graph Convolution layer introduced in `Neural Message Passing
-    for Quantum Chemistry <https://arxiv.org/pdf/1704.01212.pdf>`__.
-
-    .. math::
-        h_{i}^{l+1} = h_{i}^{l} + \mathrm{aggregate}\left(\left\{
-        f_\Theta (e_{ij}) \cdot h_j^{l}, j\in \mathcal{N}(i) \right\}\right)
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    edge_func : callable activation function/layer
-        Maps each edge feature to a vector of shape
-        ``(in_feats * out_feats)`` as weight to compute
-        messages.
-        Also is the :math:`f_\Theta` in the formula.
-    aggregator_type : str
-        Aggregator type to use (``sum``, ``mean`` or ``max``).
-    residual : bool, optional
-        If True, use residual connection. Default: ``False``.
-    bias : bool, optional
-        If True, adds a learnable bias to the output. Default: ``True``.
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 edge_func,
-                 aggregator_type,
-                 residual=False,
-                 bias=True):
-        super(NNConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self.edge_nn = edge_func
-        if aggregator_type == 'sum':
-            self.reducer = fn.sum
-        elif aggregator_type == 'mean':
-            self.reducer = fn.mean
-        elif aggregator_type == 'max':
-            self.reducer = fn.max
-        else:
-            raise KeyError('Aggregator type {} not recognized: '.format(aggregator_type))
-        self._aggre_type = aggregator_type
-        if residual:
-            if in_feats != out_feats:
-                self.res_fc = nn.Linear(in_feats, out_feats, bias=False)
-            else:
-                self.res_fc = Identity()
-        else:
-            self.register_buffer('res_fc', None)
-        if bias:
-            self.bias = nn.Parameter(th.Tensor(out_feats))
-        else:
-            self.register_buffer('bias', None)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters."""
-        gain = init.calculate_gain('relu')
-        if self.bias is not None:
-            nn.init.zeros_(self.bias)
-        if isinstance(self.res_fc, nn.Linear):
-            nn.init.xavier_normal_(self.res_fc.weight, gain=gain)
-
-    def forward(self, graph, feat, efeat):
-        r"""Compute MPNN Graph Convolution layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`N`
-            is the number of nodes of the graph and :math:`D_{in}` is the
-            input feature size.
-        efeat : torch.Tensor
-            The edge feature of shape :math:`(N, *)`, should fit the input
-            shape requirement of ``edge_nn``.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is the output feature size.
-        """
-        graph = graph.local_var()
-        # (n, d_in, 1)
-        graph.ndata['h'] = feat.unsqueeze(-1)
-        # (n, d_in, d_out)
-        graph.edata['w'] = self.edge_nn(efeat).view(-1, self._in_feats, self._out_feats)
-        # (n, d_in, d_out)
-        graph.update_all(fn.u_mul_e('h', 'w', 'm'), self.reducer('m', 'neigh'))
-        rst = graph.ndata.pop('neigh').sum(dim=1) # (n, d_out)
-        # residual connection
-        if self.res_fc is not None:
-            rst = rst + self.res_fc(feat)
-        # bias
-        if self.bias is not None:
-            rst = rst + self.bias
-        return rst
-
-
-class APPNPConv(nn.Module):
-    r"""Approximate Personalized Propagation of Neural Predictions
-    layer from paper `Predict then Propagate: Graph Neural Networks
-    meet Personalized PageRank <https://arxiv.org/pdf/1810.05997.pdf>`__.
-
-    .. math::
-        H^{0} & = X
-
-        H^{t+1} & = (1-\alpha)\left(\hat{D}^{-1/2}
-        \hat{A} \hat{D}^{-1/2} H^{t} + \alpha H^{0}\right)
-
-    Parameters
-    ----------
-    k : int
-        Number of iterations :math:`K`.
-    alpha : float
-        The teleport probability :math:`\alpha`.
-    edge_drop : float, optional
-        Dropout rate on edges that controls the
-        messages received by each node. Default: ``0``.
-    """
-    def __init__(self,
-                 k,
-                 alpha,
-                 edge_drop=0.):
-        super(APPNPConv, self).__init__()
-        self._k = k
-        self._alpha = alpha
-        self.edge_drop = nn.Dropout(edge_drop) if edge_drop > 0 else Identity()
-
-    def forward(self, graph, feat):
-        r"""Compute APPNP layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, *)` :math:`N` is the
-            number of nodes, and :math:`*` could be of any shape.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, *)` where :math:`*`
-            should be the same as input shape.
-        """
-        graph = graph.local_var()
-        norm = th.pow(graph.in_degrees().float().clamp(min=1), -0.5)
-        norm = norm.unsqueeze(-1).to(feat.device)
-        feat_0 = feat
-        for _ in range(self._k):
-            # normalization by src
-            feat = feat * norm
-            graph.ndata['h'] = feat
-            graph.edata['w'] = self.edge_drop(
-                th.ones(graph.number_of_edges(), 1).to(feat.device))
-            graph.update_all(fn.u_mul_e('h', 'w', 'm'),
-                             fn.sum('m', 'h'))
-            feat = graph.ndata.pop('h')
-            # normalization by dst
-            feat = feat * norm
-            feat = (1 - self._alpha) * feat + self._alpha * feat_0
-        return feat
-
-
-class AGNNConv(nn.Module):
-    r"""Attention-based Graph Neural Network layer from paper `Attention-based
-    Graph Neural Network for Semi-Supervised Learning
-    <https://arxiv.org/abs/1803.03735>`__.
-
-    .. math::
-        H^{l+1} = P H^{l}
-
-    where :math:`P` is computed as:
-
-    .. math::
-        P_{ij} = \mathrm{softmax}_i ( \beta \cdot \cos(h_i^l, h_j^l))
-
-    Parameters
-    ----------
-    init_beta : float, optional
-        The :math:`\beta` in the formula.
-    learn_beta : bool, optional
-        If True, :math:`\beta` will be learnable parameter.
-    """
-    def __init__(self,
-                 init_beta=1.,
-                 learn_beta=True):
-        super(AGNNConv, self).__init__()
-        if learn_beta:
-            self.beta = nn.Parameter(th.Tensor([init_beta]))
-        else:
-            self.register_buffer('beta', th.Tensor([init_beta]))
-
-    def forward(self, graph, feat):
-        r"""Compute AGNN layer.
-
-        Parameters
-        ----------
-        graph : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, *)` :math:`N` is the
-            number of nodes, and :math:`*` could be of any shape.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, *)` where :math:`*`
-            should be the same as input shape.
-        """
-        graph = graph.local_var()
-        graph.ndata['h'] = feat
-        graph.ndata['norm_h'] = F.normalize(feat, p=2, dim=-1)
-        # compute cosine distance
-        graph.apply_edges(fn.u_mul_v('norm_h', 'norm_h', 'cos'))
-        cos = graph.edata.pop('cos').sum(-1)
-        e = self.beta * cos
-        graph.edata['p'] = edge_softmax(graph, e)
-        graph.update_all(fn.u_mul_e('h', 'p', 'm'), fn.sum('m', 'h'))
-        return graph.ndata.pop('h')
-
-
-class DenseGraphConv(nn.Module):
-    """Graph Convolutional Network layer where the graph structure
-    is given by an adjacency matrix.
-    We recommend user to use this module when inducing graph convolution
-    on dense graphs / k-hop graphs.
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    norm : bool
-        If True, the normalizer :math:`c_{ij}` is applied. Default: ``True``.
-    bias : bool
-        If True, adds a learnable bias to the output. Default: ``True``.
-    activation : callable activation function/layer or None, optional
-        If not None, applies an activation function to the updated node features.
-        Default: ``None``.
-
-    See also
-    --------
-    GraphConv
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 norm=True,
-                 bias=True,
-                 activation=None):
-        super(DenseGraphConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._norm = norm
-        self.weight = nn.Parameter(th.Tensor(in_feats, out_feats))
-        if bias:
-            self.bias = nn.Parameter(th.Tensor(out_feats))
-        else:
-            self.register_buffer('bias', None)
-
-        self.reset_parameters()
-        self._activation = activation
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters."""
-        init.xavier_uniform_(self.weight)
-        if self.bias is not None:
-            init.zeros_(self.bias)
-
-    def forward(self, adj, feat):
-        r"""Compute (Dense) Graph Convolution layer.
-
-        Parameters
-        ----------
-        adj : torch.Tensor
-            The adjacency matrix of the graph to apply Graph Convolution on,
-            should be of shape :math:`(N, N)`, where a row represents the destination
-            and a column represents the source.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is size of output feature.
-        """
-        adj = adj.float().to(feat.device)
-        if self._norm:
-            in_degrees = adj.sum(dim=1)
-            norm = th.pow(in_degrees, -0.5)
-            shp = norm.shape + (1,) * (feat.dim() - 1)
-            norm = th.reshape(norm, shp).to(feat.device)
-            feat = feat * norm
-
-        if self._in_feats > self._out_feats:
-            # mult W first to reduce the feature size for aggregation.
-            feat = th.matmul(feat, self.weight)
-            rst = adj @ feat
-        else:
-            # aggregate first then mult W
-            rst = adj @ feat
-            rst = th.matmul(rst, self.weight)
-
-        if self._norm:
-            rst = rst * norm
-
-        if self.bias is not None:
-            rst = rst + self.bias
-
-        if self._activation is not None:
-            rst = self._activation(rst)
-
-        return rst
-
-
-class DenseSAGEConv(nn.Module):
-    """GraphSAGE layer where the graph structure is given by an
-    adjacency matrix.
-    We recommend to use this module when inducing GraphSAGE operations
-    on dense graphs / k-hop graphs.
-
-    Note that we only support gcn aggregator in DenseSAGEConv.
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    feat_drop : float, optional
-        Dropout rate on features. Default: 0.
-    bias : bool
-        If True, adds a learnable bias to the output. Default: ``True``.
-    norm : callable activation function/layer or None, optional
-        If not None, applies normalization to the updated node features.
-    activation : callable activation function/layer or None, optional
-        If not None, applies an activation function to the updated node features.
-        Default: ``None``.
-
-    See also
-    --------
-    SAGEConv
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 feat_drop=0.,
-                 bias=True,
-                 norm=None,
-                 activation=None):
-        super(DenseSAGEConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._norm = norm
-        self.feat_drop = nn.Dropout(feat_drop)
-        self.activation = activation
-        self.fc = nn.Linear(in_feats, out_feats, bias=bias)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters."""
-        gain = nn.init.calculate_gain('relu')
-        nn.init.xavier_uniform_(self.fc.weight, gain=gain)
-
-    def forward(self, adj, feat):
-        r"""Compute (Dense) Graph SAGE layer.
-
-        Parameters
-        ----------
-        adj : torch.Tensor
-            The adjacency matrix of the graph to apply Graph Convolution on,
-            should be of shape :math:`(N, N)`, where a row represents the destination
-            and a column represents the source.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is size of output feature.
-        """
-        adj = adj.float().to(feat.device)
-        feat = self.feat_drop(feat)
-        in_degrees = adj.sum(dim=1).unsqueeze(-1)
-        h_neigh = (adj @ feat + feat) / (in_degrees + 1)
-        rst = self.fc(h_neigh)
-        # activation
-        if self.activation is not None:
-            rst = self.activation(rst)
-        # normalization
-        if self._norm is not None:
-            rst = self._norm(rst)
-
-        return rst
-
-
-class DenseChebConv(nn.Module):
-    r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
-    Neural Networks on Graphs with Fast Localized Spectral Filtering
-    <https://arxiv.org/pdf/1606.09375.pdf>`__.
-
-    We recommend to use this module when inducing ChebConv operations on dense
-    graphs / k-hop graphs.
-
-    Parameters
-    ----------
-    in_feats: int
-        Number of input features.
-    out_feats: int
-        Number of output features.
-    k : int
-        Chebyshev filter size.
-    bias : bool, optional
-        If True, adds a learnable bias to the output. Default: ``True``.
-
-    See also
-    --------
-    ChebConv
-    """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 k,
-                 bias=True):
-        super(DenseChebConv, self).__init__()
-        self._in_feats = in_feats
-        self._out_feats = out_feats
-        self._k = k
-        self.W = nn.Parameter(th.Tensor(k, in_feats, out_feats))
-        if bias:
-            self.bias = nn.Parameter(th.Tensor(out_feats))
-        else:
-            self.register_buffer('bias', None)
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        """Reinitialize learnable parameters."""
-        if self.bias is not None:
-            init.zeros_(self.bias)
-        for i in range(self._k):
-            init.xavier_normal_(self.W[i], init.calculate_gain('relu'))
-
-    def forward(self, adj, feat, lambda_max=None):
-        r"""Compute (Dense) Chebyshev Spectral Graph Convolution layer.
-
-        Parameters
-        ----------
-        adj : torch.Tensor
-            The adjacency matrix of the graph to apply Graph Convolution on,
-            should be of shape :math:`(N, N)`, where a row represents the destination
-            and a column represents the source.
-        feat : torch.Tensor
-            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
-            is size of input feature, :math:`N` is the number of nodes.
-        lambda_max : float or None, optional
-            A float value indicates the largest eigenvalue of given graph.
-            Default: None.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
-            is size of output feature.
-        """
-        A = adj.to(feat)
-        num_nodes = A.shape[0]
-
-        in_degree = 1 / A.sum(dim=1).clamp(min=1).sqrt()
-        D_invsqrt = th.diag(in_degree)
-        I = th.eye(num_nodes).to(A)
-        L = I - D_invsqrt @ A @ D_invsqrt
-
-        if lambda_max is None:
-            lambda_ = th.eig(L)[0][:, 0]
-            lambda_max = lambda_.max()
-
-        L_hat = 2 * L / lambda_max - I
-        Z = [th.eye(num_nodes).to(A)]
-        for i in range(1, self._k):
-            if i == 1:
-                Z.append(L_hat)
-            else:
-                Z.append(2 * L_hat @ Z[-1] - Z[-2])
-
-        Zs = th.stack(Z, 0)  # (k, n, n)
-
-        Zh = (Zs @ feat.unsqueeze(0) @ self.W)
-        Zh = Zh.sum(0)
-
-        if self.bias is not None:
-            Zh = Zh + self.bias
-        return Zh
--- a/python/dgl/nn/pytorch/conv/__init__.py
+++ b/python/dgl/nn/pytorch/conv/__init__.py
+"""Torch modules for graph convolutions."""
+# pylint: disable= no-member, arguments-differ, invalid-name
+
+from .agnnconv import AGNNConv
+from .appnpconv import APPNPConv
+from .chebconv import ChebConv
+from .edgeconv import EdgeConv
+from .gatconv import GATConv
+from .ginconv import GINConv
+from .gmmconv import GMMConv
+from .graphconv import GraphConv
+from .nnconv import NNConv
+from .relgraphconv import RelGraphConv
+from .sageconv import SAGEConv
+from .sgconv import SGConv
+from .tagconv import TAGConv
+from .gatedgraphconv import GatedGraphConv
+from .densechebconv import DenseChebConv
+from .densegraphconv import DenseGraphConv
+from .densesageconv import DenseSAGEConv
+
+__all__ = ['GraphConv', 'GATConv', 'TAGConv', 'RelGraphConv', 'SAGEConv',
+           'SGConv', 'APPNPConv', 'GINConv', 'GatedGraphConv', 'GMMConv',
+           'ChebConv', 'AGNNConv', 'NNConv', 'DenseGraphConv', 'DenseSAGEConv',
+           'DenseChebConv', 'EdgeConv']
--- a/python/dgl/nn/pytorch/conv/agnnconv.py
+++ b/python/dgl/nn/pytorch/conv/agnnconv.py
+"""Torch Module for Attention-based Graph Neural Network layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+from torch.nn import functional as F
+
+from .... import function as fn
+from ..softmax import edge_softmax
+
+
+class AGNNConv(nn.Module):
+    r"""Attention-based Graph Neural Network (AGNN) layer, as described in
+    `Attention-based Graph Neural Network for Semi-Supervised Learning
+    <https://arxiv.org/abs/1803.03735>`__.
+
+    Node features are propagated through an attention matrix :math:`P`:
+
+    .. math::
+        H^{l+1} = P H^{l}
+
+    whose entries are a softmax over scaled cosine similarities:
+
+    .. math::
+        P_{ij} = \mathrm{softmax}_i ( \beta \cdot \cos(h_i^l, h_j^l))
+
+    Parameters
+    ----------
+    init_beta : float, optional
+        Initial value of :math:`\beta` in the formula. Default: ``1.``.
+    learn_beta : bool, optional
+        If True, :math:`\beta` is registered as a learnable parameter;
+        otherwise it is stored as a buffer. Default: ``True``.
+    """
+    def __init__(self,
+                 init_beta=1.,
+                 learn_beta=True):
+        super(AGNNConv, self).__init__()
+        beta = th.Tensor([init_beta])
+        if not learn_beta:
+            # fixed scale: kept on the module as a buffer, not a parameter
+            self.register_buffer('beta', beta)
+        else:
+            self.beta = nn.Parameter(beta)
+
+    def forward(self, graph, feat):
+        r"""Apply the AGNN propagation step to node features.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : torch.Tensor
+            Input node features of shape :math:`(N, *)`, where :math:`N` is
+            the number of nodes and :math:`*` could be of any shape.
+
+        Returns
+        -------
+        torch.Tensor
+            Output node features of shape :math:`(N, *)`, where :math:`*`
+            should be the same as the input shape.
+        """
+        g = graph.local_var()
+        g.ndata['h'] = feat
+        # L2-normalize along the last axis so that the summed elementwise
+        # product of two endpoint features is their cosine similarity
+        g.ndata['norm_h'] = F.normalize(feat, p=2, dim=-1)
+        g.apply_edges(fn.u_mul_v('norm_h', 'norm_h', 'cos'))
+        cosine = g.edata.pop('cos').sum(-1)
+        # scale by beta and let edge_softmax turn the scores into edge weights
+        g.edata['p'] = edge_softmax(g, self.beta * cosine)
+        # weighted sum of source features over each node's incoming messages
+        g.update_all(fn.u_mul_e('h', 'p', 'm'), fn.sum('m', 'h'))
+        return g.ndata.pop('h')
--- a/python/dgl/nn/pytorch/conv/appnpconv.py
+++ b/python/dgl/nn/pytorch/conv/appnpconv.py
+"""Torch Module for APPNPConv"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+
+from .... import function as fn
+from ..utils import Identity
+
+
+class APPNPConv(nn.Module):
+    r"""Approximate Personalized Propagation of Neural Predictions
+    layer from paper `Predict then Propagate: Graph Neural Networks
+    meet Personalized PageRank <https://arxiv.org/pdf/1810.05997.pdf>`__.
+
+    .. math::
+        H^{0} & = X
+
+        H^{t+1} & = (1-\alpha)\left(\hat{D}^{-1/2}
+        \hat{A} \hat{D}^{-1/2} H^{t}\right) + \alpha H^{0}
+
+    Parameters
+    ----------
+    k : int
+        Number of iterations :math:`K`.
+    alpha : float
+        The teleport probability :math:`\alpha`.
+    edge_drop : float, optional
+        Dropout rate on edges that controls the
+        messages received by each node. Default: ``0``.
+    """
+    def __init__(self,
+                 k,
+                 alpha,
+                 edge_drop=0.):
+        super(APPNPConv, self).__init__()
+        self._k = k
+        self._alpha = alpha
+        # no-op module when edge dropout is disabled
+        self.edge_drop = nn.Dropout(edge_drop) if edge_drop > 0 else Identity()
+
+    def forward(self, graph, feat):
+        r"""Compute APPNP layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, *)` :math:`N` is the
+            number of nodes, and :math:`*` could be of any shape.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, *)` where :math:`*`
+            should be the same as input shape.
+        """
+        graph = graph.local_var()
+        # D^-1/2 from in-degrees; degrees are clamped to 1 so isolated nodes
+        # do not produce an infinite normalizer
+        norm = th.pow(graph.in_degrees().float().clamp(min=1), -0.5)
+        # append one singleton axis per feature axis so the per-node scale
+        # broadcasts over (N, *) features of any rank, not only (N, D)
+        shp = norm.shape + (1,) * (feat.dim() - 1)
+        norm = th.reshape(norm, shp).to(feat.device)
+        # unit edge weights, created once directly on the feature device;
+        # dropout is still applied inside the loop so every propagation
+        # step draws its own edge mask
+        ones = th.ones(graph.number_of_edges(), 1, device=feat.device)
+        feat_0 = feat
+        for _ in range(self._k):
+            # normalization by src
+            feat = feat * norm
+            graph.ndata['h'] = feat
+            graph.edata['w'] = self.edge_drop(ones)
+            graph.update_all(fn.u_mul_e('h', 'w', 'm'),
+                             fn.sum('m', 'h'))
+            feat = graph.ndata.pop('h')
+            # normalization by dst
+            feat = feat * norm
+            # teleport back to the initial features with probability alpha
+            feat = (1 - self._alpha) * feat + self._alpha * feat_0
+        return feat
--- a/python/dgl/nn/pytorch/conv/chebconv.py
+++ b/python/dgl/nn/pytorch/conv/chebconv.py
+"""Torch Module for Chebyshev Spectral Graph Convolution layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+from torch.nn import init
+
+from .... import laplacian_lambda_max, broadcast_nodes, function as fn
+
+
+class ChebConv(nn.Module):
+    r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
+    Neural Networks on Graphs with Fast Localized Spectral Filtering
+    <https://arxiv.org/pdf/1606.09375.pdf>`__.
+
+    .. math::
+        h_i^{l+1} &= \sum_{k=0}^{K-1} W^{k, l}z_i^{k, l}
+
+        Z^{0, l} &= H^{l}
+
+        Z^{1, l} &= \hat{L} \cdot H^{l}
+
+        Z^{k, l} &= 2 \cdot \hat{L} \cdot Z^{k-1, l} - Z^{k-2, l}
+
+        \hat{L} &= 2\left(I - \hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2}\right)/\lambda_{max} - I
+
+    Parameters
+    ----------
+    in_feats: int
+        Number of input features.
+    out_feats: int
+        Number of output features.
+    k : int
+        Chebyshev filter size.
+    bias : bool, optional
+        If True, adds a learnable bias to the output. Default: ``True``.
+    """
+
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 k,
+                 bias=True):
+        super(ChebConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        # one bias-free linear map per Chebyshev order; a single shared bias
+        # is added after the orders are summed
+        self.fc = nn.ModuleList([
+            nn.Linear(in_feats, out_feats, bias=False) for _ in range(k)
+        ])
+        self._k = k
+        if bias:
+            self.bias = nn.Parameter(th.Tensor(out_feats))
+        else:
+            self.register_buffer('bias', None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reinitialize learnable parameters."""
+        if self.bias is not None:
+            init.zeros_(self.bias)
+        for module in self.fc.modules():
+            if isinstance(module, nn.Linear):
+                init.xavier_normal_(module.weight, init.calculate_gain('relu'))
+                if module.bias is not None:
+                    init.zeros_(module.bias)
+
+    def forward(self, graph, feat, lambda_max=None):
+        r"""Compute ChebNet layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph or BatchedDGLGraph
+            The graph.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+        lambda_max : list or tensor or None, optional.
+            A list(tensor) with length :math:`B`, stores the largest eigenvalue
+            of the normalized laplacian of each individual graph in ``graph``,
+            where :math:`B` is the batch size of the input graph. Default: None.
+            If None, this method would compute the list by calling
+            ``dgl.laplacian_lambda_max``. A tensor of shape :math:`(B,)` or
+            :math:`(B, 1)` (or a 0-dim scalar tensor) is accepted.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is size of output feature.
+        """
+        with graph.local_scope():
+            norm = th.pow(
+                graph.in_degrees().float().clamp(min=1), -0.5).unsqueeze(-1).to(feat.device)
+            if lambda_max is None:
+                lambda_max = laplacian_lambda_max(graph)
+            if isinstance(lambda_max, list):
+                lambda_max = th.Tensor(lambda_max).to(feat.device)
+            # Both a 0-dim scalar and a (B,) vector have to become (B, 1):
+            # after broadcasting to nodes the value must be (N, 1) so that it
+            # divides the (N, D) features row by row. Checking ``dim() < 1``
+            # alone would leave the usual (B,) vector one-dimensional.
+            while lambda_max.dim() < 2:
+                lambda_max = lambda_max.unsqueeze(-1)
+            # broadcast from (B, 1) to (N, 1)
+            lambda_max = broadcast_nodes(graph, lambda_max)
+            # T0(X)
+            Tx_0 = feat
+            rst = self.fc[0](Tx_0)
+            # T1(X)
+            if self._k > 1:
+                graph.ndata['h'] = Tx_0 * norm
+                graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
+                h = graph.ndata.pop('h') * norm
+                # Λ = 2 * (I - D ^ -1/2 A D ^ -1/2) / lambda_max - I
+                #   = - 2(D ^ -1/2 A D ^ -1/2) / lambda_max + (2 / lambda_max - 1) I
+                Tx_1 = -2. * h / lambda_max + Tx_0 * (2. / lambda_max - 1)
+                rst = rst + self.fc[1](Tx_1)
+            # Ti(x), i = 2...k
+            for i in range(2, self._k):
+                graph.ndata['h'] = Tx_1 * norm
+                graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
+                h = graph.ndata.pop('h') * norm
+                # Tx_k = 2 * Λ * Tx_(k-1) - Tx_(k-2)
+                #      = - 4(D ^ -1/2 A D ^ -1/2) / lambda_max Tx_(k-1) +
+                #        (4 / lambda_max - 2) Tx_(k-1) -
+                #        Tx_(k-2)
+                Tx_2 = -4. * h / lambda_max + Tx_1 * (4. / lambda_max - 2) - Tx_0
+                rst = rst + self.fc[i](Tx_2)
+                # slide the two-term recurrence window forward
+                Tx_1, Tx_0 = Tx_2, Tx_1
+            # add bias
+            if self.bias is not None:
+                rst = rst + self.bias
+            return rst
--- a/python/dgl/nn/pytorch/conv/densechebconv.py
+++ b/python/dgl/nn/pytorch/conv/densechebconv.py
+"""Torch Module for DenseChebConv"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+from torch.nn import init
+
+
+class DenseChebConv(nn.Module):
+    r"""Dense-adjacency variant of the Chebyshev Spectral Graph Convolution
+    layer from `Convolutional Neural Networks on Graphs with Fast Localized
+    Spectral Filtering <https://arxiv.org/pdf/1606.09375.pdf>`__.
+
+    The graph is supplied as an adjacency matrix rather than a graph object.
+    We recommend to use this module when inducing ChebConv operations on dense
+    graphs / k-hop graphs.
+
+    Parameters
+    ----------
+    in_feats: int
+        Number of input features.
+    out_feats: int
+        Number of output features.
+    k : int
+        Chebyshev filter size.
+    bias : bool, optional
+        If True, adds a learnable bias to the output. Default: ``True``.
+
+    See also
+    --------
+    ChebConv
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 k,
+                 bias=True):
+        super(DenseChebConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        self._k = k
+        # one (in_feats, out_feats) weight matrix per Chebyshev order
+        self.W = nn.Parameter(th.Tensor(k, in_feats, out_feats))
+        if not bias:
+            self.register_buffer('bias', None)
+        else:
+            self.bias = nn.Parameter(th.Tensor(out_feats))
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reinitialize learnable parameters."""
+        if self.bias is not None:
+            init.zeros_(self.bias)
+        gain = init.calculate_gain('relu')
+        # initialize each order's weight slice separately
+        for order in range(self._k):
+            init.xavier_normal_(self.W[order], gain)
+
+    def forward(self, adj, feat, lambda_max=None):
+        r"""Compute (Dense) Chebyshev Spectral Graph Convolution layer.
+
+        Parameters
+        ----------
+        adj : torch.Tensor
+            The adjacency matrix of the graph to apply Graph Convolution on,
+            should be of shape :math:`(N, N)`, where a row represents the destination
+            and a column represents the source.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+        lambda_max : float or None, optional
+            The largest eigenvalue of the given graph's Laplacian. If None,
+            it is computed from the Laplacian built here. Default: None.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is size of output feature.
+        """
+        # match the adjacency's dtype/device to the features
+        A = adj.to(feat)
+        n = A.shape[0]
+        I = th.eye(n).to(A)
+
+        # L = I - D^-1/2 A D^-1/2, with row sums (clamped to >= 1) as degrees
+        inv_sqrt_deg = 1 / A.sum(dim=1).clamp(min=1).sqrt()
+        D_invsqrt = th.diag(inv_sqrt_deg)
+        L = I - D_invsqrt @ A @ D_invsqrt
+
+        if lambda_max is None:
+            # first column of th.eig's eigenvalue output (the real parts)
+            lambda_max = th.eig(L)[0][:, 0].max()
+
+        L_hat = 2 * L / lambda_max - I
+        # Chebyshev basis: Z_0 = I, Z_1 = L_hat, Z_i = 2 L_hat Z_{i-1} - Z_{i-2}
+        Z = [th.eye(n).to(A)]
+        if self._k > 1:
+            Z.append(L_hat)
+        for _ in range(2, self._k):
+            Z.append(2 * L_hat @ Z[-1] - Z[-2])
+
+        # (k, n, n) @ (1, n, d_in) @ (k, d_in, d_out) -> (k, n, d_out),
+        # then sum the contributions of all k orders
+        Zh = (th.stack(Z, 0) @ feat.unsqueeze(0) @ self.W).sum(0)
+
+        if self.bias is None:
+            return Zh
+        return Zh + self.bias
--- a/python/dgl/nn/pytorch/conv/densegraphconv.py
+++ b/python/dgl/nn/pytorch/conv/densegraphconv.py
+"""Torch Module for DenseGraphConv"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+from torch.nn import init
+
+
+class DenseGraphConv(nn.Module):
+    """Graph Convolutional Network layer where the graph structure
+    is given by an adjacency matrix.
+    We recommend user to use this module when inducing graph convolution
+    on dense graphs / k-hop graphs.
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    norm : bool
+        If True, the normalizer :math:`c_{ij}` is applied. Default: ``True``.
+    bias : bool
+        If True, adds a learnable bias to the output. Default: ``True``.
+    activation : callable activation function/layer or None, optional
+        If not None, applies an activation function to the updated node features.
+        Default: ``None``.
+
+    See also
+    --------
+    GraphConv
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 norm=True,
+                 bias=True,
+                 activation=None):
+        super(DenseGraphConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        self._norm = norm
+        self.weight = nn.Parameter(th.Tensor(in_feats, out_feats))
+        if bias:
+            self.bias = nn.Parameter(th.Tensor(out_feats))
+        else:
+            self.register_buffer('bias', None)
+
+        self.reset_parameters()
+        self._activation = activation
+
+    def reset_parameters(self):
+        """Reinitialize learnable parameters."""
+        init.xavier_uniform_(self.weight)
+        if self.bias is not None:
+            init.zeros_(self.bias)
+
+    def forward(self, adj, feat):
+        r"""Compute (Dense) Graph Convolution layer.
+
+        Parameters
+        ----------
+        adj : torch.Tensor
+            The adjacency matrix of the graph to apply Graph Convolution on,
+            should be of shape :math:`(N, N)`, where a row represents the destination
+            and a column represents the source.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is size of output feature.
+        """
+        adj = adj.float().to(feat.device)
+        if self._norm:
+            # Clamp the row sums to at least 1, as GraphConv does with node
+            # in-degrees: a row without incoming edges would otherwise give
+            # 0 ** -0.5 = inf and turn the output into NaN.
+            in_degrees = adj.sum(dim=1).clamp(min=1)
+            norm = th.pow(in_degrees, -0.5)
+            # Reshape to (N, 1, ..., 1) so it broadcasts over feature dims.
+            shp = norm.shape + (1,) * (feat.dim() - 1)
+            norm = th.reshape(norm, shp).to(feat.device)
+            feat = feat * norm
+
+        if self._in_feats > self._out_feats:
+            # mult W first to reduce the feature size for aggregation.
+            feat = th.matmul(feat, self.weight)
+            rst = adj @ feat
+        else:
+            # aggregate first then mult W
+            rst = adj @ feat
+            rst = th.matmul(rst, self.weight)
+
+        if self._norm:
+            rst = rst * norm
+
+        if self.bias is not None:
+            rst = rst + self.bias
+
+        if self._activation is not None:
+            rst = self._activation(rst)
+
+        return rst
--- a/python/dgl/nn/pytorch/conv/densesageconv.py
+++ b/python/dgl/nn/pytorch/conv/densesageconv.py
+"""Torch Module for DenseSAGEConv"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+from torch import nn
+
+
+class DenseSAGEConv(nn.Module):
+    """GraphSAGE layer operating on a dense adjacency matrix.
+
+    Intended for GraphSAGE-style aggregation on dense graphs / k-hop graphs.
+    Only the ``gcn`` aggregator is implemented: every node averages its own
+    feature together with the features of its in-neighbors and the result is
+    passed through a linear layer.
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    feat_drop : float, optional
+        Dropout rate on features. Default: 0.
+    bias : bool
+        If True, adds a learnable bias to the output. Default: ``True``.
+    norm : callable activation function/layer or None, optional
+        If not None, applies normalization to the updated node features.
+    activation : callable activation function/layer or None, optional
+        If not None, applies an activation function to the updated node features.
+        Default: ``None``.
+
+    See also
+    --------
+    SAGEConv
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 feat_drop=0.,
+                 bias=True,
+                 norm=None,
+                 activation=None):
+        super(DenseSAGEConv, self).__init__()
+        self._in_feats, self._out_feats = in_feats, out_feats
+        self._norm = norm
+        self.feat_drop = nn.Dropout(feat_drop)
+        self.activation = activation
+        self.fc = nn.Linear(in_feats, out_feats, bias=bias)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reinitialize the weight of the linear layer (Xavier uniform, ReLU gain)."""
+        nn.init.xavier_uniform_(self.fc.weight,
+                                gain=nn.init.calculate_gain('relu'))
+
+    def forward(self, adj, feat):
+        r"""Compute (Dense) Graph SAGE layer.
+
+        Parameters
+        ----------
+        adj : torch.Tensor
+            The adjacency matrix of the graph to apply Graph Convolution on,
+            should be of shape :math:`(N, N)`, where a row represents the destination
+            and a column represents the source.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is size of output feature.
+        """
+        adj = adj.float().to(feat.device)
+        dropped = self.feat_drop(feat)
+        # Row sums of the adjacency, kept as a column for broadcasting.
+        degree = adj.sum(dim=1).unsqueeze(-1)
+        # Mean over the node itself plus its in-neighbors.
+        out = self.fc((adj @ dropped + dropped) / (degree + 1))
+        # The activation runs before the optional normalization.
+        if self.activation is not None:
+            out = self.activation(out)
+        if self._norm is not None:
+            out = self._norm(out)
+        return out
--- a/python/dgl/nn/pytorch/conv/edgeconv.py
+++ b/python/dgl/nn/pytorch/conv/edgeconv.py
+"""Torch Module for EdgeConv Layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+from torch import nn
+
+from .... import function as fn
+
+
+class EdgeConv(nn.Module):
+    r"""EdgeConv layer.
+
+    Introduced in "`Dynamic Graph CNN for Learning on Point Clouds
+    <https://arxiv.org/pdf/1801.07829>`__".  For an edge from source node
+    :math:`j` to destination node :math:`i`, this implementation computes
+    the message
+
+    .. math::
+
+       e_{ji} = \Theta \cdot (x_i^{(l)} - x_j^{(l)}) + \Phi \cdot x_j^{(l)}
+
+    and updates every node with the element-wise maximum of its incoming
+    messages:
+
+    .. math::
+
+       x_i^{(l+1)} = \max_{j \in \mathcal{N}(i)} e_{ji}
+
+    where :math:`\mathcal{N}(i)` is the set of sources of the edges entering
+    :math:`i`.  No nonlinearity is applied inside this layer.
+
+    Parameters
+    ----------
+    in_feat : int
+        Input feature size.
+    out_feat : int
+        Output feature size.
+    batch_norm : bool
+        Whether to include batch normalization on messages.
+    """
+    def __init__(self, in_feat, out_feat, batch_norm=False):
+        super(EdgeConv, self).__init__()
+        self.batch_norm = batch_norm
+
+        self.theta = nn.Linear(in_feat, out_feat)
+        self.phi = nn.Linear(in_feat, out_feat)
+
+        # The batch-norm submodule only exists when it was requested.
+        if batch_norm:
+            self.bn = nn.BatchNorm1d(out_feat)
+
+    def message(self, edges):
+        """Compute the per-edge message from the endpoint features ``'x'``.
+        """
+        src_x = edges.src['x']
+        dst_x = edges.dst['x']
+        return {'e': self.theta(dst_x - src_x) + self.phi(src_x)}
+
+    def forward(self, g, h):
+        """Forward computation
+
+        Parameters
+        ----------
+        g : DGLGraph
+            The graph.
+        h : Tensor
+            :math:`(N, D)` where :math:`N` is the number of nodes and
+            :math:`D` is the number of feature dimensions.
+
+        Returns
+        -------
+        torch.Tensor
+            New node features.
+        """
+        with g.local_scope():
+            g.ndata['x'] = h
+            if self.batch_norm:
+                # Materialize the messages on the edges first so that batch
+                # norm can be applied to all of them at once.
+                g.apply_edges(self.message)
+                # The official implementation uses a per-edge batch norm
+                # inside EdgeConv; a single global batch norm is used here
+                # instead, for these reasons:
+                #
+                # (1) Batch norm would not work when the point clouds in a
+                #     batch have different numbers of points.
+                #
+                # (2) Even with equal point counts, point order carries no
+                #     meaning: points of the same (type of) object may be
+                #     shuffled (the official implementation *does* shuffle
+                #     the points of each example every epoch), e.g. the
+                #     first point of an airplane cloud is not always at its
+                #     nose.  Per-position statistics are therefore less
+                #     meaningful than those learned from images.
+                g.edata['e'] = self.bn(g.edata['e'])
+                g.update_all(fn.copy_e('e', 'e'), fn.max('e', 'x'))
+            else:
+                g.update_all(self.message, fn.max('e', 'x'))
+            return g.ndata['x']
--- a/python/dgl/nn/pytorch/conv/gatconv.py
+++ b/python/dgl/nn/pytorch/conv/gatconv.py
+"""Torch modules for graph attention networks(GAT)."""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+
+from .... import function as fn
+from ..softmax import edge_softmax
+from ..utils import Identity
+
+# pylint: enable=W0235
+class GATConv(nn.Module):
+    r"""Apply `Graph Attention Network <https://arxiv.org/pdf/1710.10903.pdf>`__
+    over an input signal.
+
+    .. math::
+        h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} \alpha_{i,j} W^{(l)} h_j^{(l)}
+
+    where :math:`\alpha_{ij}` is the attention score between node :math:`i` and
+    node :math:`j`:
+
+    .. math::
+        \alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l})
+
+        e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right)
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    num_heads : int
+        Number of heads in Multi-Head Attention.
+    feat_drop : float, optional
+        Dropout rate on feature, defaults: ``0``.
+    attn_drop : float, optional
+        Dropout rate on attention weight, defaults: ``0``.
+    negative_slope : float, optional
+        LeakyReLU angle of negative slope.
+    residual : bool, optional
+        If True, use residual connection.
+    activation : callable activation function/layer or None, optional.
+        If not None, applies an activation function to the updated node features.
+        Default: ``None``.
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 num_heads,
+                 feat_drop=0.,
+                 attn_drop=0.,
+                 negative_slope=0.2,
+                 residual=False,
+                 activation=None):
+        super(GATConv, self).__init__()
+        self._num_heads = num_heads
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        # A single linear map produces the projections of all heads at once.
+        self.fc = nn.Linear(in_feats, out_feats * num_heads, bias=False)
+        attn_shape = (1, num_heads, out_feats)
+        self.attn_l = nn.Parameter(th.FloatTensor(size=attn_shape))
+        self.attn_r = nn.Parameter(th.FloatTensor(size=attn_shape))
+        self.feat_drop = nn.Dropout(feat_drop)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.leaky_relu = nn.LeakyReLU(negative_slope)
+        if not residual:
+            self.register_buffer('res_fc', None)
+        elif in_feats != out_feats:
+            # Project the input so it can be added to the multi-head output.
+            self.res_fc = nn.Linear(in_feats, num_heads * out_feats, bias=False)
+        else:
+            self.res_fc = Identity()
+        self.reset_parameters()
+        self.activation = activation
+
+    def reset_parameters(self):
+        """Reinitialize learnable parameters."""
+        gain = nn.init.calculate_gain('relu')
+        for param in (self.fc.weight, self.attn_l, self.attn_r):
+            nn.init.xavier_normal_(param, gain=gain)
+        # An Identity (or absent) residual path has nothing to initialize.
+        if isinstance(self.res_fc, nn.Linear):
+            nn.init.xavier_normal_(self.res_fc.weight, gain=gain)
+
+    def forward(self, graph, feat):
+        r"""Compute graph attention network layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
+            is the number of heads, and :math:`D_{out}` is size of output feature.
+        """
+        graph = graph.local_var()
+        h_in = self.feat_drop(feat)
+        projected = self.fc(h_in).view(-1, self._num_heads, self._out_feats)
+        # Per-head attention terms contributed by a node when it is the
+        # source ('el') and when it is the destination ('er') of an edge.
+        attn_src = (projected * self.attn_l).sum(dim=-1).unsqueeze(-1)
+        attn_dst = (projected * self.attn_r).sum(dim=-1).unsqueeze(-1)
+        graph.ndata.update({'ft': projected, 'el': attn_src, 'er': attn_dst})
+        # Unnormalized edge attention: source term plus destination term.
+        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
+        scores = self.leaky_relu(graph.edata.pop('e'))
+        # Normalize the scores with edge_softmax, then apply dropout.
+        graph.edata['a'] = self.attn_drop(edge_softmax(graph, scores))
+        # Attention-weighted sum of the source features.
+        graph.update_all(fn.u_mul_e('ft', 'a', 'm'),
+                         fn.sum('m', 'ft'))
+        rst = graph.ndata['ft']
+        if self.res_fc is not None:
+            residual = self.res_fc(h_in).view(h_in.shape[0], -1, self._out_feats)
+            rst = rst + residual
+        if self.activation:
+            rst = self.activation(rst)
+        return rst
--- a/python/dgl/nn/pytorch/conv/gatedgraphconv.py
+++ b/python/dgl/nn/pytorch/conv/gatedgraphconv.py
+"""Torch Module for Gated Graph Convolution layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+from torch.nn import init
+
+from .... import function as fn
+
+
+class GatedGraphConv(nn.Module):
+    r"""Gated Graph Convolution layer from paper `Gated Graph Sequence
+    Neural Networks <https://arxiv.org/pdf/1511.05493.pdf>`__.
+
+    .. math::
+        h_{i}^{0} & = [ x_i \| \mathbf{0} ]
+
+        a_{i}^{t} & = \sum_{j\in\mathcal{N}(i)} W_{e_{ij}} h_{j}^{t}
+
+        h_{i}^{t+1} & = \mathrm{GRU}(a_{i}^{t}, h_{i}^{t})
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    n_steps : int
+        Number of recurrent steps.
+    n_etypes : int
+        Number of edge types.
+    bias : bool
+        If True, adds a learnable bias to the output. Default: ``True``.
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 n_steps,
+                 n_etypes,
+                 bias=True):
+        super(GatedGraphConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        self._n_steps = n_steps
+        # Each edge type owns a flattened (out_feats x out_feats) matrix.
+        self.edge_embed = nn.Embedding(n_etypes, out_feats * out_feats)
+        self.gru = nn.GRUCell(out_feats, out_feats, bias=bias)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reinitialize learnable parameters."""
+        self.gru.reset_parameters()
+        init.xavier_normal_(self.edge_embed.weight,
+                            gain=init.calculate_gain('relu'))
+
+    def forward(self, graph, feat, etypes):
+        """Compute Gated Graph Convolution layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`N`
+            is the number of nodes of the graph and :math:`D_{in}` is the
+            input feature size.
+        etypes : torch.LongTensor
+            The edge type tensor of shape :math:`(E,)` where :math:`E` is
+            the number of edges of the graph.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is the output feature size.
+        """
+        graph = graph.local_var()
+        dim = self._out_feats
+        # Right-pad the input with zeros up to the output feature size.
+        padding = feat.new_zeros((feat.shape[0], dim - feat.shape[1]))
+        feat = th.cat([feat, padding], -1)
+        # NOTE(zihao): there is still room to optimize, we may do kernel fusion
+        # for such operations in the future.
+        graph.edata['w'] = self.edge_embed(etypes).view(-1, dim, dim)
+        for _ in range(self._n_steps):
+            graph.ndata['h'] = feat.unsqueeze(-1)  # (N, D, 1)
+            graph.update_all(fn.u_mul_e('h', 'w', 'm'),
+                             fn.sum('m', 'a'))
+            aggregated = graph.ndata.pop('a').sum(dim=1)  # (N, D)
+            feat = self.gru(aggregated, feat)
+        return feat
--- a/python/dgl/nn/pytorch/conv/ginconv.py
+++ b/python/dgl/nn/pytorch/conv/ginconv.py
+"""Torch Module for Graph Isomorphism Network layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+
+from .... import function as fn
+
+
+class GINConv(nn.Module):
+    r"""Graph Isomorphism Network layer from paper `How Powerful are Graph
+    Neural Networks? <https://arxiv.org/pdf/1810.00826.pdf>`__.
+
+    .. math::
+        h_i^{(l+1)} = f_\Theta \left((1 + \epsilon) h_i^{l} +
+        \mathrm{aggregate}\left(\left\{h_j^{l}, j\in\mathcal{N}(i)
+        \right\}\right)\right)
+
+    Parameters
+    ----------
+    apply_func : callable activation function/layer or None
+        If not None, apply this function to the updated node feature,
+        the :math:`f_\Theta` in the formula.
+    aggregator_type : str
+        Aggregator type to use (``sum``, ``max`` or ``mean``).
+    init_eps : float, optional
+        Initial :math:`\epsilon` value, default: ``0``.
+    learn_eps : bool, optional
+        If True, :math:`\epsilon` will be a learnable parameter.
+    """
+    def __init__(self,
+                 apply_func,
+                 aggregator_type,
+                 init_eps=0,
+                 learn_eps=False):
+        super(GINConv, self).__init__()
+        self.apply_func = apply_func
+        if aggregator_type not in ('sum', 'max', 'mean'):
+            raise KeyError('Aggregator type {} not recognized.'.format(aggregator_type))
+        # The built-in reduce function has the same name as the aggregator.
+        self._reducer = getattr(fn, aggregator_type)
+        # eps is a trainable parameter when requested, otherwise a buffer.
+        eps_init = th.FloatTensor([init_eps])
+        if learn_eps:
+            self.eps = th.nn.Parameter(eps_init)
+        else:
+            self.register_buffer('eps', eps_init)
+
+    def forward(self, graph, feat):
+        r"""Compute Graph Isomorphism Network layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D)` where :math:`D`
+            could be any positive integer, :math:`N` is the number
+            of nodes. If ``apply_func`` is not None, :math:`D` should
+            fit the input dimensionality requirement of ``apply_func``.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, D_{out})` where
+            :math:`D_{out}` is the output dimensionality of ``apply_func``.
+            If ``apply_func`` is None, :math:`D_{out}` should be the same
+            as input dimensionality.
+        """
+        graph = graph.local_var()
+        graph.ndata['h'] = feat
+        # Copy source features along the edges and reduce them per node.
+        graph.update_all(fn.copy_u('h', 'm'), self._reducer('m', 'neigh'))
+        rst = (1 + self.eps) * feat + graph.ndata['neigh']
+        if self.apply_func is None:
+            return rst
+        return self.apply_func(rst)
--- a/python/dgl/nn/pytorch/conv/gmmconv.py
+++ b/python/dgl/nn/pytorch/conv/gmmconv.py
+"""Torch Module for GMM Conv"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+from torch.nn import init
+
+from .... import function as fn
+from ..utils import Identity
+
+
+class GMMConv(nn.Module):
+    r"""The Gaussian Mixture Model Convolution layer from `Geometric Deep
+    Learning on Graphs and Manifolds using Mixture Model CNNs
+    <http://openaccess.thecvf.com/content_cvpr_2017/papers/Monti_Geometric_Deep_Learning_CVPR_2017_paper.pdf>`__.
+
+    .. math::
+        h_i^{l+1} & = \mathrm{aggregate}\left(\left\{\frac{1}{K}
+         \sum_{k}^{K} w_k(u_{ij}), \forall j\in \mathcal{N}(i)\right\}\right)
+
+        w_k(u) & = \exp\left(-\frac{1}{2}(u-\mu_k)^T \Sigma_k^{-1} (u - \mu_k)\right)
+
+    Parameters
+    ----------
+    in_feats : int
+        Number of input features.
+    out_feats : int
+        Number of output features.
+    dim : int
+        Dimensionality of pseudo-coordinate.
+    n_kernels : int
+        Number of kernels :math:`K`.
+    aggregator_type : str
+        Aggregator type (``sum``, ``mean``, ``max``).
+    residual : bool
+        If True, use residual connection inside this layer.
+    bias : bool
+        If True, adds a learnable bias to the output. Default: ``True``.
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 dim,
+                 n_kernels,
+                 aggregator_type,
+                 residual=True,
+                 bias=True):
+        super(GMMConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        self._dim = dim
+        self._n_kernels = n_kernels
+        if aggregator_type not in ('sum', 'mean', 'max'):
+            raise KeyError("Aggregator type {} not recognized.".format(aggregator_type))
+        # The built-in reduce function has the same name as the aggregator.
+        self._reducer = getattr(fn, aggregator_type)
+
+        # Per-kernel mean and inverse standard deviation along each
+        # pseudo-coordinate dimension.
+        self.mu = nn.Parameter(th.Tensor(n_kernels, dim))
+        self.inv_sigma = nn.Parameter(th.Tensor(n_kernels, dim))
+        # One projection of the input per kernel, computed in a single map.
+        self.fc = nn.Linear(in_feats, n_kernels * out_feats, bias=False)
+        if not residual:
+            self.register_buffer('res_fc', None)
+        elif in_feats != out_feats:
+            self.res_fc = nn.Linear(in_feats, out_feats, bias=False)
+        else:
+            self.res_fc = Identity()
+
+        if bias:
+            self.bias = nn.Parameter(th.Tensor(out_feats))
+        else:
+            self.register_buffer('bias', None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reinitialize learnable parameters."""
+        gain = init.calculate_gain('relu')
+        init.xavier_normal_(self.fc.weight, gain=gain)
+        # An Identity (or absent) residual path has nothing to initialize.
+        if isinstance(self.res_fc, nn.Linear):
+            init.xavier_normal_(self.res_fc.weight, gain=gain)
+        # Means are drawn around 0 and inverse sigmas around 1.
+        init.normal_(self.mu.data, 0, 0.1)
+        init.normal_(self.inv_sigma.data, 1, 0.1)
+        if self.bias is not None:
+            init.zeros_(self.bias.data)
+
+    def forward(self, graph, feat, pseudo):
+        """Compute Gaussian Mixture Model Convolution layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`N`
+            is the number of nodes of the graph and :math:`D_{in}` is the
+            input feature size.
+        pseudo : torch.Tensor
+            The pseudo coordinate tensor of shape :math:`(E, D_{u})` where
+            :math:`E` is the number of edges of the graph and :math:`D_{u}`
+            is the dimensionality of pseudo coordinate.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is the output feature size.
+        """
+        graph = graph.local_var()
+        n_kernels, dim = self._n_kernels, self._dim
+        graph.ndata['h'] = self.fc(feat).view(-1, n_kernels, self._out_feats)
+        E = graph.number_of_edges()
+        # Gaussian weight of every (edge, kernel) pair.
+        offset = pseudo.view(E, 1, dim) - self.mu.view(1, n_kernels, dim)
+        gaussian = -0.5 * (offset ** 2)
+        gaussian = gaussian * (self.inv_sigma.view(1, n_kernels, dim) ** 2)
+        gaussian = th.exp(gaussian.sum(dim=-1, keepdim=True))  # (E, K, 1)
+        graph.edata['w'] = gaussian
+        # Weight the per-kernel source features, reduce them per node and
+        # then sum over the kernel dimension.
+        graph.update_all(fn.u_mul_e('h', 'w', 'm'), self._reducer('m', 'h'))
+        rst = graph.ndata['h'].sum(1)
+        if self.res_fc is not None:
+            rst = rst + self.res_fc(feat)
+        if self.bias is not None:
+            rst = rst + self.bias
+        return rst
--- a/python/dgl/nn/pytorch/conv/graphconv.py
+++ b/python/dgl/nn/pytorch/conv/graphconv.py
+"""Torch modules for graph convolutions(GCN)."""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+from torch.nn import init
+
+from .... import function as fn
+
+# pylint: disable=W0235
+class GraphConv(nn.Module):
+    r"""Apply graph convolution over an input signal.
+
+    Graph convolution is introduced in `GCN <https://arxiv.org/abs/1609.02907>`__
+    and can be described as below:
+
+    .. math::
+      h_i^{(l+1)} = \sigma(b^{(l)} + \sum_{j\in\mathcal{N}(i)}\frac{1}{c_{ij}}h_j^{(l)}W^{(l)})
+
+    where :math:`\mathcal{N}(i)` is the neighbor set of node :math:`i`. :math:`c_{ij}` is equal
+    to the product of the square root of node degrees:
+    :math:`\sqrt{|\mathcal{N}(i)|}\sqrt{|\mathcal{N}(j)|}`. :math:`\sigma` is an activation
+    function.
+
+    The model parameters are initialized as in the
+    `original implementation <https://github.com/tkipf/gcn/blob/master/gcn/layers.py>`__ where
+    the weight :math:`W^{(l)}` is initialized using Glorot uniform initialization
+    and the bias is initialized to be zero.
+
+    Notes
+    -----
+    Zero in degree nodes could lead to invalid normalizer. A common practice
+    to avoid this is to add a self-loop for each node in the graph, which
+    can be achieved by:
+
+    >>> g = ... # some DGLGraph
+    >>> g.add_edges(g.nodes(), g.nodes())
+
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    norm : bool, optional
+        If True, the normalizer :math:`c_{ij}` is applied. Default: ``True``.
+    bias : bool, optional
+        If True, adds a learnable bias to the output. Default: ``True``.
+    activation: callable activation function/layer or None, optional
+        If not None, applies an activation function to the updated node features.
+        Default: ``None``.
+
+    Attributes
+    ----------
+    weight : torch.Tensor
+        The learnable weight tensor.
+    bias : torch.Tensor
+        The learnable bias tensor.
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 norm=True,
+                 bias=True,
+                 activation=None):
+        super(GraphConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        self._norm = norm
+
+        self.weight = nn.Parameter(th.Tensor(in_feats, out_feats))
+        if bias:
+            self.bias = nn.Parameter(th.Tensor(out_feats))
+        else:
+            self.register_parameter('bias', None)
+        self.reset_parameters()
+
+        self._activation = activation
+
+    def reset_parameters(self):
+        """Reinitialize learnable parameters."""
+        init.xavier_uniform_(self.weight)
+        if self.bias is not None:
+            init.zeros_(self.bias)
+
+    def forward(self, graph, feat):
+        r"""Compute graph convolution.
+
+        Notes
+        -----
+        * Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
+          dimensions, :math:`N` is the number of nodes.
+        * Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
+          the same shape as the input.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : torch.Tensor
+            The input feature
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature
+        """
+        # NOTE(review): local_var() is presumably what keeps the temporary 'h' field
+        # from leaking into the caller's graph -- confirm against the DGLGraph API.
+        graph = graph.local_var()
+        if self._norm:
+            # In-degrees are clamped to at least 1 so the inverse square root stays finite.
+            degs = graph.in_degrees().float().clamp(min=1)
+            norm = th.pow(degs, -0.5)
+            # Append singleton axes so the per-node factor broadcasts over feat's other dims.
+            shp = norm.shape + (1,) * (feat.dim() - 1)
+            norm = th.reshape(norm, shp).to(feat.device)
+            feat = feat * norm
+
+        # Multiply by W before aggregating only when that shrinks the feature size;
+        # otherwise aggregate first and multiply afterwards.
+        mult_first = self._in_feats > self._out_feats
+        if mult_first:
+            feat = th.matmul(feat, self.weight)
+        graph.ndata['h'] = feat
+        graph.update_all(fn.copy_src(src='h', out='m'),
+                         fn.sum(msg='m', out='h'))
+        rst = graph.ndata['h']
+        if not mult_first:
+            rst = th.matmul(rst, self.weight)
+
+        # Second half of the symmetric normalization, applied on the destination side.
+        if self._norm:
+            rst = rst * norm
+        if self.bias is not None:
+            rst = rst + self.bias
+        if self._activation is not None:
+            rst = self._activation(rst)
+        return rst
+
+    def extra_repr(self):
+        """Build the layer summary that is shown when the model is printed.
+
+        Returns
+        -------
+        str
+            Comma-separated ``key=value`` description of the layer settings.
+        """
+        fields = ['in={_in_feats}', 'out={_out_feats}', 'normalization={_norm}']
+        if '_activation' in self.__dict__:
+            fields.append('activation={_activation}')
+        # The placeholders are resolved straight from the instance attributes.
+        return ', '.join(fields).format(**self.__dict__)
--- a/python/dgl/nn/pytorch/conv/nnconv.py
+++ b/python/dgl/nn/pytorch/conv/nnconv.py
+"""Torch Module for NNConv layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+from torch.nn import init
+
+from .... import function as fn
+from ..utils import Identity
+
+
+class NNConv(nn.Module):
+    r"""Graph Convolution layer introduced in `Neural Message Passing
+    for Quantum Chemistry <https://arxiv.org/pdf/1704.01212.pdf>`__.
+
+    .. math::
+        h_{i}^{l+1} = h_{i}^{l} + \mathrm{aggregate}\left(\left\{
+        f_\Theta (e_{ij}) \cdot h_j^{l}, j\in \mathcal{N}(i) \right\}\right)
+
+    The :math:`h_{i}^{l}` term is only added when ``residual`` is enabled; a
+    bias-free linear projection is applied to it when the input and output sizes differ.
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    edge_func : callable activation function/layer
+        The :math:`f_\Theta` in the formula. Maps each edge feature to a vector
+        with ``in_feats * out_feats`` entries, which is used as the weight
+        for computing the message on that edge.
+    aggregator_type : str
+        Aggregator type to use (``sum``, ``mean`` or ``max``).
+    residual : bool, optional
+        If True, use residual connection. Default: ``False``.
+    bias : bool, optional
+        If True, adds a learnable bias to the output. Default: ``True``.
+
+    Raises
+    ------
+    KeyError
+        If ``aggregator_type`` is not one of the supported names.
+    """
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 edge_func,
+                 aggregator_type,
+                 residual=False,
+                 bias=True):
+        super(NNConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        self.edge_nn = edge_func
+        # Pick the builtin reduce function that matches the requested aggregator.
+        for agg_name, agg_fn in (('sum', fn.sum), ('mean', fn.mean), ('max', fn.max)):
+            if aggregator_type == agg_name:
+                self.reducer = agg_fn
+                break
+        else:
+            raise KeyError('Aggregator type {} not recognized: '.format(aggregator_type))
+        self._aggre_type = aggregator_type
+        # Residual path: absent, identity, or a bias-free projection to out_feats.
+        if not residual:
+            self.register_buffer('res_fc', None)
+        elif in_feats == out_feats:
+            self.res_fc = Identity()
+        else:
+            self.res_fc = nn.Linear(in_feats, out_feats, bias=False)
+        if not bias:
+            self.register_buffer('bias', None)
+        else:
+            self.bias = nn.Parameter(th.Tensor(out_feats))
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reinitialize learnable parameters.
+
+        The bias (if any) is zeroed, and the residual projection (if it is a
+        linear layer) gets Xavier normal weights scaled by the ReLU gain.
+        ``edge_nn`` is left untouched.
+        """
+        relu_gain = init.calculate_gain('relu')
+        if self.bias is not None:
+            init.zeros_(self.bias)
+        if isinstance(self.res_fc, nn.Linear):
+            init.xavier_normal_(self.res_fc.weight, gain=relu_gain)
+
+    def forward(self, graph, feat, efeat):
+        r"""Compute MPNN Graph Convolution layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : torch.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`N`
+            is the number of nodes of the graph and :math:`D_{in}` is the
+            input feature size.
+        efeat : torch.Tensor
+            The edge feature, one row per edge of the graph (it is turned into
+            per-edge weights stored in ``graph.edata``). It should fit the input
+            shape requirement of ``edge_nn``.
+
+        Returns
+        -------
+        torch.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is the output feature size.
+        """
+        graph = graph.local_var()
+        # node features with a trailing singleton axis: (N, d_in, 1)
+        graph.ndata['h'] = feat.unsqueeze(-1)
+        # one weight matrix per edge: (E, d_in, d_out)
+        edge_weight = self.edge_nn(efeat)
+        graph.edata['w'] = edge_weight.view(-1, self._in_feats, self._out_feats)
+        # source feature times edge weight, reduced over the incoming edges of each node
+        graph.update_all(fn.u_mul_e('h', 'w', 'm'), self.reducer('m', 'neigh'))
+        # summing over the d_in axis completes the vector-matrix product: (N, d_out)
+        rst = graph.ndata.pop('neigh').sum(dim=1)
+        # residual connection
+        if self.res_fc is not None:
+            rst = rst + self.res_fc(feat)
+        # bias
+        if self.bias is not None:
+            rst = rst + self.bias
+        return rst
--- a/python/dgl/nn/pytorch/conv/relgraphconv.py
+++ b/python/dgl/nn/pytorch/conv/relgraphconv.py
+"""Torch Module for Relational graph convolution layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import torch as th
+from torch import nn
+
+from .... import function as fn
+from .. import utils
+
+
+class RelGraphConv(nn.Module):
+    r"""Relational graph convolution layer.
+
+    Relational graph convolution is introduced in "`Modeling Relational Data with Graph
+    Convolutional Networks <https://arxiv.org/abs/1703.06103>`__"
+    and can be described as below:
+
+    .. math::
+
+       h_i^{(l+1)} = \sigma(\sum_{r\in\mathcal{R}}
+       \sum_{j\in\mathcal{N}^r(i)}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}+W_0^{(l)}h_i^{(l)})
+
+    where :math:`\mathcal{N}^r(i)` is the neighbor set of node :math:`i` w.r.t. relation
+    :math:`r`. :math:`c_{i,r}` is the normalizer equal
+    to :math:`|\mathcal{N}^r(i)|`. :math:`\sigma` is an activation function. :math:`W_0`
+    is the self-loop weight.
+
+    The basis regularization decomposes :math:`W_r` by:
+
+    .. math::
+
+       W_r^{(l)} = \sum_{b=1}^B a_{rb}^{(l)}V_b^{(l)}
+
+    where :math:`B` is the number of bases.
+
+    The block-diagonal-decomposition regularization decomposes :math:`W_r` into :math:`B`
+    number of block diagonal matrices. We refer :math:`B` as the number of bases.
+
+    Parameters
+    ----------
+    in_feat : int
+        Input feature size.
+    out_feat : int
+        Output feature size.
+    num_rels : int
+        Number of relations.
+    regularizer : str
+        Which weight regularizer to use "basis" or "bdd"
+    num_bases : int, optional
+        Number of bases. If is none, use number of relations. Default: None.
+    bias : bool, optional
+        True if bias is added. Default: True
+    activation : callable, optional
+        Activation function. Default: None
+    self_loop : bool, optional
+        True to include self loop message. Default: False
+    dropout : float, optional
+        Dropout rate. Default: 0.0
+    """
+    def __init__(self,
+                 in_feat,
+                 out_feat,
+                 num_rels,
+                 regularizer="basis",
+                 num_bases=None,
+                 bias=True,
+                 activation=None,
+                 self_loop=False,
+                 dropout=0.0):
+        """Create the parameters required by the selected weight regularizer.
+
+        ``num_bases`` falls back to ``num_rels`` when it is None, not positive,
+        or larger than ``num_rels``.
+
+        Raises
+        ------
+        ValueError
+            If ``regularizer`` is neither ``"basis"`` nor ``"bdd"``, or if "bdd" is
+            selected and ``in_feat``/``out_feat`` are not divisible by the
+            effective number of bases.
+        """
+        super(RelGraphConv, self).__init__()
+        self.in_feat = in_feat
+        self.out_feat = out_feat
+        self.num_rels = num_rels
+        self.regularizer = regularizer
+        self.num_bases = num_bases
+        # Zero bases is as meaningless as a negative count (it would yield empty
+        # parameters or a division by zero), so it takes the fallback as well.
+        if self.num_bases is None or self.num_bases > self.num_rels or self.num_bases <= 0:
+            self.num_bases = self.num_rels
+        self.bias = bias
+        self.activation = activation
+        self.self_loop = self_loop
+
+        gain = nn.init.calculate_gain('relu')
+        if regularizer == "basis":
+            # add basis weights
+            self.weight = nn.Parameter(th.Tensor(self.num_bases, self.in_feat, self.out_feat))
+            nn.init.xavier_uniform_(self.weight, gain=gain)
+            if self.num_bases < self.num_rels:
+                # linear combination coefficients, only needed when bases are shared
+                self.w_comp = nn.Parameter(th.Tensor(self.num_rels, self.num_bases))
+                nn.init.xavier_uniform_(self.w_comp, gain=gain)
+            # message func
+            self.message_func = self.basis_message_func
+        elif regularizer == "bdd":
+            # Validate against the effective number of bases: the raw argument may be
+            # None (the default) or out of range.
+            if in_feat % self.num_bases != 0 or out_feat % self.num_bases != 0:
+                raise ValueError('Feature size must be a multiplier of num_bases.')
+            # add block diagonal weights
+            self.submat_in = in_feat // self.num_bases
+            self.submat_out = out_feat // self.num_bases
+
+            # one flattened set of num_bases blocks per relation
+            self.weight = nn.Parameter(th.Tensor(
+                self.num_rels, self.num_bases * self.submat_in * self.submat_out))
+            nn.init.xavier_uniform_(self.weight, gain=gain)
+            # message func
+            self.message_func = self.bdd_message_func
+        else:
+            raise ValueError("Regularizer must be either 'basis' or 'bdd'")
+
+        # bias
+        if self.bias:
+            self.h_bias = nn.Parameter(th.Tensor(out_feat))
+            nn.init.zeros_(self.h_bias)
+
+        # weight for self loop
+        if self.self_loop:
+            self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat))
+            nn.init.xavier_uniform_(self.loop_weight, gain=gain)
+
+        self.dropout = nn.Dropout(dropout)
+
+    def basis_message_func(self, edges):
+        """Message function for basis regularizer
+
+        Reads the source feature ``'h'`` and the edge field ``'type'``, scales the
+        result by the edge field ``'norm'`` when it is present, and returns the
+        message under the key ``'msg'``.
+        """
+        weight = self.weight
+        if self.num_bases < self.num_rels:
+            # Combine the shared bases into one weight matrix per relation.
+            flat_bases = weight.view(self.num_bases, self.in_feat * self.out_feat)
+            combined = th.matmul(self.w_comp, flat_bases)
+            weight = combined.view(self.num_rels, self.in_feat, self.out_feat)
+
+        msg = utils.bmm_maybe_select(edges.src['h'], weight, edges.data['type'])
+        if 'norm' in edges.data:
+            msg = msg * edges.data['norm']
+        return {'msg': msg}
+
+    def bdd_message_func(self, edges):
+        """Message function for block-diagonal-decomposition regularizer
+
+        Raises
+        ------
+        TypeError
+            If the source feature ``'h'`` is a 1-D int64 tensor (integer ID input).
+        """
+        src_feat = edges.src['h']
+        if src_feat.dtype == th.int64 and src_feat.dim() == 1:
+            raise TypeError('Block decomposition does not allow integer ID feature.')
+        # Gather the flattened blocks of each edge's relation and unfold them.
+        blocks = self.weight.index_select(0, edges.data['type'])
+        blocks = blocks.view(-1, self.submat_in, self.submat_out)
+        # Split every source feature into chunks of submat_in, one chunk per block.
+        chunks = src_feat.view(-1, 1, self.submat_in)
+        msg = th.bmm(chunks, blocks).view(-1, self.out_feat)
+        if 'norm' in edges.data:
+            msg = msg * edges.data['norm']
+        return {'msg': msg}
+
+    def forward(self, g, x, etypes, norm=None):
+        """ Forward computation
+
+        Parameters
+        ----------
+        g : DGLGraph
+            The graph.
+        x : torch.Tensor
+            Input node features. Could be either
+                * :math:`(|V|, D)` dense tensor
+                * :math:`(|V|,)` int64 vector, representing the categorical values of each
+                  node. We then treat the input feature as an one-hot encoding feature.
+        etypes : torch.Tensor
+            Edge type tensor. Shape: :math:`(|E|,)`
+        norm : torch.Tensor
+            Optional edge normalizer tensor. Shape: :math:`(|E|, 1)`
+
+        Returns
+        -------
+        torch.Tensor
+            New node features.
+        """
+        g = g.local_var()
+        g.ndata['h'] = x
+        g.edata['type'] = etypes
+        if norm is not None:
+            g.edata['norm'] = norm
+        # The self-loop term is taken from the raw input, before 'h' is overwritten
+        # by the aggregation below.
+        loop_message = None
+        if self.self_loop:
+            loop_message = utils.matmul_maybe_select(x, self.loop_weight)
+        # message passing
+        g.update_all(self.message_func, fn.sum(msg='msg', out='h'))
+        out = g.ndata['h']
+        # bias, self loop, activation and dropout, in that order
+        if self.bias:
+            out = out + self.h_bias
+        if self.self_loop:
+            out = out + loop_message
+        if self.activation:
+            out = self.activation(out)
+        return self.dropout(out)