[Tensorflow] Several nn & example (#1191)

* several nn example * appnp * fix lint * lint * add dgi * fix * fix * fix * fff * docs * 111 * fix * change init * change result * tiaocan+1 * fix * fix lint * fix * fix

[Tensorflow] Several nn & example (#1191)
* several nn example * appnp * fix lint * lint * add dgi * fix * fix * fix * fff * docs * 111 * fix * change init * change result * tiaocan+1 * fix * fix lint * fix * fix
a00636a0 · VoVAllen · GitHub · 31a7d509 · a00636a0 · a00636a0
Unverified Commit a00636a0 authored Jan 19, 2020 by VoVAllen Committed by GitHub Jan 19, 2020
16 changed files
--- a/examples/tensorflow/rgcn/utils.py
+++ b/examples/tensorflow/rgcn/utils.py
+"""
+Utility functions for link prediction
+Most code is adapted from authors' implementation of RGCN link prediction:
+https://github.com/MichSchli/RelationPrediction
+
+"""
+
+import numpy as np
+import tensorflow as tf
+import dgl
+
+#######################################################################
+#
+# Utility function for building training and testing graphs
+#
+#######################################################################
+
+def get_adj_and_degrees(num_nodes, triplets):
+    """Build a per-node incidence list and the node degrees.
+
+    Every triplet ``(src, rel, dst)`` with index ``i`` contributes the entry
+    ``[i, dst]`` to the list of ``src`` and ``[i, src]`` to the list of
+    ``dst``, i.e. edges are treated as undirected for this purpose.
+
+    Parameters
+    ----------
+    num_nodes : int
+        Number of nodes; node ids index into the returned list.
+    triplets : iterable
+        Rows whose element 0 is the source id and element 2 the destination id.
+
+    Returns
+    -------
+    adj_list : list of np.ndarray
+        One array of ``[edge id, neighbor id]`` rows per node.
+    degrees : np.ndarray
+        Number of incidence entries of each node.
+    """
+    incidence = [[] for _ in range(num_nodes)]
+    for edge_id, triplet in enumerate(triplets):
+        head, tail = triplet[0], triplet[2]
+        incidence[head].append([edge_id, tail])
+        incidence[tail].append([edge_id, head])
+
+    degrees = np.array([len(entries) for entries in incidence])
+    adj_list = [np.array(entries) for entries in incidence]
+    return adj_list, degrees
+
+def sample_edge_neighborhood(adj_list, degrees, n_triplets, sample_size):
+    """Sample edges by neighborhood expansion.
+
+    Edges are drawn from vertices that were already reached by a previously
+    sampled edge, which keeps the sampled edges connected and may help
+    deeper GNNs that require information from more than one hop. When no
+    reached vertex has an unpicked edge left (including the very first
+    draw), sampling restarts from any vertex that still has unpicked edges.
+
+    Parameters
+    ----------
+    adj_list : list of np.ndarray
+        Per-vertex arrays of ``[edge id, neighbor id]`` rows.
+    degrees : np.ndarray
+        Number of incident edges of each vertex.
+    n_triplets : int
+        Total number of edges that can be sampled.
+    sample_size : int
+        Number of edges to sample.
+
+    Returns
+    -------
+    np.ndarray
+        ``int32`` array of ``sample_size`` distinct edge ids.
+    """
+    # NOTE(review): appears to assume sample_size <= n_triplets; once every
+    # edge is picked all weights are zero and no vertex can be drawn.
+    edges = np.zeros((sample_size), dtype=np.int32)
+
+    # sample_counts: unpicked incident edges left per vertex
+    # picked: edges already sampled; seen: vertices already reached
+    sample_counts = np.array([d for d in degrees])
+    picked = np.array([False for _ in range(n_triplets)])
+    seen = np.array([False for _ in degrees])
+
+    for i in range(0, sample_size):
+        # only reached vertices are eligible, weighted by remaining edges
+        weights = sample_counts * seen
+
+        if np.sum(weights) == 0:
+            # nothing reachable: restart uniformly among vertices that
+            # still have unpicked edges
+            weights = np.ones_like(weights)
+            weights[np.where(sample_counts == 0)] = 0
+
+        probabilities = (weights) / np.sum(weights)
+        chosen_vertex = np.random.choice(np.arange(degrees.shape[0]),
+                                         p=probabilities)
+        chosen_adj_list = adj_list[chosen_vertex]
+        seen[chosen_vertex] = True
+
+        chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
+        chosen_edge = chosen_adj_list[chosen_edge]
+        edge_number = chosen_edge[0]
+
+        # rejection sampling: redraw until an unpicked incident edge is hit
+        while picked[edge_number]:
+            chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
+            chosen_edge = chosen_adj_list[chosen_edge]
+            edge_number = chosen_edge[0]
+
+        edges[i] = edge_number
+        other_vertex = chosen_edge[1]
+        picked[edge_number] = True
+        # the edge is consumed at both of its endpoints
+        sample_counts[chosen_vertex] -= 1
+        sample_counts[other_vertex] -= 1
+        seen[other_vertex] = True
+
+    return edges
+
+def sample_edge_uniform(adj_list, degrees, n_triplets, sample_size):
+    """Draw ``sample_size`` distinct edge ids uniformly from ``n_triplets`` edges.
+
+    ``adj_list`` and ``degrees`` are unused; they are accepted so that this
+    sampler has the same signature as the neighborhood sampler.
+    """
+    candidate_ids = np.arange(n_triplets)
+    chosen = np.random.choice(candidate_ids, size=sample_size, replace=False)
+    return chosen
+
+def generate_sampled_graph_and_labels(triplets, sample_size, split_size,
+                                      num_rels, adj_list, degrees,
+                                      negative_rate, sampler="uniform"):
+    """Sample a training graph together with positive/negative samples.
+
+    Edges are sampled first (uniformly or by neighborhood expansion), their
+    endpoints are relabeled to consecutive ids, negative samples are
+    generated, and finally a random fraction of the sampled edges is used to
+    build the DGL graph.
+
+    Returns
+    -------
+    tuple
+        ``(g, uniq_v, rel, norm, samples, labels)`` where ``uniq_v`` holds the
+        original ids of the relabeled nodes, ``rel`` and ``norm`` come from
+        the graph construction, and ``samples``/``labels`` come from the
+        negative sampling step.
+
+    Raises
+    ------
+    ValueError
+        If ``sampler`` is neither ``"uniform"`` nor ``"neighbor"``.
+    """
+    # pick the edge sampler
+    if sampler == "uniform":
+        sample_fn = sample_edge_uniform
+    elif sampler == "neighbor":
+        sample_fn = sample_edge_neighborhood
+    else:
+        raise ValueError("Sampler type must be either 'uniform' or 'neighbor'.")
+    edge_ids = sample_fn(adj_list, degrees, len(triplets), sample_size)
+
+    # map the endpoints of the sampled edges onto 0..len(uniq_v)-1
+    sampled = triplets[edge_ids]
+    src, rel, dst = sampled.transpose()
+    uniq_v, inverse = np.unique((src, dst), return_inverse=True)
+    src, dst = np.reshape(inverse, (2, -1))
+    relabeled_edges = np.stack((src, rel, dst)).transpose()
+    num_sampled_nodes = len(uniq_v)
+
+    # corrupt the relabeled positives into negatives
+    samples, labels = negative_sampling(relabeled_edges, num_sampled_nodes,
+                                        negative_rate)
+
+    # only a fraction of the sampled edges forms the graph structure; the
+    # remaining ones act as unseen positive samples
+    num_graph_edges = int(sample_size * split_size)
+    kept = np.random.choice(np.arange(sample_size),
+                            size=num_graph_edges, replace=False)
+    src, dst, rel = src[kept], dst[kept], rel[kept]
+
+    # assemble the DGL graph
+    print("# sampled nodes: {}".format(num_sampled_nodes))
+    print("# sampled edges: {}".format(len(src) * 2))
+    g, rel, norm = build_graph_from_triplets(num_sampled_nodes, num_rels,
+                                             (src, rel, dst))
+    return g, uniq_v, rel, norm, samples, labels
+
+def comp_deg_norm(g):
+    """Return the reciprocal in-degree of every node of ``g``.
+
+    Nodes without incoming edges get a normalizer of 0 instead of infinity.
+
+    Parameters
+    ----------
+    g : DGLGraph
+        The graph.
+
+    Returns
+    -------
+    np.ndarray
+        ``float32`` array with one entry per node.
+    """
+    g = g.local_var()
+    # Convert through NumPy: ``.float()`` is a PyTorch tensor method and is
+    # not available on the tensors of the TensorFlow backend.
+    in_deg = g.in_degrees(range(g.number_of_nodes())).numpy().astype(np.float32)
+    norm = np.zeros_like(in_deg)
+    # divide only where the degree is positive; other entries stay 0
+    np.divide(1.0, in_deg, out=norm, where=in_deg > 0)
+    return norm
+
+def build_graph_from_triplets(num_nodes, num_rels, triplets):
+    """Create a bidirectional DGL graph from ``(src, rel, dst)`` arrays.
+
+    Every edge is added together with its reverse, because the RGCN authors
+    use reversed relations; a reversed edge gets relation id
+    ``rel + num_rels``. Edges are inserted sorted by
+    ``(dst, src, rel)``.
+
+    Returns
+    -------
+    tuple
+        ``(g, rel, norm)``: the graph, the relation id of every edge in
+        insertion order, and the per-node normalization factor (reciprocal
+        of the node in-degree).
+    """
+    heads, rels, tails = triplets
+    # forward edges followed by their reverses
+    all_src = np.concatenate((heads, tails))
+    all_dst = np.concatenate((tails, heads))
+    all_rel = np.concatenate((rels, rels + num_rels))
+    ordered = sorted(zip(all_dst, all_src, all_rel))
+    dst, src, rel = np.array(ordered).transpose()
+
+    g = dgl.DGLGraph()
+    g.add_nodes(num_nodes)
+    g.add_edges(src, dst)
+    norm = comp_deg_norm(g)
+    print("# nodes: {}, # edges: {}".format(num_nodes, len(src)))
+    return g, rel, norm
+
+def build_test_graph(num_nodes, num_rels, edges):
+    """Build the graph used for testing from an array of triplet rows.
+
+    ``edges`` is transposed into its source, relation and destination
+    columns, which are then handed to ``build_graph_from_triplets``.
+    """
+    heads, rels, tails = edges.transpose()
+    print("Test graph:")
+    test_triplets = (heads, rels, tails)
+    return build_graph_from_triplets(num_nodes, num_rels, test_triplets)
+
+def negative_sampling(pos_samples, num_entity, negative_rate):
+    """Generate corrupted triplets for a batch of positive triplets.
+
+    Each positive triplet is repeated ``negative_rate`` times and, in every
+    copy, either the subject (column 0) or the object (column 2) is replaced
+    by a random entity id in ``[0, num_entity)``.
+
+    Returns
+    -------
+    samples : np.ndarray
+        The positive triplets followed by the corrupted ones.
+    labels : np.ndarray
+        ``float32`` vector, 1 for the positives and 0 for the negatives.
+    """
+    batch = len(pos_samples)
+    total_neg = batch * negative_rate
+    corrupted = np.tile(pos_samples, (negative_rate, 1))
+
+    labels = np.zeros(batch + total_neg, dtype=np.float32)
+    labels[:batch] = 1
+
+    replacements = np.random.randint(num_entity, size=total_neg)
+    coin = np.random.uniform(size=total_neg)
+    # coin decides which end of each copied triplet gets replaced
+    corrupt_subj = coin > 0.5
+    corrupt_obj = coin <= 0.5
+    corrupted[corrupt_subj, 0] = replacements[corrupt_subj]
+    corrupted[corrupt_obj, 2] = replacements[corrupt_obj]
+
+    samples = np.concatenate((pos_samples, corrupted))
+    return samples, labels
+
--- a/python/dgl/nn/tensorflow/__init__.py
+++ b/python/dgl/nn/tensorflow/__init__.py
+"""Package for Tensorflow-specific NN modules."""
+from .conv import *
+from .softmax import *
+from .utils import *
+from .glob import *
--- a/python/dgl/nn/tensorflow/conv/__init__.py
+++ b/python/dgl/nn/tensorflow/conv/__init__.py
+"""TF NN conv module"""
+from .gatconv import GATConv
+from .relgraphconv import RelGraphConv
+from .graphconv import GraphConv
+from .ginconv import GINConv
+from .sageconv import SAGEConv
+from .sgconv import SGConv
+from .appnpconv import APPNPConv
--- a/python/dgl/nn/tensorflow/conv/appnpconv.py
+++ b/python/dgl/nn/tensorflow/conv/appnpconv.py
+"""TF Module for APPNPConv"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import tensorflow as tf
+from tensorflow.keras import layers
+import numpy as np
+
+from .... import function as fn
+
+
+class APPNPConv(layers.Layer):
+    r"""Approximate Personalized Propagation of Neural Predictions
+    layer from paper `Predict then Propagate: Graph Neural Networks
+    meet Personalized PageRank <https://arxiv.org/pdf/1810.05997.pdf>`__.
+
+    .. math::
+        H^{0} & = X
+
+        H^{t+1} & = (1-\alpha)\left(\hat{D}^{-1/2}
+        \hat{A} \hat{D}^{-1/2} H^{t}\right) + \alpha H^{0}
+
+    Parameters
+    ----------
+    k : int
+        Number of iterations :math:`K`.
+    alpha : float
+        The teleport probability :math:`\alpha`.
+    edge_drop : float, optional
+        Dropout rate on edges that controls the
+        messages received by each node. Default: ``0``.
+    """
+
+    def __init__(self,
+                 k,
+                 alpha,
+                 edge_drop=0.):
+        super(APPNPConv, self).__init__()
+        self._k = k
+        self._alpha = alpha
+        self.edge_drop = layers.Dropout(edge_drop)
+
+    def call(self, graph, feat):
+        r"""Compute APPNP layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature of shape :math:`(N, *)` :math:`N` is the
+            number of nodes, and :math:`*` could be of any shape.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature of shape :math:`(N, *)` where :math:`*`
+            should be the same as input shape.
+        """
+        graph = graph.local_var()
+        # clip degrees to at least 1 so isolated nodes do not divide by zero
+        degs = tf.clip_by_value(tf.cast(graph.in_degrees(), tf.float32),
+                                clip_value_min=1, clip_value_max=np.inf)
+        norm = tf.pow(degs, -0.5)
+        # append singleton dims so norm broadcasts over the feature dims
+        shp = norm.shape + (1,) * (feat.ndim - 1)
+        norm = tf.reshape(norm, shp)
+        feat_0 = feat
+        for _ in range(self._k):
+            # normalization by src node
+            feat = feat * norm
+            graph.ndata['h'] = feat
+            # One weight per edge with shape (E, 1). The shape has to be a
+            # single tuple: the second positional argument of tf.ones is the
+            # dtype, not another dimension. A fresh dropout mask is drawn in
+            # every iteration.
+            graph.edata['w'] = self.edge_drop(
+                tf.ones((graph.number_of_edges(), 1)))
+            graph.update_all(fn.u_mul_e('h', 'w', 'm'),
+                             fn.sum('m', 'h'))
+            feat = graph.ndata.pop('h')
+            # normalization by dst node
+            feat = feat * norm
+            # teleport back to the input features with probability alpha
+            feat = (1 - self._alpha) * feat + self._alpha * feat_0
+        return feat
--- a/python/dgl/nn/tensorflow/conv/gatconv.py
+++ b/python/dgl/nn/tensorflow/conv/gatconv.py
+"""Tensorflow modules for graph attention networks(GAT)."""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import tensorflow as tf
+from tensorflow.keras import layers
+import numpy as np
+
+from .... import function as fn
+from ..softmax import edge_softmax
+from ..utils import Identity
+
+# pylint: enable=W0235
+
+
+class GATConv(layers.Layer):
+    r"""Apply `Graph Attention Network <https://arxiv.org/pdf/1710.10903.pdf>`__
+    over an input signal.
+
+    .. math::
+        h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} \alpha_{i,j} W^{(l)} h_j^{(l)}
+
+    where :math:`\alpha_{ij}` is the attention score between node :math:`i` and
+    node :math:`j`:
+
+    .. math::
+        \alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l})
+
+        e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right)
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    num_heads : int
+        Number of heads in Multi-Head Attention.
+    feat_drop : float, optional
+        Dropout rate on feature, defaults: ``0``.
+    attn_drop : float, optional
+        Dropout rate on attention weight, defaults: ``0``.
+    negative_slope : float, optional
+        LeakyReLU angle of negative slope.
+    residual : bool, optional
+        If True, use residual connection.
+    activation : callable activation function/layer or None, optional.
+        If not None, applies an activation function to the updated node features.
+        Default: ``None``.
+    """
+
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 num_heads,
+                 feat_drop=0.,
+                 attn_drop=0.,
+                 negative_slope=0.2,
+                 residual=False,
+                 activation=None):
+        super(GATConv, self).__init__()
+        self._num_heads = num_heads
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        # one initializer shared by the projection and the attention vectors
+        xinit = tf.keras.initializers.VarianceScaling(scale=np.sqrt(
+            2), mode="fan_avg", distribution="untruncated_normal")
+        # a single dense layer produces the projections of all heads at once
+        self.fc = layers.Dense(
+            out_feats * num_heads, use_bias=False, kernel_initializer=xinit)
+        # per-head attention vectors applied to the source (attn_l) and
+        # destination (attn_r) node projections
+        self.attn_l = tf.Variable(initial_value=xinit(
+            shape=(1, num_heads, out_feats), dtype='float32'), trainable=True)
+
+        self.attn_r = tf.Variable(initial_value=xinit(
+            shape=(1, num_heads, out_feats), dtype='float32'), trainable=True)
+        self.feat_drop = layers.Dropout(rate=feat_drop)
+        self.attn_drop = layers.Dropout(rate=attn_drop)
+        self.leaky_relu = layers.LeakyReLU(alpha=negative_slope)
+        if residual:
+            if in_feats != out_feats:
+                # project the input so it can be added to the output
+                self.res_fc = layers.Dense(
+                    num_heads * out_feats, use_bias=False, kernel_initializer=xinit)
+            else:
+                self.res_fc = Identity()
+        else:
+            self.res_fc = None
+            # no residual branch; call() checks for None
+        self.activation = activation
+
+    def call(self, graph, feat):
+        r"""Compute graph attention network layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
+            is the number of heads, and :math:`D_{out}` is size of output feature.
+        """
+        graph = graph.local_var()
+        h = self.feat_drop(feat)
+        # project and split into heads: (N, H, D_out)
+        feat = tf.reshape(self.fc(h), (-1, self._num_heads, self._out_feats))
+        # per-node attention terms, one scalar per head: (N, H, 1)
+        el = tf.reduce_sum(feat * self.attn_l, axis=-1, keepdims=True)
+        er = tf.reduce_sum(feat * self.attn_r, axis=-1, keepdims=True)
+        graph.ndata.update({'ft': feat, 'el': el, 'er': er})
+        # compute edge attention
+        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
+        e = self.leaky_relu(graph.edata.pop('e'))
+        # compute softmax
+        graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
+        # message passing
+        graph.update_all(fn.u_mul_e('ft', 'a', 'm'),
+                         fn.sum('m', 'ft'))
+        rst = graph.ndata['ft']
+        # residual
+        if self.res_fc is not None:
+            # the residual is computed from the dropped-out input features
+            resval = tf.reshape(self.res_fc(
+                h), (h.shape[0], -1, self._out_feats))
+            rst = rst + resval
+        # activation
+        if self.activation:
+            rst = self.activation(rst)
+        return rst
--- a/python/dgl/nn/tensorflow/conv/ginconv.py
+++ b/python/dgl/nn/tensorflow/conv/ginconv.py
+"""Tensorflow Module for Graph Isomorphism Network layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from .... import function as fn
+
+
+class GINConv(layers.Layer):
+    r"""Graph Isomorphism Network layer from paper `How Powerful are Graph
+    Neural Networks? <https://arxiv.org/pdf/1810.00826.pdf>`__.
+
+    .. math::
+        h_i^{(l+1)} = f_\Theta \left((1 + \epsilon) h_i^{l} +
+        \mathrm{aggregate}\left(\left\{h_j^{l}, j\in\mathcal{N}(i)
+        \right\}\right)\right)
+
+    Parameters
+    ----------
+    apply_func : callable activation function/layer or None
+        If not None, apply this function to the updated node feature,
+        the :math:`f_\Theta` in the formula.
+    aggregator_type : str
+        Aggregator type to use (``sum``, ``max`` or ``mean``).
+    init_eps : float, optional
+        Initial :math:`\epsilon` value, default: ``0``.
+    learn_eps : bool, optional
+        If True, :math:`\epsilon` will be a learnable parameter.
+    """
+    def __init__(self,
+                 apply_func,
+                 aggregator_type,
+                 init_eps=0,
+                 learn_eps=False):
+        super(GINConv, self).__init__()
+        self.apply_func = apply_func
+        # reject unknown aggregators up front, then look the reducer up by name
+        if aggregator_type not in ('sum', 'max', 'mean'):
+            raise KeyError('Aggregator type {} not recognized.'.format(aggregator_type))
+        self._reducer = getattr(fn, aggregator_type)
+        # eps is always stored as a variable; learn_eps decides whether it is trained
+        self.eps = tf.Variable(initial_value=[init_eps], dtype=tf.float32, trainable=learn_eps)
+
+    def call(self, graph, feat):
+        r"""Compute Graph Isomorphism Network layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature of shape :math:`(N, D)` where :math:`D`
+            could be any positive integer, :math:`N` is the number
+            of nodes. If ``apply_func`` is not None, :math:`D` should
+            fit the input dimensionality requirement of ``apply_func``.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature of shape :math:`(N, D_{out})` where
+            :math:`D_{out}` is the output dimensionality of ``apply_func``.
+            If ``apply_func`` is None, :math:`D_{out}` should be the same
+            as input dimensionality.
+        """
+        graph = graph.local_var()
+        graph.ndata['h'] = feat
+        # aggregate the neighbor features into 'neigh'
+        graph.update_all(fn.copy_u('h', 'm'), self._reducer('m', 'neigh'))
+        aggregated = graph.ndata['neigh']
+        combined = (1 + self.eps) * feat + aggregated
+        if self.apply_func is None:
+            return combined
+        return self.apply_func(combined)
--- a/python/dgl/nn/tensorflow/conv/graphconv.py
+++ b/python/dgl/nn/tensorflow/conv/graphconv.py
+"""Tensorflow modules for graph convolutions(GCN)."""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import tensorflow as tf
+from tensorflow.keras import layers
+import numpy as np
+
+from .... import function as fn
+
+# pylint: disable=W0235
+
+
+class GraphConv(layers.Layer):
+    r"""Apply graph convolution over an input signal.
+
+    Graph convolution is introduced in `GCN <https://arxiv.org/abs/1609.02907>`__
+    and can be described as below:
+
+    .. math::
+      h_i^{(l+1)} = \sigma(b^{(l)} + \sum_{j\in\mathcal{N}(i)}\frac{1}{c_{ij}}h_j^{(l)}W^{(l)})
+
+    where :math:`\mathcal{N}(i)` is the neighbor set of node :math:`i`. :math:`c_{ij}` is equal
+    to the product of the square root of node degrees:
+    :math:`\sqrt{|\mathcal{N}(i)|}\sqrt{|\mathcal{N}(j)|}`. :math:`\sigma` is an activation
+    function.
+
+    The model parameters are initialized as in the
+    `original implementation <https://github.com/tkipf/gcn/blob/master/gcn/layers.py>`__ where
+    the weight :math:`W^{(l)}` is initialized using Glorot uniform initialization
+    and the bias is initialized to be zero.
+
+    Notes
+    -----
+    Zero in degree nodes could lead to invalid normalizer. A common practice
+    to avoid this is to add a self-loop for each node in the graph, which
+    can be achieved by:
+
+    >>> g = ... # some DGLGraph
+    >>> g.add_edges(g.nodes(), g.nodes())
+
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    norm : bool, optional
+        If True, the normalizer :math:`c_{ij}` is applied. Default: ``True``.
+    bias : bool, optional
+        If True, adds a learnable bias to the output. Default: ``True``.
+    activation: callable activation function/layer or None, optional
+        If not None, applies an activation function to the updated node features.
+        Default: ``None``.
+
+    Attributes
+    ----------
+    weight : tf.Tensor
+        The learnable weight tensor.
+    bias : tf.Tensor or None
+        The learnable bias tensor, ``None`` when the layer has no bias.
+    """
+
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 norm=True,
+                 bias=True,
+                 activation=None):
+        super(GraphConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        self._norm = norm
+
+        xinit = tf.keras.initializers.glorot_uniform()
+        self.weight = tf.Variable(initial_value=xinit(
+            shape=(in_feats, out_feats), dtype='float32'), trainable=True)
+
+        if bias:
+            zeroinit = tf.keras.initializers.zeros()
+            self.bias = tf.Variable(initial_value=zeroinit(
+                shape=(out_feats,), dtype='float32'), trainable=True)
+        else:
+            # call() tests ``self.bias is not None``, so the attribute has to
+            # exist even when no bias is used
+            self.bias = None
+
+        self._activation = activation
+
+    def call(self, graph, feat):
+        r"""Compute graph convolution.
+
+        Notes
+        -----
+        * Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
+          dimensions, :math:`N` is the number of nodes.
+        * Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
+          the same shape as the input.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature
+        """
+        graph = graph.local_var()
+        if self._norm:
+            # clip degrees to at least 1 so isolated nodes do not divide by zero
+            in_degree = tf.clip_by_value(tf.cast(graph.in_degrees(), tf.float32), clip_value_min=1,
+                                         clip_value_max=np.inf)
+            norm = tf.pow(in_degree, -0.5)
+            # append singleton dims so norm broadcasts over the feature dims
+            shp = norm.shape + (1,) * (feat.ndim - 1)
+            norm = tf.reshape(norm, shp)
+            feat = feat * norm
+
+        if self._in_feats > self._out_feats:
+            # mult W first to reduce the feature size for aggregation.
+            feat = tf.matmul(feat, self.weight)
+            graph.ndata['h'] = feat
+            graph.update_all(fn.copy_src(src='h', out='m'),
+                             fn.sum(msg='m', out='h'))
+            rst = graph.ndata['h']
+        else:
+            # aggregate first then mult W
+            graph.ndata['h'] = feat
+            graph.update_all(fn.copy_src(src='h', out='m'),
+                             fn.sum(msg='m', out='h'))
+            rst = graph.ndata['h']
+            rst = tf.matmul(rst, self.weight)
+
+        if self._norm:
+            # second half of the symmetric normalization
+            rst = rst * norm
+
+        if self.bias is not None:
+            rst = rst + self.bias
+
+        if self._activation is not None:
+            rst = self._activation(rst)
+
+        return rst
+
+    def extra_repr(self):
+        """Set the extra representation of the module,
+        which will come into effect when printing the model.
+        """
+        summary = 'in={_in_feats}, out={_out_feats}'
+        summary += ', normalization={_norm}'
+        if '_activation' in self.__dict__:
+            summary += ', activation={_activation}'
+        return summary.format(**self.__dict__)
--- a/python/dgl/nn/tensorflow/conv/relgraphconv.py
+++ b/python/dgl/nn/tensorflow/conv/relgraphconv.py
+"""Tensorflow Module for Relational graph convolution layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from .... import function as fn
+from .. import utils
+
+
+class RelGraphConv(layers.Layer):
+    r"""Relational graph convolution layer.
+
+    Relational graph convolution is introduced in "`Modeling Relational Data with Graph
+    Convolutional Networks <https://arxiv.org/abs/1703.06103>`__"
+    and can be described as below:
+
+    .. math::
+
+       h_i^{(l+1)} = \sigma(\sum_{r\in\mathcal{R}}
+       \sum_{j\in\mathcal{N}^r(i)}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}+W_0^{(l)}h_i^{(l)})
+
+    where :math:`\mathcal{N}^r(i)` is the neighbor set of node :math:`i` w.r.t. relation
+    :math:`r`. :math:`c_{i,r}` is the normalizer equal
+    to :math:`|\mathcal{N}^r(i)|`. :math:`\sigma` is an activation function. :math:`W_0`
+    is the self-loop weight.
+
+    The basis regularization decomposes :math:`W_r` by:
+
+    .. math::
+
+       W_r^{(l)} = \sum_{b=1}^B a_{rb}^{(l)}V_b^{(l)}
+
+    where :math:`B` is the number of bases.
+
+    The block-diagonal-decomposition regularization decomposes :math:`W_r` into :math:`B`
+    number of block diagonal matrices. We refer :math:`B` as the number of bases.
+
+    Parameters
+    ----------
+    in_feat : int
+        Input feature size.
+    out_feat : int
+        Output feature size.
+    num_rels : int
+        Number of relations.
+    regularizer : str
+        Which weight regularizer to use "basis" or "bdd"
+    num_bases : int, optional
+        Number of bases. If is none, use number of relations. Default: None.
+    bias : bool, optional
+        True if bias is added. Default: True
+    activation : callable, optional
+        Activation function. Default: None
+    self_loop : bool, optional
+        True to include self loop message. Default: False
+    dropout : float, optional
+        Dropout rate. Default: 0.0
+    """
+
+    def __init__(self,
+                 in_feat,
+                 out_feat,
+                 num_rels,
+                 regularizer="basis",
+                 num_bases=None,
+                 bias=True,
+                 activation=None,
+                 self_loop=False,
+                 dropout=0.0):
+        super(RelGraphConv, self).__init__()
+        self.in_feat = in_feat
+        self.out_feat = out_feat
+        self.num_rels = num_rels
+        self.regularizer = regularizer
+        self.num_bases = num_bases
+        # a missing or out-of-range number of bases falls back to num_rels
+        if self.num_bases is None or self.num_bases > self.num_rels or self.num_bases < 0:
+            self.num_bases = self.num_rels
+        self.bias = bias
+        self.activation = activation
+        self.self_loop = self_loop
+
+        xinit = tf.keras.initializers.glorot_uniform()
+        zeroinit = tf.keras.initializers.zeros()
+
+        if regularizer == "basis":
+            # add basis weights
+            self.weight = tf.Variable(initial_value=xinit(
+                shape=(self.num_bases, self.in_feat, self.out_feat),
+                dtype='float32'), trainable=True)
+            if self.num_bases < self.num_rels:
+                # linear combination coefficients
+                self.w_comp = tf.Variable(initial_value=xinit(
+                    shape=(self.num_rels, self.num_bases), dtype='float32'), trainable=True)
+            # message func
+            self.message_func = self.basis_message_func
+        elif regularizer == "bdd":
+            # Validate against the normalized self.num_bases: it is the value
+            # used to size the blocks below, and the raw argument may be None.
+            if in_feat % self.num_bases != 0 or out_feat % self.num_bases != 0:
+                raise ValueError(
+                    'Feature size must be a multiplier of num_bases.')
+            # add block diagonal weights
+            self.submat_in = in_feat // self.num_bases
+            self.submat_out = out_feat // self.num_bases
+
+            # in_feat and out_feat are both divisible by num_bases here
+            self.weight = tf.Variable(initial_value=xinit(
+                shape=(self.num_rels, self.num_bases *
+                       self.submat_in * self.submat_out),
+                dtype='float32'), trainable=True)
+            # message func
+            self.message_func = self.bdd_message_func
+        else:
+            raise ValueError("Regularizer must be either 'basis' or 'bdd'")
+
+        # bias
+        if self.bias:
+            self.h_bias = tf.Variable(initial_value=zeroinit(
+                shape=(out_feat,), dtype='float32'), trainable=True)
+
+        # weight for self loop
+        if self.self_loop:
+            self.loop_weight = tf.Variable(initial_value=xinit(
+                shape=(in_feat, out_feat), dtype='float32'), trainable=True)
+
+        self.dropout = layers.Dropout(rate=dropout)
+
+    def basis_message_func(self, edges):
+        """Message function for basis regularizer"""
+        if self.num_bases < self.num_rels:
+            # generate all weights from bases
+            weight = tf.reshape(self.weight, (self.num_bases,
+                                              self.in_feat * self.out_feat))
+            weight = tf.reshape(tf.matmul(self.w_comp, weight), (
+                self.num_rels, self.in_feat, self.out_feat))
+        else:
+            # one basis per relation: the bases are the relation weights
+            weight = self.weight
+
+        msg = utils.bmm_maybe_select(
+            edges.src['h'], weight, edges.data['type'])
+        if 'norm' in edges.data:
+            msg = msg * edges.data['norm']
+        return {'msg': msg}
+
+    def bdd_message_func(self, edges):
+        """Message function for block-diagonal-decomposition regularizer"""
+        if ((edges.src['h'].dtype == tf.int64) and
+                len(edges.src['h'].shape) == 1):
+            raise TypeError(
+                'Block decomposition does not allow integer ID feature.')
+        # gather the weights of each edge's relation and view them as a
+        # stack of (submat_in, submat_out) blocks
+        weight = tf.reshape(tf.gather(
+            self.weight, edges.data['type']), (-1, self.submat_in, self.submat_out))
+        node = tf.reshape(edges.src['h'], (-1, 1, self.submat_in))
+        msg = tf.reshape(tf.matmul(node, weight), (-1, self.out_feat))
+        if 'norm' in edges.data:
+            msg = msg * edges.data['norm']
+        return {'msg': msg}
+
+    def call(self, g, x, etypes, norm=None):
+        """ Forward computation
+
+        Parameters
+        ----------
+        g : DGLGraph
+            The graph.
+        x : tf.Tensor
+            Input node features. Could be either
+                * :math:`(|V|, D)` dense tensor
+                * :math:`(|V|,)` int64 vector, representing the categorical values of each
+                  node. We then treat the input feature as an one-hot encoding feature.
+        etypes : tf.Tensor
+            Edge type tensor. Shape: :math:`(|E|,)`
+        norm : tf.Tensor
+            Optional edge normalizer tensor. Shape: :math:`(|E|, 1)`
+
+        Returns
+        -------
+        tf.Tensor
+            New node features.
+        """
+        g = g.local_var()
+        g.ndata['h'] = x
+        g.edata['type'] = tf.cast(etypes, tf.int64)
+        if norm is not None:
+            g.edata['norm'] = norm
+        if self.self_loop:
+            # computed from the input features, before message passing
+            loop_message = utils.matmul_maybe_select(x, self.loop_weight)
+        # message passing
+        g.update_all(self.message_func, fn.sum(msg='msg', out='h'))
+        # apply bias and activation
+        node_repr = g.ndata['h']
+        if self.bias:
+            node_repr = node_repr + self.h_bias
+        if self.self_loop:
+            node_repr = node_repr + loop_message
+        if self.activation:
+            node_repr = self.activation(node_repr)
+        node_repr = self.dropout(node_repr)
+        return node_repr
--- a/python/dgl/nn/tensorflow/conv/sageconv.py
+++ b/python/dgl/nn/tensorflow/conv/sageconv.py
+"""Tensorflow Module for GraphSAGE layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from .... import function as fn
+
+
+class SAGEConv(layers.Layer):
+    r"""GraphSAGE layer from paper `Inductive Representation Learning on
+    Large Graphs <https://arxiv.org/pdf/1706.02216.pdf>`__.
+
+    .. math::
+        h_{\mathcal{N}(i)}^{(l+1)} & = \mathrm{aggregate}
+        \left(\{h_{j}^{l}, \forall j \in \mathcal{N}(i) \}\right)
+
+        h_{i}^{(l+1)} & = \sigma \left(W \cdot \mathrm{concat}
+        (h_{i}^{l}, h_{\mathcal{N}(i)}^{(l+1)}) + b \right)
+
+        h_{i}^{(l+1)} & = \mathrm{norm}(h_{i}^{(l+1)})
+
+    With the ``gcn`` aggregator the node's own feature is folded into the
+    neighborhood average and no separate self transformation is applied.
+
+    Parameters
+    ----------
+    in_feats : int
+        Input feature size.
+    out_feats : int
+        Output feature size.
+    aggregator_type : str
+        Aggregator type to use (``mean``, ``gcn``, ``pool``, ``lstm``).
+        An unknown value is only rejected (``KeyError``) when the layer is called.
+    feat_drop : float
+        Dropout rate on features, default: ``0``.
+    bias : bool
+        If True, adds a learnable bias to the output. Default: ``True``.
+    norm : callable activation function/layer or None, optional
+        If not None, applies normalization to the updated node features
+        (after the activation).
+    activation : callable activation function/layer or None, optional
+        If not None, applies an activation function to the updated node features.
+        Default: ``None``.
+    """
+
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 aggregator_type,
+                 feat_drop=0.,
+                 bias=True,
+                 norm=None,
+                 activation=None):
+        super(SAGEConv, self).__init__()
+        self._in_feats = in_feats
+        self._out_feats = out_feats
+        self._aggre_type = aggregator_type
+        self.norm = norm
+        self.feat_drop = layers.Dropout(feat_drop)
+        self.activation = activation
+        # aggregator type: mean/pool/lstm/gcn
+        if aggregator_type == 'pool':
+            self.fc_pool = layers.Dense(in_feats)
+        if aggregator_type == 'lstm':
+            self.lstm = layers.LSTM(units=in_feats)
+        # the gcn aggregator has no separate transformation of the node itself
+        if aggregator_type != 'gcn':
+            self.fc_self = layers.Dense(out_feats, use_bias=bias)
+        self.fc_neigh = layers.Dense(out_feats, use_bias=bias)
+
+    def _lstm_reducer(self, nodes):
+        """LSTM reducer
+        NOTE(zihao): lstm reducer with default schedule (degree bucketing)
+        is slow, we could accelerate this with degree padding in the future.
+        """
+        m = nodes.mailbox['m']  # (B, L, D)
+        rst = self.lstm(m)
+        return {'neigh': rst}
+
+    def call(self, graph, feat):
+        r"""Compute GraphSAGE layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is size of output feature.
+
+        Raises
+        ------
+        KeyError
+            If the aggregator type given at construction is not recognized.
+        """
+        graph = graph.local_var()
+        feat = self.feat_drop(feat)
+        h_self = feat
+        if self._aggre_type == 'mean':
+            graph.ndata['h'] = feat
+            graph.update_all(fn.copy_src('h', 'm'), fn.mean('m', 'neigh'))
+            h_neigh = graph.ndata['neigh']
+        elif self._aggre_type == 'gcn':
+            graph.ndata['h'] = feat
+            graph.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'neigh'))
+            # divide by (in_degree + 1); the +1 accounts for the node itself.
+            # Cast to the feature dtype (not a fixed float32) so that features
+            # of another float precision can be divided without a dtype error.
+            degs = tf.cast(graph.in_degrees(), feat.dtype)
+            h_neigh = (graph.ndata['neigh'] + graph.ndata['h']
+                       ) / (tf.expand_dims(degs, -1) + 1)
+        elif self._aggre_type == 'pool':
+            graph.ndata['h'] = tf.nn.relu(self.fc_pool(feat))
+            graph.update_all(fn.copy_src('h', 'm'), fn.max('m', 'neigh'))
+            h_neigh = graph.ndata['neigh']
+        elif self._aggre_type == 'lstm':
+            graph.ndata['h'] = feat
+            graph.update_all(fn.copy_src('h', 'm'), self._lstm_reducer)
+            h_neigh = graph.ndata['neigh']
+        else:
+            raise KeyError(
+                'Aggregator type {} not recognized.'.format(self._aggre_type))
+        # GraphSAGE GCN does not require fc_self.
+        if self._aggre_type == 'gcn':
+            rst = self.fc_neigh(h_neigh)
+        else:
+            rst = self.fc_self(h_self) + self.fc_neigh(h_neigh)
+        # activation
+        if self.activation is not None:
+            rst = self.activation(rst)
+        # normalization
+        if self.norm is not None:
+            rst = self.norm(rst)
+        return rst
--- a/python/dgl/nn/tensorflow/conv/sgconv.py
+++ b/python/dgl/nn/tensorflow/conv/sgconv.py
+"""tf Module for Simplifying Graph Convolution layer"""
+# pylint: disable= no-member, arguments-differ, invalid-name, W0613
+import tensorflow as tf
+from tensorflow.keras import layers
+import numpy as np
+
+from .... import function as fn
+
+
+class SGConv(layers.Layer):
+    r"""Simplifying Graph Convolution layer from paper `Simplifying Graph
+    Convolutional Networks <https://arxiv.org/pdf/1902.07153.pdf>`__.
+
+    .. math::
+        H^{l+1} = (\hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2})^K H^{l} \Theta^{l}
+
+    Parameters
+    ----------
+    in_feats : int
+        Number of input features.
+    out_feats : int
+        Number of output features.
+    k : int
+        Number of hops :math:`K`. Defaults:``1``.
+    cached : bool
+        If True, the module would cache the propagated (and, if ``norm`` is
+        given, normalized) features
+
+        .. math::
+            (\hat{D}^{-\frac{1}{2}}\hat{A}\hat{D}^{-\frac{1}{2}})^K X
+
+        at the first forward call; the linear map :math:`\Theta` is still
+        applied on every call. This parameter should only be set to
+        ``True`` in Transductive Learning setting.
+    bias : bool
+        If True, adds a learnable bias to the output. Default: ``True``.
+    norm : callable activation function/layer or None, optional
+        If not None, applies normalization to the updated node features.
+    """
+
+    def __init__(self,
+                 in_feats,
+                 out_feats,
+                 k=1,
+                 cached=False,
+                 bias=True,
+                 norm=None):
+        super(SGConv, self).__init__()
+        self.fc = layers.Dense(out_feats, use_bias=bias)
+        self._cached = cached
+        # filled on the first call when ``cached`` is True
+        self._cached_h = None
+        self._k = k
+        self.norm = norm
+
+    def call(self, graph, feat):
+        r"""Compute Simplifying Graph Convolution layer.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
+            is size of input feature, :math:`N` is the number of nodes.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
+            is size of output feature.
+
+        Notes
+        -----
+        If ``cached`` is set to True, ``feat`` and ``graph`` should not change during
+        training, or you will get wrong results: once the cache is filled both
+        arguments are ignored and the cached features are used instead.
+        """
+        graph = graph.local_var()
+        if self._cached_h is not None:
+            feat = self._cached_h
+        else:
+            # compute normalization D^{-1/2}; degrees are clamped to >= 1 so
+            # that nodes without incoming edges do not yield an infinite norm.
+            # Cast to the feature dtype (not a fixed float32) so features of
+            # another float precision can be multiplied without a dtype error.
+            degs = tf.clip_by_value(tf.cast(
+                graph.in_degrees(), feat.dtype), clip_value_min=1, clip_value_max=np.inf)
+            norm = tf.pow(degs, -0.5)
+            norm = tf.expand_dims(norm, 1)
+            # compute (D^-1/2 A D^-1/2)^k X, one hop per iteration
+            for _ in range(self._k):
+                feat = feat * norm
+                graph.ndata['h'] = feat
+                graph.update_all(fn.copy_u('h', 'm'),
+                                 fn.sum('m', 'h'))
+                feat = graph.ndata.pop('h')
+                feat = feat * norm
+
+            if self.norm is not None:
+                feat = self.norm(feat)
+
+            # cache feature
+            if self._cached:
+                self._cached_h = feat
+        return self.fc(feat)
--- a/python/dgl/nn/tensorflow/glob.py
+++ b/python/dgl/nn/tensorflow/glob.py
+"""Tensorflow modules for graph global pooling."""
+# pylint: disable= no-member, arguments-differ, invalid-name, W0235
+import tensorflow as tf
+from tensorflow.keras import layers
+
+
+from ... import BatchedDGLGraph
+from ...batched_graph import sum_nodes, mean_nodes, max_nodes, \
+    softmax_nodes, topk_nodes
+
+
+__all__ = ['SumPooling', 'AvgPooling',
+           'MaxPooling', 'SortPooling', 'WeightAndSum', 'GlobalAttentionPooling']
+
+
+class SumPooling(layers.Layer):
+    r"""Readout layer that sums the node features of each graph.
+
+    .. math::
+        r^{(i)} = \sum_{k=1}^{N_i} x^{(i)}_k
+    """
+
+    def __init__(self):
+        super(SumPooling, self).__init__()
+
+    def call(self, graph, feat):
+        r"""Compute sum pooling.
+
+        Parameters
+        ----------
+        graph : DGLGraph or BatchedDGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature with shape :math:`(N, *)` where
+            :math:`N` is the number of nodes in the graph.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature with shape :math:`(*)` (if
+            input graph is a BatchedDGLGraph, the result shape
+            would be :math:`(B, *)`).
+        """
+        # the feature is attached under a temporary key inside a local scope
+        with graph.local_scope():
+            graph.ndata['h'] = feat
+            return sum_nodes(graph, 'h')
+
+
+class AvgPooling(layers.Layer):
+    r"""Readout layer that averages the node features of each graph.
+
+    .. math::
+        r^{(i)} = \frac{1}{N_i}\sum_{k=1}^{N_i} x^{(i)}_k
+    """
+
+    def __init__(self):
+        super(AvgPooling, self).__init__()
+
+    def call(self, graph, feat):
+        r"""Compute average pooling.
+
+        Parameters
+        ----------
+        graph : DGLGraph or BatchedDGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature with shape :math:`(N, *)` where
+            :math:`N` is the number of nodes in the graph.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature with shape :math:`(*)` (if
+            input graph is a BatchedDGLGraph, the result shape
+            would be :math:`(B, *)`).
+        """
+        # the feature is attached under a temporary key inside a local scope
+        with graph.local_scope():
+            graph.ndata['h'] = feat
+            return mean_nodes(graph, 'h')
+
+
+class MaxPooling(layers.Layer):
+    r"""Readout layer that takes the element-wise maximum over the nodes of each graph.
+
+    .. math::
+        r^{(i)} = \max_{k=1}^{N_i}\left( x^{(i)}_k \right)
+    """
+
+    def __init__(self):
+        super(MaxPooling, self).__init__()
+
+    def call(self, graph, feat):
+        r"""Compute max pooling.
+
+        Parameters
+        ----------
+        graph : DGLGraph or BatchedDGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature with shape :math:`(N, *)` where
+            :math:`N` is the number of nodes in the graph.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature with shape :math:`(*)` (if
+            input graph is a BatchedDGLGraph, the result shape
+            would be :math:`(B, *)`).
+        """
+        # the feature is attached under a temporary key inside a local scope
+        with graph.local_scope():
+            graph.ndata['h'] = feat
+            return max_nodes(graph, 'h')
+
+
+class SortPooling(layers.Layer):
+    r"""Sort Pooling readout (`An End-to-End Deep Learning Architecture for Graph Classification
+    <https://www.cse.wustl.edu/~ychen/public/DGCNN.pdf>`__) over the nodes in the graph.
+
+    Parameters
+    ----------
+    k : int
+        The number of nodes to hold for each graph.
+    """
+
+    def __init__(self, k):
+        super(SortPooling, self).__init__()
+        self.k = k
+
+    def call(self, graph, feat):
+        r"""Compute sort pooling.
+
+        Parameters
+        ----------
+        graph : DGLGraph or BatchedDGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature with shape :math:`(N, D)` where
+            :math:`N` is the number of nodes in the graph.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature with shape :math:`(k * D)` (if
+            input graph is a BatchedDGLGraph, the result shape
+            would be :math:`(B, k * D)`).
+        """
+        with graph.local_scope():
+            # Sort the feature of each node in ascending order.
+            sorted_feat = tf.sort(feat, -1)
+            graph.ndata['h'] = sorted_feat
+            # Sort nodes according to their last features and keep k of them.
+            kept = topk_nodes(graph, 'h', self.k, idx=-1)[0]
+            # flatten the k kept nodes of every graph into a single row
+            flat = tf.reshape(kept, (-1, self.k * sorted_feat.shape[-1]))
+            if not isinstance(graph, BatchedDGLGraph):
+                # a single graph has no batch axis in its readout
+                return tf.squeeze(flat, 0)
+            return flat
+
+
+class GlobalAttentionPooling(layers.Layer):
+    r"""Global Attention Pooling readout (`Gated Graph Sequence Neural Networks
+    <https://arxiv.org/abs/1511.05493.pdf>`__) over the nodes in the graph.
+
+    .. math::
+        r^{(i)} = \sum_{k=1}^{N_i}\mathrm{softmax}\left(f_{gate}
+        \left(x^{(i)}_k\right)\right) f_{feat}\left(x^{(i)}_k\right)
+
+    Parameters
+    ----------
+    gate_nn : tf.keras.layers.Layer
+        A neural network that computes attention scores for each feature.
+        Its output must have size 1 on the last axis.
+    feat_nn : tf.keras.layers.Layer, optional
+        A neural network applied to each feature before combining them
+        with attention scores.
+    """
+
+    def __init__(self, gate_nn, feat_nn=None):
+        super(GlobalAttentionPooling, self).__init__()
+        self.gate_nn = gate_nn
+        self.feat_nn = feat_nn
+
+    def call(self, graph, feat):
+        r"""Compute global attention pooling.
+
+        Parameters
+        ----------
+        graph : DGLGraph
+            The graph.
+        feat : tf.Tensor
+            The input feature with shape :math:`(N, D)` where
+            :math:`N` is the number of nodes in the graph.
+
+        Returns
+        -------
+        tf.Tensor
+            The output feature with shape :math:`(D)` (if
+            input graph is a BatchedDGLGraph, the result shape
+            would be :math:`(B, D)`).
+        """
+        with graph.local_scope():
+            # one raw attention score per node
+            scores = self.gate_nn(feat)
+            assert scores.shape[-1] == 1, "The output of gate_nn should have size 1 at the last axis."
+            if self.feat_nn:
+                feat = self.feat_nn(feat)
+
+            # normalize the scores over the nodes of each graph
+            graph.ndata['gate'] = scores
+            attention = softmax_nodes(graph, 'gate')
+            graph.ndata.pop('gate')
+
+            # attention-weighted sum of the node features
+            graph.ndata['r'] = feat * attention
+            pooled = sum_nodes(graph, 'r')
+            graph.ndata.pop('r')
+
+            return pooled
+
+
+class WeightAndSum(layers.Layer):
+    """Compute importance weights for atoms and perform a weighted sum.
+
+    Each atom gets a scalar weight ``sigmoid(Dense(1)(feature))``; the readout
+    of a molecule is the sum of its atom features scaled by those weights.
+
+    Parameters
+    ----------
+    in_feats : int
+        Input atom feature size
+    """
+
+    def __init__(self, in_feats):
+        super(WeightAndSum, self).__init__()
+        self.in_feats = in_feats
+        # Sequential takes its layers as ONE list argument. Passing the two
+        # layers as separate positional arguments puts the Activation into
+        # the ``name`` slot, so the sigmoid would never be part of the model.
+        self.atom_weighting = tf.keras.Sequential([
+            layers.Dense(1),
+            layers.Activation(tf.nn.sigmoid)
+        ])
+
+    def call(self, bg, feats):
+        """Compute molecule representations out of atom representations
+
+        Parameters
+        ----------
+        bg : BatchedDGLGraph
+            B Batched DGLGraphs for processing multiple molecules in parallel
+        feats : FloatTensor of shape (N, self.in_feats)
+            Representations for all atoms in the molecules
+            * N is the total number of atoms in all molecules
+
+        Returns
+        -------
+        FloatTensor of shape (B, self.in_feats)
+            Representations for B molecules
+        """
+        with bg.local_scope():
+            bg.ndata['h'] = feats
+            # per-atom scalar weight, used as the weight field of the sum
+            bg.ndata['w'] = self.atom_weighting(bg.ndata['h'])
+            h_g_sum = sum_nodes(bg, 'h', 'w')
+
+        return h_g_sum
--- a/python/dgl/nn/tensorflow/softmax.py
+++ b/python/dgl/nn/tensorflow/softmax.py
+"""tf modules for graph related softmax."""
+# pylint: disable= no-member, arguments-differ
+import tensorflow as tf
+
+from ... import function as fn
+from ...base import ALL, is_all
+
+__all__ = ['edge_softmax']
+
+
+def edge_softmax_real(graph, score, eids=ALL):
+    """Edge Softmax function
+
+    Normalizes ``score`` with a softmax over the edges that share the same
+    destination node, and returns the matching gradient function so the pair
+    can be consumed by ``tf.custom_gradient``.
+
+    Parameters
+    ----------
+    graph : DGLGraph
+        The graph whose edges carry the scores.
+    score : tf.Tensor
+        The scores to normalize, one entry per (selected) edge.
+    eids : tensor of edge ids or ALL, optional
+        If not ALL, the softmax is computed on the edge subgraph induced by
+        these edge ids.
+
+    Returns
+    -------
+    (tf.Tensor, callable)
+        The normalized scores and a function mapping the gradient w.r.t. the
+        output to the gradient w.r.t. ``score``.
+    """
+    if not is_all(eids):
+        graph = graph.edge_subgraph(tf.cast(eids, tf.int64))
+    # work on a local view -- presumably so the temporary fields written below
+    # do not show up on the caller's graph
+    g = graph.local_var()
+    g.edata['s'] = score
+    # subtract the per-destination maximum before exponentiating
+    g.update_all(fn.copy_e('s', 'm'), fn.max('m', 'smax'))
+    g.apply_edges(fn.e_sub_v('s', 'smax', 'out'))
+    g.edata['out'] = tf.math.exp(g.edata['out'])
+    # divide every edge by the sum over the edges of its destination node
+    g.update_all(fn.copy_e('out', 'm'), fn.sum('m', 'out_sum'))
+    g.apply_edges(fn.e_div_v('out', 'out_sum', 'out'))
+    out = g.edata['out']
+
+    def edge_softmax_backward(grad_out):
+        """Return out * grad_out - out * sum_dst(out * grad_out)."""
+        g = graph.local_var()
+        # a second local view is used for the backward computation
+        g.edata['out'] = out
+        g.edata['grad_s'] = out * grad_out
+        # accum: per-destination sum of out * grad_out
+        g.update_all(fn.copy_e('grad_s', 'm'), fn.sum('m', 'accum'))
+        g.apply_edges(fn.e_mul_v('out', 'accum', 'out'))
+        grad_score = g.edata['grad_s'] - g.edata['out']
+        return grad_score
+
+    return out, edge_softmax_backward
+
+
+def edge_softmax(graph, logits, eids=ALL):
+    """Compute edge softmax with a hand-written gradient.
+
+    ``graph`` and ``eids`` are captured by a closure so that only ``logits``
+    is seen as a differentiable input by ``tf.custom_gradient``.
+
+    Parameters
+    ----------
+    graph : DGLGraph
+        The graph to perform edge softmax on.
+    logits : tf.Tensor
+        The input edge scores.
+    eids : tensor of edge ids or ALL, optional
+        Edges to restrict the softmax to. Default: all edges.
+
+    Returns
+    -------
+    tf.Tensor
+        The normalized edge scores.
+    """
+
+    @tf.custom_gradient
+    def _lambda(score):
+        # returns (output, gradient function) as custom_gradient expects
+        return edge_softmax_real(graph, score, eids=eids)
+
+    normalized = _lambda(logits)
+    return normalized
--- a/python/dgl/nn/tensorflow/utils.py
+++ b/python/dgl/nn/tensorflow/utils.py
+"""Utilities for tf NN package"""
+# pylint: disable=no-member, invalid-name
+from tensorflow.keras import layers  # pylint: disable=W0235
+import tensorflow as tf
+
+
+def matmul_maybe_select(A, B):
+    """Compute C = A * B, where A may also be given as an integer id vector.
+
+    An int64 vector A is interpreted as the row indices of a one-hot matrix.
+    Multiplying a one-hot matrix by B just picks rows of B, so in that case
+    the dense multiply is replaced by an index lookup.
+
+    For example,
+    ::
+
+        A = [2, 0, 1],
+        B = [[0.1, 0.2],
+             [0.3, 0.4],
+             [0.5, 0.6]]
+
+    then matmul_maybe_select(A, B) is equivalent to
+    ::
+
+        [[0, 0, 1],     [[0.1, 0.2],
+         [1, 0, 0],  *   [0.3, 0.4],
+         [0, 1, 0]]      [0.5, 0.6]]
+
+    Any other A goes through a regular matmul.
+
+    Parameters
+    ----------
+    A : tf.Tensor
+        lhs tensor
+    B : tf.Tensor
+        rhs tensor
+
+    Returns
+    -------
+    C : tf.Tensor
+        result tensor
+    """
+    is_id_vector = A.dtype == tf.int64 and len(A.shape) == 1
+    if not is_id_vector:
+        return tf.matmul(A, B)
+    # one-hot(A) @ B == rows of B selected by A
+    return tf.gather(B, A)
+
+
+def bmm_maybe_select(A, B, index):
+    """Slice submatrices of B by the given index and perform bmm.
+
+    B is a 3D tensor of shape (N, D1, D2), which can be viewed as a stack of
+    N matrices of shape (D1, D2). The input index is an integer vector of length M.
+    A could be either:
+    (1) a dense tensor of shape (M, D1),
+    (2) an integer vector of length M.
+    The result C is a 2D matrix of shape (M, D2)
+
+    For case (1), C is computed by bmm:
+    ::
+
+        C[i, :] = matmul(A[i, :], B[index[i], :, :])
+
+    For case (2), C is computed by index select:
+    ::
+
+        C[i, :] = B[index[i], A[i], :]
+
+    Parameters
+    ----------
+    A : tf.Tensor
+        lhs tensor
+    B : tf.Tensor
+        rhs tensor
+    index : tf.Tensor
+        index tensor
+
+    Returns
+    -------
+    C : tf.Tensor
+        return tensor
+    """
+    if A.dtype == tf.int64 and len(A.shape) == 1:
+        # following is a faster version of B[index, A, :]
+        # Read D1 and D2 BEFORE flattening: row (n, a) of the (N, D1, D2)
+        # stack lives at flat row n * D1 + a, so the stride must be D1 (the
+        # second axis of the flattened tensor would be D2 instead).
+        num_rows, out_dim = B.shape[1], B.shape[2]
+        flat_B = tf.reshape(B, (-1, out_dim))
+        flatidx = index * num_rows + A
+        return tf.gather(flat_B, flatidx)
+    BB = tf.gather(B, index)
+    # (M, 1, D1) x (M, D1, D2) -> (M, 1, D2); drop only the inserted axis so
+    # that M == 1 or D2 == 1 still yields an (M, D2) result
+    return tf.squeeze(tf.matmul(tf.expand_dims(A, 1), BB), axis=1)
+
+
+class Identity(layers.Layer):
+    """Pass-through layer: a placeholder operator that leaves its input untouched.
+    """
+
+    def call(self, x):
+        """Return ``x`` as is."""
+        return x
--- a/tests/backend/tensorflow/__init__.py
+++ b/tests/backend/tensorflow/__init__.py
@@ -18,8 +18,8 @@ def array_equal(a, b):


 def allclose(a, b, rtol=1e-4, atol=1e-4):
-    return np.allclose(a.numpy(),
-                       b.numpy(), rtol=rtol, atol=atol)
+    return np.allclose(tf.convert_to_tensor(a).numpy(),
+                       tf.convert_to_tensor(b).numpy(), rtol=rtol, atol=atol)


 def randn(shape):

--- a/tests/scripts/task_unit_test.sh
+++ b/tests/scripts/task_unit_test.sh
@@ -23,6 +23,13 @@ export PYTHONPATH=tests:${PWD}/python:$PYTHONPATH
 export DGL_DOWNLOAD_DIR=${PWD}
 export TF_FORCE_GPU_ALLOW_GROWTH=true

+if [ "${2:-}" = "gpu" ]  # quoted + defaulted: a missing 2nd argument selects the CPU branch without a test error
+then
+  export CUDA_VISIBLE_DEVICES=0
+else
+  export CUDA_VISIBLE_DEVICES=-1
+fi
+
 conda activate ${DGLBACKEND}-ci

 python3 -m pytest -v --junitxml=pytest_compute.xml tests/compute || fail "compute"

--- a/tests/tensorflow/test_nn.py
+++ b/tests/tensorflow/test_nn.py
+import tensorflow as tf
+from tensorflow.keras import layers
+import networkx as nx
+import dgl
+import dgl.nn.tensorflow as nn
+import dgl.function as fn
+import backend as F
+from copy import deepcopy
+
+import numpy as np
+import scipy as sp
+
+def _AXWb(A, X, W, b):
+    """Reference computation ``A (X W) + b`` used to check GraphConv.
+
+    Trailing feature axes of ``X W`` are flattened for the product with ``A``
+    and restored afterwards.
+    """
+    XW = tf.matmul(X, W)
+    flat = tf.reshape(XW, (XW.shape[0], -1))
+    propagated = tf.reshape(tf.matmul(A, flat), XW.shape)
+    return propagated + b
+
+def test_graph_conv():
+    """Check GraphConv against a dense reference and that it leaves the graph clean."""
+    g = dgl.DGLGraph(nx.path_graph(3))
+    ctx = F.ctx()
+    adj = tf.sparse.to_dense(tf.sparse.reorder(g.adjacency_matrix(ctx=ctx)))
+
+    # without normalization the layer must equal A (X W) + b
+    conv = nn.GraphConv(5, 2, norm=False, bias=True)
+    print(conv)
+    # test#1: basic
+    h0 = F.ones((3, 5))
+    h1 = conv(g, h0)
+    assert len(g.ndata) == 0
+    assert len(g.edata) == 0
+    assert F.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias))
+    # test#2: more-dim
+    h0 = F.ones((3, 5, 5))
+    h1 = conv(g, h0)
+    assert len(g.ndata) == 0
+    assert len(g.edata) == 0
+    assert F.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias))
+
+    # default arguments: only check that the forward pass runs and that no
+    # temporary feature is left on the graph (this section used to be
+    # repeated verbatim; one copy is enough)
+    conv = nn.GraphConv(5, 2)
+    # test#3: basic
+    h0 = F.ones((3, 5))
+    h1 = conv(g, h0)
+    assert len(g.ndata) == 0
+    assert len(g.edata) == 0
+    # test#4: more-dim
+    h0 = F.ones((3, 5, 5))
+    h1 = conv(g, h0)
+    assert len(g.ndata) == 0
+    assert len(g.edata) == 0
+
+    # TODO: a reset_parameters check is not enabled here; the disabled draft was:
+    # old_weight = deepcopy(conv.weight.data)
+    # conv.reset_parameters()
+    # new_weight = conv.weight.data
+    # assert not F.allclose(old_weight, new_weight)
+
+def _S2AXWb(A, N, X, W, b):
+    # NOTE(review): this helper uses the name `th`, which is not among the
+    # imports at the top of this test module, and none of the tests visible in
+    # this file call it -- presumably a leftover from another backend's test
+    # suite. Calling it as is would raise NameError; port it or remove it.
+    #
+    # Intended computation, as far as the code shows: one and two rounds of
+    # N * (A @ (N * X)), concatenated with X on the last axis, times W, plus b.
+    X1 = X * N
+    X1 = th.matmul(A, X1.view(X1.shape[0], -1))
+    X1 = X1 * N
+    X2 = X1 * N
+    X2 = th.matmul(A, X2.view(X2.shape[0], -1))
+    X2 = X2 * N
+    X = th.cat([X, X1, X2], dim=-1)
+    Y = th.matmul(X, W.rot90())
+
+    return Y + b
+
+def test_simple_pool():
+    """Compare sum/avg/max pooling with plain reductions; check SortPooling shapes."""
+    ctx = F.ctx()
+    g = dgl.DGLGraph(nx.path_graph(15))
+
+    sum_pool = nn.SumPooling()
+    avg_pool = nn.AvgPooling()
+    max_pool = nn.MaxPooling()
+    sort_pool = nn.SortPooling(10) # k = 10
+    print(sum_pool, avg_pool, max_pool, sort_pool)
+    # every readout layer paired with the backend reduction it must reproduce
+    pool_and_reduce = ((sum_pool, F.sum), (avg_pool, F.mean), (max_pool, F.max))
+
+    # test#1: basic
+    h0 = F.randn((g.number_of_nodes(), 5))
+    for pool, reduce_fn in pool_and_reduce:
+        assert F.allclose(pool(g, h0), reduce_fn(h0, 0))
+    h1 = sort_pool(g, h0)
+    assert h1.shape[0] == 10 * 5 and h1.ndim == 1
+
+    # test#2: batched graph
+    g_ = dgl.DGLGraph(nx.path_graph(5))
+    bg = dgl.batch([g, g_, g, g_, g])
+    h0 = F.randn((bg.number_of_nodes(), 5))
+    # node ranges of the five member graphs (15, 5, 15, 5 and 15 nodes)
+    bounds = [(0, 15), (15, 20), (20, 35), (35, 40), (40, 55)]
+    for pool, reduce_fn in pool_and_reduce:
+        h1 = pool(bg, h0)
+        truth = tf.stack([reduce_fn(h0[lo:hi], 0) for lo, hi in bounds], 0)
+        assert F.allclose(h1, truth)
+
+    h1 = sort_pool(bg, h0)
+    assert h1.shape[0] == 5 and h1.shape[1] == 10 * 5 and h1.ndim == 2
+
+def uniform_attention(g, shape):
+    """Return a tensor of ``shape`` holding 1 / in_degree(dst) for every edge.
+
+    This is the value an edge softmax yields when all scores are equal.
+    """
+    # one entry per edge, broadcastable over the remaining axes of `shape`
+    broadcast_shape = (g.number_of_edges(),) + (1,) * (len(shape) - 1)
+    numerator = F.ones(shape)
+    dst_nodes = g.edges()[1]
+    dst_in_degrees = tf.reshape(g.in_degrees(dst_nodes), broadcast_shape)
+    return numerator / tf.cast(dst_in_degrees, tf.float32)
+
+def test_edge_softmax():
+    """Check edge_softmax values and gradients against reference computations."""
+    # Basic
+    # equal scores must give 1 / in_degree(dst) on every edge
+    g = dgl.DGLGraph(nx.path_graph(3))
+    edata = F.ones((g.number_of_edges(), 1))
+    a = nn.edge_softmax(g, edata)
+    assert len(g.ndata) == 0
+    assert len(g.edata) == 0
+    assert F.allclose(a, uniform_attention(g, a.shape))
+
+    # Test higher dimension case
+    edata = F.ones((g.number_of_edges(), 3, 1))
+    a = nn.edge_softmax(g, edata)
+    assert len(g.ndata) == 0
+    assert len(g.edata) == 0
+    assert F.allclose(a, uniform_attention(g, a.shape))
+
+    # Test both forward and backward with Tensorflow built-in softmax.
+    g = dgl.DGLGraph()
+    g.add_nodes(30)
+    # build a complete graph
+    # edges are added row by row, so edge (i, j) has id i * 30 + j
+    for i in range(30):
+        for j in range(30):
+            g.add_edge(i, j)
+
+    
+    score = F.randn((900, 1))
+    with tf.GradientTape() as tape:
+        tape.watch(score)
+        # NOTE: `grad` is not used below
+        grad = F.randn((900, 1))
+        # viewed as (src, dst), a softmax along axis 0 normalizes over the
+        # incoming edges of every destination node
+        y = tf.reshape(F.softmax(tf.reshape(score,(30, 30)), dim=0), (-1, 1))
+        grads = tape.gradient(y, [score])
+        grad_score = grads[0]
+
+    with tf.GradientTape() as tape:
+        tape.watch(score)
+        y_dgl = nn.edge_softmax(g, score)
+        assert len(g.ndata) == 0
+        assert len(g.edata) == 0
+        # check forward
+        assert F.allclose(y_dgl, y)
+        grads = tape.gradient(y_dgl, [score])
+    # checkout gradient
+    assert F.allclose(grads[0], grad_score)
+    print(grads[0][:10], grad_score[:10])
+    
+    # Test 2
+    # compare with a softmax applied per destination group on a random graph
+    def generate_rand_graph(n):
+      arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64)
+      return dgl.DGLGraph(arr, readonly=True)
+    
+    g = generate_rand_graph(50)
+    a1 = F.randn((g.number_of_edges(), 1))
+    a2 = tf.identity(a1)
+    with tf.GradientTape() as tape:
+        tape.watch(a1)
+        g.edata['s'] = a1
+        g.group_apply_edges('dst', lambda edges: {'ss':F.softmax(edges.data['s'], 1)})
+        loss = tf.reduce_sum(g.edata['ss'])
+        a1_grad = tape.gradient(loss, [a1])[0]
+    
+    with tf.GradientTape() as tape:
+        tape.watch(a2)
+        builtin_sm = nn.edge_softmax(g, a2)
+        loss = tf.reduce_sum(builtin_sm)
+        a2_grad = tape.gradient(loss, [a2])[0]
+    print(a1_grad - a2_grad)
+    assert len(g.ndata) == 0
+    # the two edge fields are 's' and 'ss' written by the reference computation
+    # above; edge_softmax itself must not have added any
+    assert len(g.edata) == 2
+    assert F.allclose(a1_grad, a2_grad, rtol=1e-4, atol=1e-4) # Follow tolerance in unittest backend
+
+def test_partial_edge_softmax():
+    """edge_softmax restricted by ``eids`` must match edge_softmax on the edge subgraph."""
+    g = dgl.DGLGraph()
+    g.add_nodes(30)
+    # build a complete graph
+    for src in range(30):
+        for dst in range(30):
+            g.add_edge(src, dst)
+
+    score = F.randn((300, 1))
+    grad = F.randn((300, 1))
+    import numpy as np
+    # pick 300 distinct edges out of the 900
+    picked = np.random.choice(900, 300, replace=False).astype('int64')
+    eids = F.zerocopy_from_numpy(picked)
+
+    # compute partial edge softmax
+    with tf.GradientTape() as tape:
+        tape.watch(score)
+        y_1 = nn.edge_softmax(g, score, eids)
+        grad_1 = tape.gradient(y_1, [score])[0]
+
+    # compute edge softmax on edge subgraph
+    subg = g.edge_subgraph(eids)
+    with tf.GradientTape() as tape:
+        tape.watch(score)
+        y_2 = nn.edge_softmax(subg, score)
+        grad_2 = tape.gradient(y_2, [score])[0]
+
+    assert F.allclose(y_1, y_2)
+    assert F.allclose(grad_1, grad_2)
+
+def test_glob_att_pool():
+    """Check GlobalAttentionPooling output shapes for a single and a batched graph."""
+    g = dgl.DGLGraph(nx.path_graph(10))
+
+    # gate network gives one score per node, feature network maps to 10 dims
+    gap = nn.GlobalAttentionPooling(layers.Dense(1), layers.Dense(10))
+    print(gap)
+
+    # test#1: basic
+    node_feat = F.randn((g.number_of_nodes(), 5))
+    readout = gap(g, node_feat)
+    assert readout.shape[0] == 10 and readout.ndim == 1
+
+    # test#2: batched graph
+    bg = dgl.batch([g, g, g, g])
+    node_feat = F.randn((bg.number_of_nodes(), 5))
+    readout = gap(bg, node_feat)
+    assert readout.shape[0] == 4 and readout.shape[1] == 10 and readout.ndim == 2
+
+
+def test_rgcn():
+    """Check RelGraphConv output shapes for both regularizers, with/without norm and with id input."""
+    g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
+    # 5 etypes
+    R = 5
+    # edge types are assigned round-robin over the edge ids
+    etype = [eid % 5 for eid in range(g.number_of_edges())]
+    B = 2
+    I = 10
+    O = 8
+    expected_shape = [100, O]
+
+    rgc_basis = nn.RelGraphConv(I, O, R, "basis", B)
+    feats = tf.random.normal((100, I))
+    rel = tf.constant(etype)
+    out = rgc_basis(g, feats, rel)
+    assert list(out.shape) == expected_shape
+
+    rgc_bdd = nn.RelGraphConv(I, O, R, "bdd", B)
+    feats = tf.random.normal((100, I))
+    rel = tf.constant(etype)
+    out = rgc_bdd(g, feats, rel)
+    assert list(out.shape) == expected_shape
+
+    # with norm
+    norm = tf.zeros((g.number_of_edges(), 1))
+
+    rgc_basis = nn.RelGraphConv(I, O, R, "basis", B)
+    feats = tf.random.normal((100, I))
+    rel = tf.constant(etype)
+    out = rgc_basis(g, feats, rel, norm)
+    assert list(out.shape) == expected_shape
+
+    rgc_bdd = nn.RelGraphConv(I, O, R, "bdd", B)
+    feats = tf.random.normal((100, I))
+    rel = tf.constant(etype)
+    out = rgc_bdd(g, feats, rel, norm)
+    assert list(out.shape) == expected_shape
+
+    # id input
+    rgc_basis = nn.RelGraphConv(I, O, R, "basis", B)
+    node_ids = tf.constant(np.random.randint(0, I, (100,)))
+    rel = tf.constant(etype)
+    out = rgc_basis(g, node_ids, rel)
+    assert list(out.shape) == expected_shape
+
+def test_gat_conv():
+    """Check the two trailing output axes of GATConv(5, 2, 4)."""
+    g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
+    gat = nn.GATConv(5, 2, 4)
+    node_feat = F.randn((100, 5))
+    out = gat(g, node_feat)
+    # last axis: output features (2); second to last: the third argument (4)
+    assert out.shape[-1] == 2 and out.shape[-2] == 4
+
+def test_sage_conv():
+    """Check the output feature size of SAGEConv for every aggregator type."""
+    aggregators = ['mean', 'pool', 'gcn', 'lstm']
+    for aggre_type in aggregators:
+        # a fresh random graph for every aggregator
+        g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
+        sage = nn.SAGEConv(5, 10, aggre_type)
+        node_feat = F.randn((100, 5))
+        out = sage(g, node_feat)
+        assert out.shape[-1] == 10
+
+def test_sgc_conv():
+    """Check SGConv output size and that the cached variant ignores later inputs."""
+    ctx = F.ctx()
+    g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
+    # not cached
+    sgc = nn.SGConv(5, 10, 3)
+    feat = F.randn((100, 5))
+
+    out = sgc(g, feat)
+    assert out.shape[-1] == 10
+
+    # cached
+    sgc = nn.SGConv(5, 10, 3, True)
+    first = sgc(g, feat)
+    # the second call gets different features but must reuse the cache
+    second = sgc(g, feat + 1)
+    assert F.allclose(first, second)
+    assert first.shape[-1] == 10
+
+def test_appnp_conv():
+    """Check that APPNPConv keeps the feature size of its input."""
+    g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
+    appnp = nn.APPNPConv(10, 0.1)
+    node_feat = F.randn((100, 5))
+
+    out = appnp(g, node_feat)
+    assert out.shape[-1] == 5
+
+def test_gin_conv():
+    """Check the output feature size of GINConv for every aggregator type."""
+    for aggregator_type in ['mean', 'max', 'sum']:
+        g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
+        # the wrapped Dense layer fixes the output size to 12
+        apply_func = tf.keras.layers.Dense(12)
+        gin = nn.GINConv(apply_func, aggregator_type)
+        node_feat = F.randn((100, 5))
+        out = gin(g, node_feat)
+        assert out.shape[-1] == 12
+
+
+if __name__ == '__main__':
+    # Run the enabled tests in order; commented-out entries are tests that
+    # are currently disabled in this file.
+    for _run_test in (
+            test_graph_conv,
+            test_edge_softmax,
+            test_partial_edge_softmax,
+            # test_set2set,
+            test_glob_att_pool,
+            test_simple_pool,
+            # test_set_trans,
+            test_rgcn,
+            # test_tagconv,
+            test_gat_conv,
+            test_sage_conv,
+            test_sgc_conv,
+            test_appnp_conv,
+            test_gin_conv,
+            # test_agnn_conv,
+            # test_gated_graph_conv,
+            # test_nn_conv,
+            # test_gmm_conv,
+            # test_dense_graph_conv,
+            # test_dense_sage_conv,
+            # test_dense_cheb_conv,
+            # test_sequential,
+    ):
+        _run_test()
+