Unverified commit 650f6ee1 authored by Zihao Ye, committed by GitHub

[NN] Add commonly used GNN models from examples to dgl.nn modules. (#748)

* gat

* upd

* upd sage

* upd

* upd

* upd

* upd

* upd

* add gmmconv

* upd ggnn

* upd

* upd

* upd

* upd

* add citation examples

* add README

* fix cheb

* improve doc

* formula

* upd

* trigger

* lint

* lint

* upd

* add test for transform

* add test

* check

* upd

* improve doc

* shape check

* upd

* densechebconv, currently not correct (?)

* fix cheb

* fix

* upd

* upd sgc-reddit

* upd

* trigger
parent 8079d986
"""MXNet modules for graph global pooling."""
# pylint: disable= no-member, arguments-differ, C0103, W0235
# pylint: disable= no-member, arguments-differ, invalid-name, W0235
from mxnet import gluon, nd
from mxnet.gluon import nn
......
"""Torch modules for graph global pooling."""
# pylint: disable= no-member, arguments-differ, C0103, W0235
# pylint: disable= no-member, arguments-differ, invalid-name, W0235
import torch as th
import torch.nn as nn
import numpy as np
......@@ -178,17 +178,6 @@ class GlobalAttentionPooling(nn.Module):
super(GlobalAttentionPooling, self).__init__()
self.gate_nn = gate_nn
self.feat_nn = feat_nn
self.reset_parameters()
def reset_parameters(self):
"""Reinitialize learnable parameters."""
for p in self.gate_nn.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
if self.feat_nn:
for p in self.feat_nn.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
def forward(self, feat, graph):
r"""Compute global attention pooling.
......
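# A minimal single-graph sketch of the readout GlobalAttentionPooling
# computes; the names below are illustrative, and the real module applies
# the softmax per graph when given a BatchedDGLGraph.
import torch as th
import torch.nn as nn

def global_attention_pool(feat, gate_nn, feat_nn=None):
    gate = th.softmax(gate_nn(feat), dim=0)  # (N, 1) attention over nodes
    if feat_nn is not None:
        feat = feat_nn(feat)                 # optional feature transform
    return (gate * feat).sum(dim=0)          # attention-weighted readout

feat = th.randn(10, 5)                       # 10 nodes, 5 features
out = global_attention_pool(feat, nn.Linear(5, 1), nn.Linear(5, 16))
print(out.shape)                             # torch.Size([16])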
"""Module for graph transformation methods."""
"""Module for graph transformation utilities."""
import numpy as np
from scipy import sparse
from ._ffi.function import _init_api
from .graph import DGLGraph
from .batched_graph import BatchedDGLGraph
from .graph_index import from_coo
from .batched_graph import BatchedDGLGraph, unbatch
from .backend import asnumpy, tensor
__all__ = ['line_graph', 'reverse', 'to_simple_graph', 'to_bidirected']
__all__ = ['line_graph', 'khop_adj', 'khop_graph', 'reverse', 'to_simple_graph', 'to_bidirected',
'laplacian_lambda_max']
def line_graph(g, backtracking=True, shared=False):
......@@ -12,6 +19,7 @@ def line_graph(g, backtracking=True, shared=False):
Parameters
----------
g : dgl.DGLGraph
The input graph.
backtracking : bool, optional
Whether the returned line graph is backtracking.
shared : bool, optional
......@@ -26,6 +34,88 @@ def line_graph(g, backtracking=True, shared=False):
node_frame = g._edge_frame if shared else None
return DGLGraph(graph_data, node_frame)
def khop_adj(g, k):
"""Return the matrix of :math:`A^k` where :math:`A` is the adjacency matrix of :math:`g`,
where a row represents the destination and a column represents the source.
Parameters
----------
g : dgl.DGLGraph
The input graph.
k : int
The :math:`k` in :math:`A^k`.
Returns
-------
tensor
The returned tensor, with dtype ``np.float32``.
Examples
--------
>>> import dgl
>>> g = dgl.DGLGraph()
>>> g.add_nodes(5)
>>> g.add_edges([0,1,2,3,4,0,1,2,3,4], [0,1,2,3,4,1,2,3,4,0])
>>> dgl.khop_adj(g, 1)
tensor([[1., 0., 0., 0., 1.],
[1., 1., 0., 0., 0.],
[0., 1., 1., 0., 0.],
[0., 0., 1., 1., 0.],
[0., 0., 0., 1., 1.]])
>>> dgl.khop_adj(g, 3)
tensor([[1., 0., 1., 3., 3.],
[3., 1., 0., 1., 3.],
[3., 3., 1., 0., 1.],
[1., 3., 3., 1., 0.],
[0., 1., 3., 3., 1.]])
"""
adj_k = g.adjacency_matrix_scipy(return_edge_ids=False) ** k
return tensor(adj_k.todense().astype(np.float32))
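# A SciPy-only sketch of what khop_adj computes, on the docstring's example
# graph: rows are destinations, columns are sources, and powering the
# adjacency matrix counts k-step walks.
import numpy as np
from scipy import sparse

src = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
dst = [0, 1, 2, 3, 4, 1, 2, 3, 4, 0]
A = sparse.coo_matrix((np.ones(len(src)), (dst, src)), shape=(5, 5)).tocsr()
A3 = (A ** 3).todense().astype(np.float32)
print(A3)  # A3[i, j] counts 3-step walks j -> i; matches dgl.khop_adj(g, 3)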
def khop_graph(g, k):
"""Return the graph that includes all :math:`k`-hop neighbors of the given graph as edges.
The adjacency matrix of the returned graph is :math:`A^k`
(where :math:`A` is the adjacency matrix of :math:`g`).
Parameters
----------
g : dgl.DGLGraph
The input graph.
k : int
The :math:`k` in :math:`k`-hop graph.
Returns
-------
dgl.DGLGraph
The returned ``DGLGraph``.
Examples
--------
>>> import dgl
>>> g = dgl.DGLGraph()
>>> g.add_nodes(5)
>>> g.add_edges([0,1,2,3,4,0,1,2,3,4], [0,1,2,3,4,1,2,3,4,0])
>>> dgl.khop_graph(g, 1)
DGLGraph(num_nodes=5, num_edges=10,
ndata_schemes={}
edata_schemes={})
>>> dgl.khop_graph(g, 3)
DGLGraph(num_nodes=5, num_edges=40,
ndata_schemes={}
edata_schemes={})
"""
n = g.number_of_nodes()
adj_k = g.adjacency_matrix_scipy(return_edge_ids=False) ** k
adj_k = adj_k.tocoo()
multiplicity = adj_k.data
row = np.repeat(adj_k.row, multiplicity)
col = np.repeat(adj_k.col, multiplicity)
# TODO(zihao): we should support creating multi-graph from scipy sparse matrix
# in the future.
return DGLGraph(from_coo(n, row, col, True, True))
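# A standalone sketch of the repeat trick above: an entry of value m in
# A^k becomes m parallel edges in the k-hop multigraph.
import numpy as np
from scipy import sparse

adj_k = sparse.coo_matrix(([2, 1], ([0, 1], [1, 0])), shape=(2, 2))
row = np.repeat(adj_k.row, adj_k.data)
col = np.repeat(adj_k.col, adj_k.data)
print(row, col)  # [0 0 1] [1 1 0]: the weight-2 entry yields two edges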
def reverse(g, share_ndata=False, share_edata=False):
"""Return the reverse of a graph
......@@ -46,6 +136,7 @@ def reverse(g, share_ndata=False, share_edata=False):
Parameters
----------
g : dgl.DGLGraph
The input graph.
share_ndata: bool, optional
If True, the original graph and the reversed graph share memory for node attributes.
Otherwise the reversed graph will not be initialized with node attributes.
......@@ -169,4 +260,49 @@ def to_bidirected(g, readonly=True):
newgidx = _CAPI_DGLToBidirectedMutableGraph(g._graph)
return DGLGraph(newgidx)
def laplacian_lambda_max(g):
"""Return the largest eigenvalue of the normalized symmetric laplacian of g.
The eigenvalue of the normalized symmetric of any graph is less than or equal to 2,
ref: https://en.wikipedia.org/wiki/Laplacian_matrix#Properties
Parameters
----------
g : DGLGraph or BatchedDGLGraph
The input graph; it should be undirected.
Returns
-------
list
    * If the input g is a DGLGraph, the returned value is a list with
      a single element: the largest eigenvalue of g.
    * If the input g is a BatchedDGLGraph, the i-th element of the
      returned list is the largest eigenvalue of the i-th graph in g.
Examples
--------
>>> import dgl
>>> g = dgl.DGLGraph()
>>> g.add_nodes(5)
>>> g.add_edges([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], [1, 2, 3, 4, 0, 4, 0, 1, 2, 3])
>>> dgl.laplacian_lambda_max(g)
[1.809016994374948]
"""
if isinstance(g, BatchedDGLGraph):
g_arr = unbatch(g)
else:
g_arr = [g]
rst = []
for g_i in g_arr:
n = g_i.number_of_nodes()
adj = g_i.adjacency_matrix_scipy(return_edge_ids=False).astype(float)
norm = sparse.diags(asnumpy(g_i.in_degrees()).clip(1) ** -0.5, dtype=float)
laplacian = sparse.eye(n) - norm * adj * norm
rst.append(sparse.linalg.eigs(laplacian, 1, which='LM',
return_eigenvectors=False)[0].real)
return rst
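# A SciPy-only cross-check of the docstring example: for the bidirected
# 5-cycle the normalized Laplacian eigenvalues are 1 - cos(2*pi*k/5), so
# the largest is 1 - cos(4*pi/5) ~= 1.80902.
import numpy as np
from scipy import sparse
from scipy.sparse import linalg

n = 5
src = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
dst = [1, 2, 3, 4, 0, 4, 0, 1, 2, 3]
adj = sparse.coo_matrix((np.ones(2 * n), (dst, src)), shape=(n, n)).tocsr()
deg = np.asarray(adj.sum(axis=1)).flatten()   # in-degrees (all 2 here)
norm = sparse.diags(deg.clip(1.0) ** -0.5)
laplacian = sparse.eye(n) - norm * adj * norm
print(linalg.eigs(laplacian, 1, which='LM',
                  return_eigenvectors=False)[0].real)  # ~1.809016994374948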
_init_api("dgl.transform")
......@@ -110,6 +110,11 @@ def min(x, dim):
def prod(x, dim):
"""Computes the prod of array elements over given axes"""
pass
def matmul(a, b):
"""Compute Matrix Multiplication between a and b"""
pass
###############################################################################
# Tensor functions used *only* on index tensor
# ----------------
......
......@@ -83,6 +83,9 @@ def min(x, dim):
def prod(x, dim):
return x.prod(dim)
def matmul(a, b):
return nd.dot(a, b)
record_grad = autograd.record
......
......@@ -79,6 +79,9 @@ def min(x, dim):
def prod(x, dim):
return x.prod(dim)
def matmul(a, b):
return a @ b
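# With both backends implemented, the primitive is available to
# backend-agnostic code; a hedged usage sketch, assuming dgl is loaded
# with the PyTorch backend.
import torch as th
import dgl.backend as F  # resolves to the active backend (PyTorch here)

a = th.randn(3, 4)
b = th.randn(4, 2)
print(F.matmul(a, b).shape)  # dispatches to `a @ b`; torch.Size([3, 2])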
class record_grad(object):
def __init__(self):
pass
......
......@@ -112,6 +112,56 @@ def test_bidirected_graph():
_test(False, True)
_test(False, False)
def test_khop_graph():
N = 20
feat = F.randn((N, 5))
g = dgl.DGLGraph(nx.erdos_renyi_graph(N, 0.3))
for k in range(4):
g_k = dgl.khop_graph(g, k)
# use the original graph to do message passing k times.
g.ndata['h'] = feat
for _ in range(k):
g.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
h_0 = g.ndata.pop('h')
# use the k-hop graph to do message passing once.
g_k.ndata['h'] = feat
g_k.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
h_1 = g_k.ndata.pop('h')
assert F.allclose(h_0, h_1, rtol=1e-3, atol=1e-3)
def test_khop_adj():
N = 20
feat = F.randn((N, 5))
g = dgl.DGLGraph(nx.erdos_renyi_graph(N, 0.3))
for k in range(3):
adj = F.tensor(dgl.khop_adj(g, k))
# use the original graph to do message passing k times.
g.ndata['h'] = feat
for _ in range(k):
g.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
h_0 = g.ndata.pop('h')
# use the k-hop adjacency matrix to do message passing once.
h_1 = F.matmul(adj, feat)
assert F.allclose(h_0, h_1, rtol=1e-3, atol=1e-3)
def test_laplacian_lambda_max():
N = 20
eps = 1e-6
# test DGLGraph
g = dgl.DGLGraph(nx.erdos_renyi_graph(N, 0.3))
l_max = dgl.laplacian_lambda_max(g)
assert (l_max[0] < 2 + eps)
# test BatchedDGLGraph
N_arr = [20, 30, 10, 12]
bg = dgl.batch([
dgl.DGLGraph(nx.erdos_renyi_graph(N, 0.3))
for N in N_arr
])
l_max_arr = dgl.laplacian_lambda_max(bg)
assert len(l_max_arr) == len(N_arr)
for l_max in l_max_arr:
assert l_max < 2 + eps
if __name__ == '__main__':
test_line_graph()
test_no_backtracking()
......@@ -119,3 +169,6 @@ if __name__ == '__main__':
test_reverse_shared_frames()
test_simple_graph()
test_bidirected_graph()
test_khop_adj()
test_khop_graph()
test_laplacian_lambda_max()
......@@ -20,7 +20,7 @@ def test_graph_conv():
conv = nn.GraphConv(5, 2, norm=False, bias=True)
if F.gpu_ctx():
conv.cuda()
conv = conv.to(ctx)
print(conv)
# test#1: basic
h0 = F.ones((3, 5))
......@@ -37,7 +37,7 @@ def test_graph_conv():
conv = nn.GraphConv(5, 2)
if F.gpu_ctx():
conv.cuda()
conv = conv.to(ctx)
# test#3: basic
h0 = F.ones((3, 5))
h1 = conv(h0, g)
......@@ -51,7 +51,7 @@ def test_graph_conv():
conv = nn.GraphConv(5, 2)
if F.gpu_ctx():
conv.cuda()
conv = conv.to(ctx)
# test#3: basic
h0 = F.ones((3, 5))
h1 = conv(h0, g)
......@@ -81,15 +81,15 @@ def _S2AXWb(A, N, X, W, b):
return Y + b
def test_tgconv():
def test_tagconv():
g = dgl.DGLGraph(nx.path_graph(3))
ctx = F.ctx()
adj = g.adjacency_matrix(ctx=ctx)
norm = th.pow(g.in_degrees().float(), -0.5)
conv = nn.TGConv(5, 2, bias=True)
conv = nn.TAGConv(5, 2, bias=True)
if F.gpu_ctx():
conv.cuda()
conv = conv.to(ctx)
print(conv)
# test#1: basic
......@@ -102,27 +102,27 @@ def test_tgconv():
assert F.allclose(h1, _S2AXWb(adj, norm, h0, conv.lin.weight, conv.lin.bias))
conv = nn.TGConv(5, 2)
conv = nn.TAGConv(5, 2)
if F.gpu_ctx():
conv.cuda()
conv = conv.to(ctx)
# test#2: basic
h0 = F.ones((3, 5))
h1 = conv(h0, g)
assert len(g.ndata) == 0
assert len(g.edata) == 0
assert h1.shape[-1] == 2
# test rest_parameters
# test reset_parameters
old_weight = deepcopy(conv.lin.weight.data)
conv.reset_parameters()
new_weight = conv.lin.weight.data
assert not F.allclose(old_weight, new_weight)
def test_set2set():
ctx = F.ctx()
g = dgl.DGLGraph(nx.path_graph(10))
s2s = nn.Set2Set(5, 3, 3) # hidden size 5, 3 iters, 3 layers
if F.gpu_ctx():
s2s.cuda()
s2s = s2s.to(ctx)
print(s2s)
# test#1: basic
......@@ -139,11 +139,12 @@ def test_set2set():
assert h1.shape[0] == 3 and h1.shape[1] == 10 and h1.dim() == 2
def test_glob_att_pool():
ctx = F.ctx()
g = dgl.DGLGraph(nx.path_graph(10))
gap = nn.GlobalAttentionPooling(th.nn.Linear(5, 1), th.nn.Linear(5, 10))
if F.gpu_ctx():
gap.cuda()
gap = gap.to(ctx)
print(gap)
# test#1: basic
......@@ -158,6 +159,7 @@ def test_glob_att_pool():
assert h1.shape[0] == 4 and h1.shape[1] == 10 and h1.dim() == 2
def test_simple_pool():
ctx = F.ctx()
g = dgl.DGLGraph(nx.path_graph(15))
sum_pool = nn.SumPooling()
......@@ -168,6 +170,12 @@ def test_simple_pool():
# test#1: basic
h0 = F.randn((g.number_of_nodes(), 5))
if F.gpu_ctx():
sum_pool = sum_pool.to(ctx)
avg_pool = avg_pool.to(ctx)
max_pool = max_pool.to(ctx)
sort_pool = sort_pool.to(ctx)
h0 = h0.to(ctx)
h1 = sum_pool(h0, g)
assert F.allclose(h1, F.sum(h0, 0))
h1 = avg_pool(h0, g)
......@@ -181,6 +189,8 @@ def test_simple_pool():
g_ = dgl.DGLGraph(nx.path_graph(5))
bg = dgl.batch([g, g_, g, g_, g])
h0 = F.randn((bg.number_of_nodes(), 5))
if F.gpu_ctx():
h0 = h0.to(ctx)
h1 = sum_pool(h0, bg)
truth = th.stack([F.sum(h0[:15], 0),
......@@ -210,15 +220,16 @@ def test_simple_pool():
assert h1.shape[0] == 5 and h1.shape[1] == 10 * 5 and h1.dim() == 2
def test_set_trans():
ctx = F.ctx()
g = dgl.DGLGraph(nx.path_graph(15))
st_enc_0 = nn.SetTransformerEncoder(50, 5, 10, 100, 2, 'sab')
st_enc_1 = nn.SetTransformerEncoder(50, 5, 10, 100, 2, 'isab', 3)
st_dec = nn.SetTransformerDecoder(50, 5, 10, 100, 2, 4)
if F.gpu_ctx():
st_enc_0.cuda()
st_enc_1.cuda()
st_dec.cuda()
st_enc_0 = st_enc_0.to(ctx)
st_enc_1 = st_enc_1.to(ctx)
st_dec = st_dec.to(ctx)
print(st_enc_0, st_enc_1, st_dec)
# test#1: basic
......@@ -354,6 +365,207 @@ def test_rgcn():
h_new = rgc_basis(g, h, r)
assert list(h_new.shape) == [100, O]
def test_gat_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
gat = nn.GATConv(5, 2, 4)
feat = F.randn((100, 5))
if F.gpu_ctx():
gat = gat.to(ctx)
feat = feat.to(ctx)
h = gat(feat, g)
assert h.shape[-1] == 2 and h.shape[-2] == 4
def test_sage_conv():
for aggre_type in ['mean', 'pool', 'gcn', 'lstm']:
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
sage = nn.SAGEConv(5, 10, aggre_type)
feat = F.randn((100, 5))
if F.gpu_ctx():
sage = sage.to(ctx)
feat = feat.to(ctx)
h = sage(feat, g)
assert h.shape[-1] == 10
def test_sgc_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
# not cached
sgc = nn.SGConv(5, 10, 3)
feat = F.randn((100, 5))
if F.gpu_ctx():
sgc = sgc.to(ctx)
feat = feat.to(ctx)
h = sgc(feat, g)
assert h.shape[-1] == 10
# cached
sgc = nn.SGConv(5, 10, 3, True)
if F.gpu_ctx():
sgc = sgc.to(ctx)
h_0 = sgc(feat, g)
h_1 = sgc(feat + 1, g)
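# with cached=True the propagated features from the first call are
# reused, so the perturbed input does not change the output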
assert F.allclose(h_0, h_1)
assert h_0.shape[-1] == 10
def test_appnp_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
appnp = nn.APPNPConv(10, 0.1)
feat = F.randn((100, 5))
if F.gpu_ctx():
appnp = appnp.to(ctx)
feat = feat.to(ctx)
h = appnp(feat, g)
assert h.shape[-1] == 5
def test_gin_conv():
for aggregator_type in ['mean', 'max', 'sum']:
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
gin = nn.GINConv(
th.nn.Linear(5, 12),
aggregator_type
)
feat = F.randn((100, 5))
if F.gpu_ctx():
gin = gin.to(ctx)
feat = feat.to(ctx)
h = gin(feat, g)
assert h.shape[-1] == 12
def test_agnn_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
agnn = nn.AGNNConv(1)
feat = F.randn((100, 5))
if F.gpu_ctx():
agnn = agnn.to(ctx)
feat = feat.to(ctx)
h = agnn(feat, g)
assert h.shape[-1] == 5
def test_gated_graph_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
ggconv = nn.GatedGraphConv(5, 10, 5, 3)
etypes = th.arange(g.number_of_edges()) % 3
feat = F.randn((100, 5))
if F.gpu_ctx():
ggconv = ggconv.to(ctx)
feat = feat.to(ctx)
etypes = etypes.to(ctx)
h = ggconv(feat, etypes, g)
# currently we only do a shape check
assert h.shape[-1] == 10
def test_nn_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
edge_func = th.nn.Linear(4, 5 * 10)
nnconv = nn.NNConv(5, 10, edge_func, 'mean')
feat = F.randn((100, 5))
efeat = F.randn((g.number_of_edges(), 4))
if F.gpu_ctx():
nnconv = nnconv.to(ctx)
feat = feat.to(ctx)
efeat = efeat.to(ctx)
h = nnconv(feat, efeat, g)
# currently we only do a shape check
assert h.shape[-1] == 10
def test_gmm_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
gmmconv = nn.GMMConv(5, 10, 3, 4, 'mean')
feat = F.randn((100, 5))
pseudo = F.randn((g.number_of_edges(), 3))
if F.gpu_ctx():
gmmconv = gmmconv.to(ctx)
feat = feat.to(ctx)
pseudo = pseudo.to(ctx)
h = gmmconv(feat, pseudo, g)
# currently we only do a shape check
assert h.shape[-1] == 10
def test_dense_graph_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
adj = g.adjacency_matrix(ctx=ctx).to_dense()
conv = nn.GraphConv(5, 2, norm=False, bias=True)
dense_conv = nn.DenseGraphConv(5, 2, norm=False, bias=True)
dense_conv.weight.data = conv.weight.data
dense_conv.bias.data = conv.bias.data
feat = F.randn((100, 5))
if F.gpu_ctx():
conv = conv.to(ctx)
dense_conv = dense_conv.to(ctx)
feat = feat.to(ctx)
out_conv = conv(feat, g)
out_dense_conv = dense_conv(feat, adj)
assert F.allclose(out_conv, out_dense_conv)
def test_dense_sage_conv():
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
adj = g.adjacency_matrix(ctx=ctx).to_dense()
sage = nn.SAGEConv(5, 2, 'gcn')
dense_sage = nn.DenseSAGEConv(5, 2)
dense_sage.fc.weight.data = sage.fc_neigh.weight.data
dense_sage.fc.bias.data = sage.fc_neigh.bias.data
feat = F.randn((100, 5))
if F.gpu_ctx():
sage = sage.to(ctx)
dense_sage = dense_sage.to(ctx)
feat = feat.to(ctx)
out_sage = sage(feat, g)
out_dense_sage = dense_sage(feat, adj)
assert F.allclose(out_sage, out_dense_sage)
def test_dense_cheb_conv():
for k in range(1, 4):
ctx = F.ctx()
g = dgl.DGLGraph(sp.sparse.random(100, 100, density=0.1), readonly=True)
adj = g.adjacency_matrix(ctx=ctx).to_dense()
cheb = nn.ChebConv(5, 2, k)
dense_cheb = nn.DenseChebConv(5, 2, k)
for i in range(len(cheb.fc)):
dense_cheb.W.data[i] = cheb.fc[i].weight.data.t()
if cheb.bias is not None:
dense_cheb.bias.data = cheb.bias.data
feat = F.randn((100, 5))
if F.gpu_ctx():
cheb = cheb.to(ctx)
dense_cheb = dense_cheb.to(ctx)
feat = feat.to(ctx)
out_cheb = cheb(feat, g)
out_dense_cheb = dense_cheb(feat, adj)
assert F.allclose(out_cheb, out_dense_cheb)
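# A NumPy sketch of the Chebyshev filter both modules implement
# (assuming lambda_max = 2, the bound established by laplacian_lambda_max):
# T_0 = X, T_1 = L_hat @ X, T_k = 2 L_hat T_{k-1} - T_{k-2}, out = sum T_k W_k.
import numpy as np

def cheb_filter(A, X, Ws, lambda_max=2.0):
    n = A.shape[0]
    d = A.sum(axis=1).clip(1.0) ** -0.5
    L = np.eye(n) - d[:, None] * A * d[None, :]  # normalized Laplacian
    L_hat = 2.0 * L / lambda_max - np.eye(n)     # spectrum mapped into [-1, 1]
    T_prev, T = X, L_hat @ X
    out = T_prev @ Ws[0]
    if len(Ws) > 1:
        out = out + T @ Ws[1]
    for W in Ws[2:]:
        T, T_prev = 2.0 * L_hat @ T - T_prev, T
        out = out + T @ W
    return out

A = (np.random.rand(10, 10) < 0.3).astype(float)
print(cheb_filter(A, np.random.randn(10, 5),
                  [np.random.randn(5, 2) for _ in range(3)]).shape)  # (10, 2)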
if __name__ == '__main__':
test_graph_conv()
test_edge_softmax()
......@@ -362,3 +574,17 @@ if __name__ == '__main__':
test_simple_pool()
test_set_trans()
test_rgcn()
test_tagconv()
test_gat_conv()
test_sage_conv()
test_sgc_conv()
test_appnp_conv()
test_gin_conv()
test_agnn_conv()
test_gated_graph_conv()
test_nn_conv()
test_gmm_conv()
test_dense_graph_conv()
test_dense_sage_conv()
test_dense_cheb_conv()