Commit 9a0511c8 authored by Zihao Ye, committed by Minjie Wang

[NN] nn modules & examples update (#890)

* upd

* damn it

* fuck

* fuck pylint

* fudge

* remove some comments about MXNet

* upd

* upd

* damn it

* damn it

* fuck

* fuck

* upd

* upd

* pylint bastard

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd
parent 7f65199a
@@ -23,16 +23,16 @@ def train(args, net, trainloader, optimizer, criterion, epoch):
for pos, (graphs, labels) in zip(bar, trainloader):
# batch graphs will be shipped to device in forward part of model
labels = labels.to(args.device)
outputs = net(graphs)
feat = graphs.ndata['attr'].to(args.device)
outputs = net(graphs, feat)
loss = criterion(outputs, labels)
running_loss += loss.item()
# backprop
if optimizer is not None:
optimizer.zero_grad()
loss.backward()
optimizer.step()
optimizer.zero_grad()
loss.backward()
optimizer.step()
# report
bar.set_description('epoch-{}'.format(epoch))
@@ -50,15 +50,12 @@ def eval_net(args, net, dataloader, criterion):
total_loss = 0
total_correct = 0
# total_iters = len(dataloader)
for data in dataloader:
graphs, labels = data
feat = graphs.ndata['attr'].to(args.device)
labels = labels.to(args.device)
total += len(labels)
outputs = net(graphs)
outputs = net(graphs, feat)
_, predicted = torch.max(outputs.data, 1)
total_correct += (predicted == labels.data).sum().item()
@@ -99,8 +96,7 @@ def main(args):
args.num_layers, args.num_mlp_layers,
dataset.dim_nfeats, args.hidden_dim, dataset.gclasses,
args.final_dropout, args.learn_eps,
args.graph_pooling_type, args.neighbor_pooling_type,
args.device).to(args.device)
args.graph_pooling_type, args.neighbor_pooling_type).to(args.device)
criterion = nn.CrossEntropyLoss()  # default reduce is True
optimizer = optim.Adam(model.parameters(), lr=args.lr)
......
@@ -6,7 +6,6 @@ Simple reference implementation of GraphSAGE.
"""
import argparse
import time
import abc
import numpy as np
import networkx as nx
import torch
......
Geometric Deep Learning models
=========
This example shows how to use geometric deep learning models defined in `dgl.nn.pytorch.conv` for
graph classification.
Currently we support the following models:
- [ChebNet](https://arxiv.org/pdf/1606.09375.pdf)
- [MoNet](https://arxiv.org/pdf/1611.08402.pdf)
## Image Classification on MNIST
By transforming images into graphs, graph classification algorithms can
be applied to image classification problems.
### Usage
```bash
python mnist.py --model chebnet --gpu 0
python mnist.py --model monet --gpu 0
```
### Acknowledgement
We thank [Xavier Bresson](https://github.com/xbresson) for providing
code for graph coarsening algorithm and grid graph building in
[CE7454_2019 Labs](https://github.com/xbresson/CE7454_2019/tree/master/codes/labs_lecture14/lab01_ChebGCNs).
# author: xbresson
# code link: https://github.com/xbresson/CE7454_2019/blob/master/codes/labs_lecture14/lab01_ChebGCNs/lib/coarsening.py
import numpy as np
import scipy.sparse
import scipy.sparse.linalg
import sklearn.metrics
def laplacian(W, normalized=True):
"""Return graph Laplacian"""
# Degree matrix.
d = W.sum(axis=0)
# Laplacian matrix.
if not normalized:
D = scipy.sparse.diags(d.A.squeeze(), 0)
L = D - W
else:
d += np.spacing(np.array(0, W.dtype))
d = 1 / np.sqrt(d)
D = scipy.sparse.diags(d.A.squeeze(), 0)
I = scipy.sparse.identity(d.size, dtype=W.dtype)
L = I - D * W * D
assert np.abs(L - L.T).mean() < 1e-9
assert type(L) is scipy.sparse.csr.csr_matrix
return L
def rescale_L(L, lmax=2):
"""Rescale Laplacian eigenvalues to [-1,1]"""
M, M = L.shape
I = scipy.sparse.identity(M, format='csr', dtype=L.dtype)
L /= lmax / 2
L -= I
return L
def lmax_L(L):
"""Compute largest Laplacian eigenvalue"""
return scipy.sparse.linalg.eigsh(L, k=1, which='LM', return_eigenvectors=False)[0]
# graph coarsening with Heavy Edge Matching
def coarsen(A, levels):
graphs, parents = HEM(A, levels)
perms = compute_perm(parents)
laplacians = []
for i, A in enumerate(graphs):
M, M = A.shape
if i < levels:
A = perm_adjacency(A, perms[i])
A = A.tocsr()
A.eliminate_zeros()
Mnew, Mnew = A.shape
print('Layer {0}: M_{0} = |V| = {1} nodes ({2} added), |E| = {3} edges'.format(i, Mnew, Mnew - M, A.nnz // 2))
L = laplacian(A, normalized=True)
laplacians.append(L)
return laplacians, perms[0] if len(perms) > 0 else None
def HEM(W, levels, rid=None):
"""
Coarsen a graph multiple times using the Heavy Edge Matching (HEM).
Input
W: symmetric sparse weight (adjacency) matrix
levels: the number of coarsened graphs
Output
graph[0]: original graph of size N_1
graph[1]: coarser graph of size N_2 < N_1
graph[levels]: coarsest graph of Size N_levels < ... < N_2 < N_1
parents[i] is a vector of size N_i with entries ranging from 1 to N_{i+1}
which indicate the parents in the coarser graph[i+1]
nd_sz{i} is a vector of size N_i that contains the size of the supernode in the graph{i}
Note
if "graph" is a list of length k, then "parents" will be a list of length k-1
"""
N, N = W.shape
if rid is None:
rid = np.random.permutation(range(N))
ss = np.array(W.sum(axis=0)).squeeze()
rid = np.argsort(ss)
parents = []
degree = W.sum(axis=0) - W.diagonal()
graphs = []
graphs.append(W)
print('Heavy Edge Matching coarsening with Xavier version')
for _ in range(levels):
# CHOOSE THE WEIGHTS FOR THE PAIRING
# weights = ones(N,1) # metis weights
weights = degree # graclus weights
# weights = supernode_size # other possibility
weights = np.array(weights).squeeze()
# PAIR THE VERTICES AND CONSTRUCT THE ROOT VECTOR
idx_row, idx_col, val = scipy.sparse.find(W)
cc = idx_row
rr = idx_col
vv = val
# TO BE SPEEDUP
if not (list(cc) == list(np.sort(cc))):
tmp = cc
cc = rr
rr = tmp
cluster_id = HEM_one_level(cc, rr, vv, rid, weights) # cc is ordered
parents.append(cluster_id)
# COMPUTE THE EDGES WEIGHTS FOR THE NEW GRAPH
nrr = cluster_id[rr]
ncc = cluster_id[cc]
nvv = vv
Nnew = cluster_id.max() + 1
# CSR is more appropriate: row,val pairs appear multiple times
W = scipy.sparse.csr_matrix((nvv, (nrr, ncc)), shape=(Nnew, Nnew))
W.eliminate_zeros()
# Add new graph to the list of all coarsened graphs
graphs.append(W)
N, N = W.shape
# COMPUTE THE DEGREE (OMIT OR NOT SELF LOOPS)
degree = W.sum(axis=0)
# degree = W.sum(axis=0) - W.diagonal()
# CHOOSE THE ORDER IN WHICH VERTICES WILL BE VISTED AT THE NEXT PASS
# [~, rid]=sort(ss); # arthur strategy
# [~, rid]=sort(supernode_size); # thomas strategy
# rid=randperm(N); # metis/graclus strategy
ss = np.array(W.sum(axis=0)).squeeze()
rid = np.argsort(ss)
return graphs, parents
# Coarsen a graph given by rr,cc,vv. rr is assumed to be ordered
def HEM_one_level(rr, cc, vv, rid, weights):
nnz = rr.shape[0]
N = rr[nnz - 1] + 1
marked = np.zeros(N, np.bool)
rowstart = np.zeros(N, np.int32)
rowlength = np.zeros(N, np.int32)
cluster_id = np.zeros(N, np.int32)
oldval = rr[0]
count = 0
clustercount = 0
for ii in range(nnz):
rowlength[count] = rowlength[count] + 1
if rr[ii] > oldval:
oldval = rr[ii]
rowstart[count + 1] = ii
count = count + 1
for ii in range(N):
tid = rid[ii]
if not marked[tid]:
wmax = 0.0
rs = rowstart[tid]
marked[tid] = True
bestneighbor = -1
for jj in range(rowlength[tid]):
nid = cc[rs + jj]
if marked[nid]:
tval = 0.0
else:
# First approach
if 2 == 1:
tval = vv[rs + jj] * (1.0 / weights[tid] + 1.0 / weights[nid])
# Second approach
if 1 == 1:
Wij = vv[rs + jj]
Wii = vv[rowstart[tid]]
Wjj = vv[rowstart[nid]]
di = weights[tid]
dj = weights[nid]
tval = (2. * Wij + Wii + Wjj) * 1. / (di + dj + 1e-9)
if tval > wmax:
wmax = tval
bestneighbor = nid
cluster_id[tid] = clustercount
if bestneighbor > -1:
cluster_id[bestneighbor] = clustercount
marked[bestneighbor] = True
clustercount += 1
return cluster_id
def compute_perm(parents):
"""
Return a list of indices to reorder the adjacency and data matrices so
that the union of two neighbors from layer to layer forms a binary tree.
"""
# Order of last layer is random (chosen by the clustering algorithm).
indices = []
if len(parents) > 0:
M_last = max(parents[-1]) + 1
indices.append(list(range(M_last)))
for parent in parents[::-1]:
# Fake nodes go after real ones.
pool_singeltons = len(parent)
indices_layer = []
for i in indices[-1]:
indices_node = list(np.where(parent == i)[0])
assert 0 <= len(indices_node) <= 2
# Add a node to go with a singleton.
if len(indices_node) == 1:
indices_node.append(pool_singeltons)
pool_singeltons += 1
# Add two nodes as children of a singleton in the parent.
elif len(indices_node) == 0:
indices_node.append(pool_singeltons + 0)
indices_node.append(pool_singeltons + 1)
pool_singeltons += 2
indices_layer.extend(indices_node)
indices.append(indices_layer)
# Sanity checks.
for i, indices_layer in enumerate(indices):
M = M_last * 2 ** i
# Reduction by 2 at each layer (binary tree).
assert len(indices_layer) == M
# The new ordering does not omit any index.
assert sorted(indices_layer) == list(range(M))
return indices[::-1]
assert (compute_perm([np.array([4, 1, 1, 2, 2, 3, 0, 0, 3]), np.array([2, 1, 0, 1, 0])])
== [[3, 4, 0, 9, 1, 2, 5, 8, 6, 7, 10, 11], [2, 4, 1, 3, 0, 5], [0, 1, 2]])
def perm_adjacency(A, indices):
"""
Permute adjacency matrix, i.e. exchange node ids,
so that binary unions form the clustering tree.
"""
if indices is None:
return A
M, M = A.shape
Mnew = len(indices)
A = A.tocoo()
# Add Mnew - M isolated vertices.
rows = scipy.sparse.coo_matrix((Mnew - M, M), dtype=np.float32)
cols = scipy.sparse.coo_matrix((Mnew, Mnew - M), dtype=np.float32)
A = scipy.sparse.vstack([A, rows])
A = scipy.sparse.hstack([A, cols])
# Permute the rows and the columns.
perm = np.argsort(indices)
A.row = np.array(perm)[A.row]
A.col = np.array(perm)[A.col]
assert np.abs(A - A.T).mean() < 1e-8 # 1e-9
assert type(A) is scipy.sparse.coo.coo_matrix
return A
def perm_data(x, indices):
"""
Permute data matrix, i.e. exchange node ids,
so that binary unions form the clustering tree.
"""
if indices is None:
return x
N, M = x.shape
Mnew = len(indices)
assert Mnew >= M
xnew = np.empty((N, Mnew))
for i, j in enumerate(indices):
# Existing vertex, i.e. real data.
if j < M:
xnew[:, i] = x[:, j]
# Fake vertex because of singletons.
# They will stay 0 so that max pooling chooses the singleton.
# Or -infty ?
else:
xnew[:, i] = np.zeros(N)
return xnew
import torch as th
"""Compute x,y coordinate for nodes in the graph"""
eps = 1e-8
def get_coordinates(graphs, grid_side, coarsening_levels, perm):
rst = []
for l in range(coarsening_levels + 1):
xs, ys = [], []
for i in range(graphs[l].number_of_nodes()):
cnt = eps
x_accum = 0
y_accum = 0
for j in range(i * 2 ** l, (i + 1) * 2 ** l):
if perm[j] < grid_side ** 2:
x_accum += (perm[j] // grid_side)
y_accum += (perm[j] % grid_side)
cnt += 1
xs.append(x_accum / cnt)
ys.append(y_accum / cnt)
rst.append(th.cat([th.tensor(xs).view(-1, 1), th.tensor(ys).view(-1, 1)], -1))
return rst
"""Cartesian coordinate to polar coordinate"""
def z2polar(edges):
z = edges.dst['xy'] - edges.src['xy']
rho = th.norm(z, dim=-1, p=2)
x, y = z.unbind(dim=-1)
phi = th.atan2(y, x)
return {'u': th.cat([rho.unsqueeze(-1), phi.unsqueeze(-1)], -1)}
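A toy sketch of `z2polar`, assuming DGL's PyTorch backend is active (matching the `DGLGraph` usage in `mnist.py` below); the 3-node graph and coordinates are illustrative:

```python
import dgl
import torch as th
from coordinate import z2polar  # this file, saved as coordinate.py

g = dgl.DGLGraph()
g.add_nodes(3)
g.add_edges([0, 1, 2], [1, 2, 0])
g.ndata['xy'] = th.tensor([[0., 0.], [1., 0.], [0., 1.]])

g.apply_edges(z2polar)          # computes per-edge polar pseudo-coordinates
print(g.edata['u'])             # shape (3, 2): (rho, phi) for each edge
```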
# author: xbresson
# code link: https://github.com/xbresson/CE7454_2019/blob/master/codes/labs_lecture14/lab01_ChebGCNs/lib/grid_graph.py
import sklearn
import sklearn.metrics
import scipy.sparse, scipy.sparse.linalg # scipy.spatial.distance
import numpy as np
def grid_graph(grid_side,number_edges,metric):
"""Generate graph of a grid"""
z = grid(grid_side)
dist, idx = distance_sklearn_metrics(z, k=number_edges, metric=metric)
A = adjacency(dist, idx)
print("nb edges: ",A.nnz)
return A
def grid(m, dtype=np.float32):
"""Return coordinates of grid points"""
M = m**2
x = np.linspace(0,1,m, dtype=dtype)
y = np.linspace(0,1,m, dtype=dtype)
xx, yy = np.meshgrid(x, y)
z = np.empty((M,2), dtype)
z[:,0] = xx.reshape(M)
z[:,1] = yy.reshape(M)
return z
def distance_sklearn_metrics(z, k=4, metric='euclidean'):
"""Compute pairwise distances"""
#d = sklearn.metrics.pairwise.pairwise_distances(z, metric=metric, n_jobs=-2)
d = sklearn.metrics.pairwise.pairwise_distances(z, metric=metric, n_jobs=1)
# k-NN
idx = np.argsort(d)[:,1:k+1]
d.sort()
d = d[:,1:k+1]
return d, idx
def adjacency(dist, idx):
"""Return adjacency matrix of a kNN graph"""
M, k = dist.shape
assert idx.shape == (M, k)
assert dist.min() >= 0
assert dist.max() <= 1
# Pairwise distances
sigma2 = np.mean(dist[:,-1])**2
dist = np.exp(- dist**2 / sigma2)
# Weight matrix
I = np.arange(0, M).repeat(k)
J = idx.reshape(M*k)
V = dist.reshape(M*k)
W = scipy.sparse.coo_matrix((V, (I, J)), shape=(M, M))
# No self-connections
W.setdiag(0)
# Undirected graph
bigger = W.T > W
W = W - W.multiply(bigger) + W.T.multiply(bigger)
assert W.nnz % 2 == 0
assert np.abs(W - W.T).mean() < 1e-10
assert type(W) is scipy.sparse.csr.csr_matrix
return W
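A quick sketch of the grid-graph helper above (the grid size and number of neighbours are illustrative; `mnist.py` below uses a 28x28 grid with 8 neighbours):

```python
from grid_graph import grid_graph  # this file, saved as grid_graph.py

A = grid_graph(8, 4, 'euclidean')  # 8x8 grid, 4 nearest neighbours per node
print(A.shape)                     # (64, 64), symmetric kNN weight matrix
print(A.nnz)                       # number of stored edge entries (undirected graph)
```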
import argparse
import time
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from dgl.nn.pytorch.conv import ChebConv, GMMConv
from dgl.nn.pytorch.glob import MaxPooling
from grid_graph import grid_graph
from coarsening import coarsen
from coordinate import get_coordinates, z2polar
argparser = argparse.ArgumentParser("MNIST")
argparser.add_argument("--gpu", type=int, default=-1,
help="gpu id, use cpu if set to -1")
argparser.add_argument("--model", type=str, default="chebnet",
help="model to use, chebnet/monet")
argparser.add_argument("--batch-size", type=int, default=100,
help="batch size")
args = argparser.parse_args()
grid_side = 28
number_edges = 8
metric = 'euclidean'
A = grid_graph(28, 8, metric)
coarsening_levels = 4
L, perm = coarsen(A, coarsening_levels)
g_arr = [DGLGraph(csr) for csr in L]
coordinate_arr = get_coordinates(g_arr, grid_side, coarsening_levels, perm)
for g, coordinate_arr in zip(g_arr, coordinate_arr):
g.ndata['xy'] = coordinate_arr
g.apply_edges(z2polar)
def batcher(batch):
g_batch = [[] for _ in range(coarsening_levels + 1)]
x_batch = []
y_batch = []
for x, y in batch:
x = torch.cat([x.view(-1), x.new_zeros(928 - 28 ** 2)], 0)
x = x[perm]
x_batch.append(x)
y_batch.append(y)
for i in range(coarsening_levels + 1):
g_batch[i].append(g_arr[i])
x_batch = torch.cat(x_batch).unsqueeze(-1)
y_batch = torch.LongTensor(y_batch)
g_batch = [dgl.batch(g) for g in g_batch]
return g_batch, x_batch, y_batch
trainset = datasets.MNIST(root='.', train=True, download=True, transform=transforms.ToTensor())
testset = datasets.MNIST(root='.', train=False, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(trainset,
batch_size=args.batch_size,
shuffle=True,
collate_fn=batcher,
num_workers=6)
test_loader = DataLoader(testset,
batch_size=args.batch_size,
shuffle=False,
collate_fn=batcher,
num_workers=6)
class MoNet(nn.Module):
def __init__(self,
n_kernels,
in_feats,
hiddens,
out_feats):
super(MoNet, self).__init__()
self.pool = nn.MaxPool1d(2)
self.layers = nn.ModuleList()
self.readout = MaxPooling()
# Input layer
self.layers.append(
GMMConv(in_feats, hiddens[0], 2, n_kernels))
# Hidden layer
for i in range(1, len(hiddens)):
self.layers.append(GMMConv(hiddens[i - 1], hiddens[i], 2, n_kernels))
self.cls = nn.Sequential(
nn.Linear(hiddens[-1], out_feats),
nn.LogSoftmax()
)
def forward(self, g_arr, feat):
for g, layer in zip(g_arr, self.layers):
u = g.edata['u'].to(feat.device)
feat = self.pool(layer(g, feat, u).transpose(-1, -2).unsqueeze(0))\
.squeeze(0).transpose(-1, -2)
return self.cls(self.readout(g_arr[-1], feat))
class ChebNet(nn.Module):
def __init__(self,
k,
in_feats,
hiddens,
out_feats):
super(ChebNet, self).__init__()
self.pool = nn.MaxPool1d(2)
self.layers = nn.ModuleList()
self.readout = MaxPooling()
# Input layer
self.layers.append(
ChebConv(in_feats, hiddens[0], k))
for i in range(1, len(hiddens)):
self.layers.append(
ChebConv(hiddens[i - 1], hiddens[i], k))
self.cls = nn.Sequential(
nn.Linear(hiddens[-1], out_feats),
nn.LogSoftmax()
)
def forward(self, g_arr, feat):
for g, layer in zip(g_arr, self.layers):
feat = self.pool(layer(g, feat, [2] * g.batch_size).transpose(-1, -2).unsqueeze(0))\
.squeeze(0).transpose(-1, -2)
return self.cls(self.readout(g_arr[-1], feat))
if args.gpu == -1:
device = torch.device('cpu')
else:
device = torch.device(args.gpu)
if args.model == 'chebnet':
model = ChebNet(2, 1, [32, 64, 128, 256], 10)
else:
model = MoNet(10, 1, [32, 64, 128, 256], 10)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
log_interval = 50
for epoch in range(10):
print('epoch {} starts'.format(epoch))
model.train()
hit, tot = 0, 0
loss_accum = 0
for i, (g, x, y) in enumerate(train_loader):
x = x.to(device)
y = y.to(device)
out = model(g, x)
hit += (out.max(-1)[1] == y).sum().item()
tot += len(y)
loss = F.nll_loss(out, y)
loss_accum += loss.item()
if (i + 1) % log_interval == 0:
print('loss: {}, acc: {}'.format(loss_accum / log_interval, hit / tot))
hit, tot = 0, 0
loss_accum = 0
optimizer.zero_grad()
loss.backward()
optimizer.step()
model.eval()
hit, tot = 0, 0
for g, x, y in test_loader:
x = x.to(device)
y = y.to(device)
out = model(g, x)
hit += (out.max(-1)[1] == y).sum().item()
tot += len(y)
print('test acc: ', hit / tot)
MoNet
=====
- paper link: [Geometric deep learning on graphs and manifolds using mixture model CNNs](https://arxiv.org/pdf/1611.08402.pdf)
Dependencies
============
- pytorch 1.1+
Results
=======
Node classification on citation networks:
- Cora: ~0.816
- Pubmed: ~0.763
Image classification on MNIST:
- please refer to [model_zoo/geometric](../model_zoo/geometric).
\ No newline at end of file
import argparse
import time
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from dgl.nn.pytorch.conv import GMMConv
class MoNet(nn.Module):
def __init__(self,
g,
in_feats,
n_hidden,
out_feats,
n_layers,
dim,
n_kernels,
dropout):
super(MoNet, self).__init__()
self.g = g
self.layers = nn.ModuleList()
self.pseudo_proj = nn.ModuleList()
# Input layer
self.layers.append(
GMMConv(in_feats, n_hidden, dim, n_kernels))
self.pseudo_proj.append(
nn.Sequential(nn.Linear(2, dim), nn.Tanh()))
# Hidden layer
for _ in range(n_layers - 1):
self.layers.append(GMMConv(n_hidden, n_hidden, dim, n_kernels))
self.pseudo_proj.append(
nn.Sequential(nn.Linear(2, dim), nn.Tanh()))
# Output layer
self.layers.append(GMMConv(n_hidden, out_feats, dim, n_kernels))
self.pseudo_proj.append(
nn.Sequential(nn.Linear(2, dim), nn.Tanh()))
self.dropout = nn.Dropout(dropout)
def forward(self, feat, pseudo):
h = feat
for i in range(len(self.layers)):
if i != 0:
h = self.dropout(h)
h = self.layers[i](
self.g, h, self.pseudo_proj[i](pseudo))
return h
def evaluate(model, features, pseudo, labels, mask):
model.eval()
with torch.no_grad():
logits = model(features, pseudo)
logits = logits[mask]
labels = labels[mask]
_, indices = torch.max(logits, dim=1)
correct = torch.sum(indices == labels)
return correct.item() * 1.0 / len(labels)
def main(args):
# load and preprocess dataset
data = load_data(args)
features = torch.FloatTensor(data.features)
labels = torch.LongTensor(data.labels)
if False: #hasattr(torch, 'BoolTensor'):
train_mask = torch.BoolTensor(data.train_mask)
val_mask = torch.BoolTensor(data.val_mask)
test_mask = torch.BoolTensor(data.test_mask)
else:
train_mask = torch.ByteTensor(data.train_mask)
val_mask = torch.ByteTensor(data.val_mask)
test_mask = torch.ByteTensor(data.test_mask)
in_feats = features.shape[1]
n_classes = data.num_labels
n_edges = data.graph.number_of_edges()
print("""----Data statistics------'
#Edges %d
#Classes %d
#Train samples %d
#Val samples %d
#Test samples %d""" %
(n_edges, n_classes,
train_mask.sum().item(),
val_mask.sum().item(),
test_mask.sum().item()))
if args.gpu < 0:
cuda = False
else:
cuda = True
torch.cuda.set_device(args.gpu)
features = features.cuda()
labels = labels.cuda()
train_mask = train_mask.cuda()
val_mask = val_mask.cuda()
test_mask = test_mask.cuda()
print("use cuda:", args.gpu)
# graph preprocess and calculate normalization factor
g = data.graph
g.remove_edges_from(nx.selfloop_edges(g))
g = DGLGraph(g)
n_edges = g.number_of_edges()
us, vs = g.edges()
pseudo = []
for i in range(g.number_of_edges()):
pseudo.append([
1 / np.sqrt(g.in_degree(us[i])),
1 / np.sqrt(g.in_degree(vs[i]))
])
pseudo = torch.Tensor(pseudo)
if cuda:
pseudo = pseudo.cuda()
# create MoNet model
model = MoNet(g,
in_feats,
args.n_hidden,
n_classes,
args.n_layers,
args.pseudo_dim,
args.n_kernels,
args.dropout
)
if cuda:
model.cuda()
loss_fcn = torch.nn.CrossEntropyLoss()
# use optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
# initialize graph
dur = []
for epoch in range(args.n_epochs):
model.train()
if epoch >= 3:
t0 = time.time()
# forward
logits = model(features, pseudo)
loss = loss_fcn(logits[train_mask], labels[train_mask])
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch >= 3:
dur.append(time.time() - t0)
acc = evaluate(model, features, pseudo, labels, val_mask)
print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
"ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
acc, n_edges / np.mean(dur) / 1000))
print()
acc = evaluate(model, features, pseudo, labels, test_mask)
print("Test Accuracy {:.4f}".format(acc))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='MoNet on citation network')
register_data_args(parser)
parser.add_argument("--dropout", type=float, default=0.5,
help="dropout probability")
parser.add_argument("--gpu", type=int, default=-1,
help="gpu")
parser.add_argument("--lr", type=float, default=1e-2,
help="learning rate")
parser.add_argument("--n-epochs", type=int, default=200,
help="number of training epochs")
parser.add_argument("--n-hidden", type=int, default=16,
help="number of hidden gcn units")
parser.add_argument("--n-layers", type=int, default=1,
help="number of hidden gcn layers")
parser.add_argument("--pseudo-dim", type=int, default=2,
help="Pseudo coordinate dimensions in GMMConv, 2 for cora and 3 for pubmed")
parser.add_argument("--n-kernels", type=int, default=3,
help="Number of kernels in GMMConv layer")
parser.add_argument("--weight-decay", type=float, default=5e-4,
help="Weight for L2 loss")
args = parser.parse_args()
print(args)
main(args)
@@ -10,6 +10,8 @@ https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip
import os
import numpy as np
from .. import backend as F
from .utils import download, extract_archive, get_download_dir, _get_dgl_url
from ..graph import DGLGraph
@@ -235,8 +237,7 @@ class GINDataset(object):
for g in self.graphs:
g.ndata['attr'] = np.zeros((
g.number_of_nodes(), len(label2idx)))
g.ndata['attr'][range(g.number_of_nodes(
)), [label2idx[nl.item()] for nl in g.ndata['label']]] = 1
g.ndata['attr'][:, [label2idx[F.as_scalar(nl)] for nl in g.ndata['label']]] = 1
# after load, get the #classes and #dim
self.gclasses = len(self.glabel_dict)
......
@@ -4,5 +4,22 @@
from .graphconv import GraphConv
from .relgraphconv import RelGraphConv
from .tagconv import TAGConv
from .gatconv import GATConv
from .sageconv import SAGEConv
from .gatedgraphconv import GatedGraphConv
from .chebconv import ChebConv
from .agnnconv import AGNNConv
from .appnpconv import APPNPConv
from .densegraphconv import DenseGraphConv
from .densesageconv import DenseSAGEConv
from .densechebconv import DenseChebConv
from .edgeconv import EdgeConv
from .ginconv import GINConv
from .gmmconv import GMMConv
from .nnconv import NNConv
from .sgconv import SGConv
__all__ = ['GraphConv', 'TAGConv', 'RelGraphConv']
__all__ = ['GraphConv', 'TAGConv', 'RelGraphConv', 'GATConv',
'SAGEConv', 'GatedGraphConv', 'ChebConv', 'AGNNConv',
'APPNPConv', 'DenseGraphConv', 'DenseSAGEConv', 'DenseChebConv',
'EdgeConv', 'GINConv', 'GMMConv', 'NNConv', 'SGConv']
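The newly exported layers can be used directly from the `conv` namespace; a quick smoke test with the PyTorch backend (the toy graph and feature sizes below are illustrative) might look like:

```python
import torch
import dgl
from dgl.nn.pytorch.conv import GATConv, SAGEConv

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3], [1, 2, 3, 0])   # a directed 4-cycle
feat = torch.randn(4, 8)

gat = GATConv(8, 16, num_heads=2)
print(gat(g, feat).shape)                 # (4, 2, 16): one slice per attention head

sage = SAGEConv(8, 16, aggregator_type='mean')
print(sage(g, feat).shape)                # (4, 16)
```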
"""MXNet Module for Attention-based Graph Neural Network layer"""
# pylint: disable= no-member, arguments-differ, invalid-name
import mxnet as mx
from mxnet.gluon import nn
from .... import function as fn
from ..softmax import edge_softmax
from ..utils import normalize
class AGNNConv(nn.Block):
r"""Attention-based Graph Neural Network layer from paper `Attention-based
Graph Neural Network for Semi-Supervised Learning
<https://arxiv.org/abs/1803.03735>`__.
.. math::
H^{l+1} = P H^{l}
where :math:`P` is computed as:
.. math::
P_{ij} = \mathrm{softmax}_i ( \beta \cdot \cos(h_i^l, h_j^l))
Parameters
----------
init_beta : float, optional
The :math:`\beta` in the formula.
learn_beta : bool, optional
If True, :math:`\beta` will be a learnable parameter.
"""
def __init__(self,
init_beta=1.,
learn_beta=True):
super(AGNNConv, self).__init__()
with self.name_scope():
self.beta = self.params.get('beta',
shape=(1,),
grad_req='write' if learn_beta else 'null',
init=mx.init.Constant(init_beta))
def forward(self, graph, feat):
r"""Compute AGNN Layer.
Parameters
----------
graph : DGLGraph
The graph.
feat : mxnet.NDArray
The input feature of shape :math:`(N, *)`, where :math:`N` is the
number of nodes and :math:`*` can be any shape.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, *)` where :math:`*`
should be the same as input shape.
"""
graph = graph.local_var()
graph.ndata['h'] = feat
graph.ndata['norm_h'] = normalize(feat, p=2, axis=-1)
# compute cosine distance
graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
cos = graph.edata.pop('cos')
e = self.beta.data(feat.context) * cos
graph.edata['p'] = edge_softmax(graph, e)
graph.update_all(fn.u_mul_e('h', 'p', 'm'), fn.sum('m', 'h'))
return graph.ndata.pop('h')
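A minimal usage sketch of the AGNNConv block above, assuming DGL's MXNet backend is active; the toy graph and feature size are illustrative:

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(3)
g.add_edges([0, 1, 2], [1, 2, 0])
feat = mx.nd.random.uniform(shape=(3, 5))

conv = AGNNConv(init_beta=1., learn_beta=True)
conv.initialize(ctx=mx.cpu())
out = conv(g, feat)   # shape (3, 5): attention-weighted propagation keeps the feature size
```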
"""MXNet Module for APPNPConv"""
# pylint: disable= no-member, arguments-differ, invalid-name
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
from .... import function as fn
class APPNPConv(nn.Block):
r"""Approximate Personalized Propagation of Neural Predictions
layer from paper `Predict then Propagate: Graph Neural Networks
meet Personalized PageRank <https://arxiv.org/pdf/1810.05997.pdf>`__.
.. math::
H^{0} & = X
H^{t+1} & = (1-\alpha)\left(\hat{D}^{-1/2}
\hat{A} \hat{D}^{-1/2} H^{t} + \alpha H^{0}\right)
Parameters
----------
k : int
Number of iterations :math:`K`.
alpha : float
The teleport probability :math:`\alpha`.
edge_drop : float, optional
Dropout rate on edges that controls the
messages received by each node. Default: ``0``.
"""
def __init__(self,
k,
alpha,
edge_drop=0.):
super(APPNPConv, self).__init__()
self._k = k
self._alpha = alpha
with self.name_scope():
self.edge_drop = nn.Dropout(edge_drop)
def forward(self, graph, feat):
r"""Compute APPNP layer.
Parameters
----------
graph : DGLGraph
The graph.
feat : mx.NDArray
The input feature of shape :math:`(N, *)`, where :math:`N` is the
number of nodes and :math:`*` can be any shape.
Returns
-------
mx.NDArray
The output feature of shape :math:`(N, *)` where :math:`*`
should be the same as input shape.
"""
graph = graph.local_var()
norm = mx.nd.power(mx.nd.clip(
graph.in_degrees().astype(feat.dtype), a_min=1, a_max=float("inf")), -0.5)
shp = norm.shape + (1,) * (feat.ndim - 1)
norm = norm.reshape(shp).as_in_context(feat.context)
feat_0 = feat
for _ in range(self._k):
# normalization by src node
feat = feat * norm
graph.ndata['h'] = feat
graph.edata['w'] = self.edge_drop(
nd.ones((graph.number_of_edges(), 1), ctx=feat.context))
graph.update_all(fn.u_mul_e('h', 'w', 'm'),
fn.sum('m', 'h'))
feat = graph.ndata.pop('h')
# normalization by dst node
feat = feat * norm
feat = (1 - self._alpha) * feat + self._alpha * feat_0
return feat
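A small sketch of APPNPConv under the same assumptions (MXNet backend, illustrative sizes); note that the output has the same shape as the input feature:

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3], [1, 2, 3, 0])
feat = mx.nd.random.uniform(shape=(4, 8))

appnp = APPNPConv(k=3, alpha=0.1)
appnp.initialize(ctx=mx.cpu())
out = appnp(g, feat)   # shape (4, 8): k propagation steps mixed with the initial features
```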
"""MXNet Module for Chebyshev Spectral Graph Convolution layer"""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
from .... import laplacian_lambda_max, broadcast_nodes, function as fn
class ChebConv(nn.Block):
r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
Neural Networks on Graphs with Fast Localized Spectral Filtering
<https://arxiv.org/pdf/1606.09375.pdf>`__.
.. math::
h_i^{l+1} &= \sum_{k=0}^{K-1} W^{k, l}z_i^{k, l}
Z^{0, l} &= H^{l}
Z^{1, l} &= \hat{L} \cdot H^{l}
Z^{k, l} &= 2 \cdot \hat{L} \cdot Z^{k-1, l} - Z^{k-2, l}
\hat{L} &= 2\left(I - \hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2}\right)/\lambda_{max} - I
Parameters
----------
in_feats: int
Number of input features.
out_feats: int
Number of output features.
k : int
Chebyshev filter size.
bias : bool, optional
If True, adds a learnable bias to the output. Default: ``True``.
"""
def __init__(self,
in_feats,
out_feats,
k,
bias=True):
super(ChebConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
self._k = k
with self.name_scope():
self.fc = nn.Sequential()
for _ in range(k):
self.fc.add(
nn.Dense(out_feats, use_bias=False,
weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)),
in_units=in_feats)
)
if bias:
self.bias = self.params.get('bias', shape=(out_feats,),
init=mx.init.Zero())
else:
self.bias = None
def forward(self, graph, feat, lambda_max=None):
r"""Compute ChebNet layer.
Parameters
----------
graph : DGLGraph or BatchedDGLGraph
The graph.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
lambda_max : list or mxnet.NDArray or None, optional
A list (or tensor) of length :math:`B`, storing the largest eigenvalue
of the normalized Laplacian of each individual graph in ``graph``,
where :math:`B` is the batch size of the input graph. Default: None.
If None, this method computes the list by calling
``dgl.laplacian_lambda_max``.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
"""
with graph.local_scope():
degs = graph.in_degrees().astype('float32')
norm = mx.nd.power(mx.nd.clip(degs, a_min=1, a_max=float("inf")), -0.5)
norm = norm.expand_dims(-1).as_in_context(feat.context)
if lambda_max is None:
lambda_max = laplacian_lambda_max(graph)
if isinstance(lambda_max, list):
lambda_max = nd.array(lambda_max).as_in_context(feat.context)
if lambda_max.ndim == 1:
lambda_max = lambda_max.expand_dims(-1)
# broadcast from (B, 1) to (N, 1)
lambda_max = broadcast_nodes(graph, lambda_max)
# T0(X)
Tx_0 = feat
rst = self.fc[0](Tx_0)
# T1(X)
if self._k > 1:
graph.ndata['h'] = Tx_0 * norm
graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
h = graph.ndata.pop('h') * norm
# Λ = 2 * (I - D ^ -1/2 A D ^ -1/2) / lambda_max - I
# = - 2(D ^ -1/2 A D ^ -1/2) / lambda_max + (2 / lambda_max - 1) I
Tx_1 = -2. * h / lambda_max + Tx_0 * (2. / lambda_max - 1)
rst = rst + self.fc[1](Tx_1)
# Ti(x), i = 2...k
for i in range(2, self._k):
graph.ndata['h'] = Tx_1 * norm
graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
h = graph.ndata.pop('h') * norm
# Tx_k = 2 * Λ * Tx_(k-1) - Tx_(k-2)
# = - 4(D ^ -1/2 A D ^ -1/2) / lambda_max Tx_(k-1) +
# (4 / lambda_max - 2) Tx_(k-1) -
# Tx_(k-2)
Tx_2 = -4. * h / lambda_max + Tx_1 * (4. / lambda_max - 2) - Tx_0
rst = rst + self.fc[i](Tx_2)
Tx_1, Tx_0 = Tx_2, Tx_1
# add bias
if self.bias is not None:
rst = rst + self.bias.data(feat.context)
return rst
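A toy forward pass through the MXNet ChebConv above (MXNet backend assumed; sizes illustrative). Passing the largest eigenvalue explicitly skips the call to `dgl.laplacian_lambda_max`:

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3], [1, 2, 3, 0])
feat = mx.nd.random.uniform(shape=(4, 8))

cheb = ChebConv(in_feats=8, out_feats=16, k=3)
cheb.initialize(ctx=mx.cpu())
out = cheb(g, feat, [2.0])   # lambda_max passed positionally; output shape (4, 16)
```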
"""MXNet Module for DenseChebConv"""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
class DenseChebConv(nn.Block):
r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
Neural Networks on Graphs with Fast Localized Spectral Filtering
<https://arxiv.org/pdf/1606.09375.pdf>`__.
We recommend using this module when applying ChebConv operations on dense
graphs / k-hop graphs.
Parameters
----------
in_feats: int
Number of input features.
out_feats: int
Number of output features.
k : int
Chebyshev filter size.
bias : bool, optional
If True, adds a learnable bias to the output. Default: ``True``.
See also
--------
ChebConv
"""
def __init__(self,
in_feats,
out_feats,
k,
bias=True):
super(DenseChebConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
self._k = k
with self.name_scope():
self.fc = nn.Sequential()
for _ in range(k):
self.fc.add(
nn.Dense(out_feats, in_units=in_feats, use_bias=False,
weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)))
)
if bias:
self.bias = self.params.get('bias', shape=(out_feats,),
init=mx.init.Zero())
else:
self.bias = None
def forward(self, adj, feat, lambda_max=None):
r"""Compute (Dense) Chebyshev Spectral Graph Convolution layer.
Parameters
----------
adj : mxnet.NDArray
The adjacency matrix of the graph to apply Graph Convolution on,
should be of shape :math:`(N, N)`, where a row represents the destination
and a column represents the source.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
lambda_max : float or None, optional
A float value indicating the largest eigenvalue of the given graph.
Default: None.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
"""
A = adj.astype(feat.dtype).as_in_context(feat.context)
num_nodes = A.shape[0]
in_degree = 1. / nd.clip(A.sum(axis=1), 1, float('inf')).sqrt()
D_invsqrt = nd.diag(in_degree)
I = nd.eye(num_nodes, ctx=A.context)
L = I - nd.dot(D_invsqrt, nd.dot(A, D_invsqrt))
if lambda_max is None:
# NOTE(zihao): this only works for undirected graphs, since syevd assumes a symmetric matrix.
lambda_max = (nd.linalg.syevd(L)[1]).max()
L_hat = 2 * L / lambda_max - I
Z = [nd.eye(num_nodes, ctx=A.context)]
Zh = self.fc[0](feat)
for i in range(1, self._k):
if i == 1:
Z.append(L_hat)
else:
Z.append(2 * nd.dot(L_hat, Z[-1]) - Z[-2])
Zh = Zh + nd.dot(Z[i], self.fc[i](feat))
if self.bias is not None:
Zh = Zh + self.bias.data(feat.context)
return Zh
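A sketch of DenseChebConv on a small dense adjacency matrix (MXNet backend assumed; the graph and sizes are illustrative). The largest eigenvalue is passed positionally here; if omitted, it is computed with `syevd` on the dense Laplacian:

```python
import mxnet as mx
from mxnet import nd

adj = nd.array([[0, 1, 1, 0],
                [1, 0, 1, 0],
                [1, 1, 0, 1],
                [0, 0, 1, 0]])        # symmetric adjacency of a 4-node graph
feat = nd.random.uniform(shape=(4, 8))

dense_cheb = DenseChebConv(in_feats=8, out_feats=16, k=3)
dense_cheb.initialize(ctx=mx.cpu())
out = dense_cheb(adj, feat, 2.0)      # shape (4, 16)
```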
"""MXNet Module for DenseGraphConv"""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
class DenseGraphConv(nn.Block):
"""Graph Convolutional Network layer where the graph structure
is given by an adjacency matrix.
We recommend using this module when applying graph convolution
on dense graphs / k-hop graphs.
Parameters
----------
in_feats : int
Input feature size.
out_feats : int
Output feature size.
norm : bool
If True, the normalizer :math:`c_{ij}` is applied. Default: ``True``.
bias : bool
If True, adds a learnable bias to the output. Default: ``True``.
activation : callable activation function/layer or None, optional
If not None, applies an activation function to the updated node features.
Default: ``None``.
See also
--------
GraphConv
"""
def __init__(self,
in_feats,
out_feats,
norm=True,
bias=True,
activation=None):
super(DenseGraphConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
self._norm = norm
with self.name_scope():
self.weight = self.params.get('weight', shape=(in_feats, out_feats),
init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
if bias:
self.bias = self.params.get('bias', shape=(out_feats,),
init=mx.init.Zero())
else:
self.bias = None
self._activation = activation
def forward(self, adj, feat):
r"""Compute (Dense) Graph Convolution layer.
Parameters
----------
adj : mxnet.NDArray
The adjacency matrix of the graph to apply Graph Convolution on,
should be of shape :math:`(N, N)`, where a row represents the destination
and a column represents the source.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
"""
adj = adj.astype(feat.dtype).as_in_context(feat.context)
if self._norm:
in_degrees = adj.sum(axis=1)
norm = nd.power(in_degrees, -0.5)
shp = norm.shape + (1,) * (feat.ndim - 1)
norm = norm.reshape(shp).as_in_context(feat.context)
feat = feat * norm
if self._in_feats > self._out_feats:
# mult W first to reduce the feature size for aggregation.
feat = nd.dot(feat, self.weight.data(feat.context))
rst = nd.dot(adj, feat)
else:
# aggregate first then mult W
rst = nd.dot(adj, feat)
rst = nd.dot(rst, self.weight.data(feat.context))
if self._norm:
rst = rst * norm
if self.bias is not None:
rst = rst + self.bias.data(feat.context)
if self._activation is not None:
rst = self._activation(rst)
return rst
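DenseGraphConv follows the same calling convention; a brief sketch (MXNet backend, illustrative sizes):

```python
import mxnet as mx
from mxnet import nd

adj = nd.array([[0, 1, 0],
                [1, 0, 1],
                [0, 1, 0]])           # a 3-node path graph
feat = nd.random.uniform(shape=(3, 4))

dense_gcn = DenseGraphConv(in_feats=4, out_feats=2)
dense_gcn.initialize(ctx=mx.cpu())
out = dense_gcn(adj, feat)            # shape (3, 2)
```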
"""MXNet Module for DenseGraphSAGE"""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
class DenseSAGEConv(nn.Block):
"""GraphSAGE layer where the graph structure is given by an
adjacency matrix.
We recommend using this module when applying GraphSAGE operations
on dense graphs / k-hop graphs.
Note that only the gcn aggregator is supported in DenseSAGEConv.
Parameters
----------
in_feats : int
Input feature size.
out_feats : int
Output feature size.
feat_drop : float, optional
Dropout rate on features. Default: 0.
bias : bool
If True, adds a learnable bias to the output. Default: ``True``.
norm : callable activation function/layer or None, optional
If not None, applies normalization to the updated node features.
activation : callable activation function/layer or None, optional
If not None, applies an activation function to the updated node features.
Default: ``None``.
See also
--------
SAGEConv
"""
def __init__(self,
in_feats,
out_feats,
feat_drop=0.,
bias=True,
norm=None,
activation=None):
super(DenseSAGEConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
self._norm = norm
with self.name_scope():
self.feat_drop = nn.Dropout(feat_drop)
self.activation = activation
self.fc = nn.Dense(out_feats, in_units=in_feats, use_bias=bias,
weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)))
def forward(self, adj, feat):
r"""Compute (Dense) Graph SAGE layer.
Parameters
----------
adj : mxnet.NDArray
The adjacency matrix of the graph to apply Graph Convolution on,
should be of shape :math:`(N, N)`, where a row represents the destination
and a column represents the source.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
"""
adj = adj.astype(feat.dtype).as_in_context(feat.context)
feat = self.feat_drop(feat)
in_degrees = adj.sum(axis=1, keepdims=True)
h_neigh = (nd.dot(adj, feat) + feat) / (in_degrees + 1)
rst = self.fc(h_neigh)
# activation
if self.activation is not None:
rst = self.activation(rst)
# normalization
if self._norm is not None:
rst = self._norm(rst)
return rst
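And likewise for DenseSAGEConv, which uses gcn-style mean aggregation over the dense adjacency (MXNet backend, illustrative sizes):

```python
import mxnet as mx
from mxnet import nd

adj = nd.array([[0, 1, 1],
                [1, 0, 0],
                [1, 0, 0]])           # a 3-node star graph centred at node 0
feat = nd.random.uniform(shape=(3, 4))

dense_sage = DenseSAGEConv(in_feats=4, out_feats=2)
dense_sage.initialize(ctx=mx.cpu())
out = dense_sage(adj, feat)           # shape (3, 2)
```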
"""MXNet Module for EdgeConv Layer"""
# pylint: disable= no-member, arguments-differ, invalid-name
import mxnet as mx
from mxnet.gluon import nn
from .... import function as fn
class EdgeConv(nn.Block):
r"""EdgeConv layer.
Introduced in "`Dynamic Graph CNN for Learning on Point Clouds
<https://arxiv.org/pdf/1801.07829>`__". Can be described as follows:
.. math::
x_i^{(l+1)} = \max_{j \in \mathcal{N}(i)} \mathrm{ReLU}(
\Theta \cdot (x_j^{(l)} - x_i^{(l)}) + \Phi \cdot x_i^{(l)})
where :math:`\mathcal{N}(i)` is the set of neighbors of :math:`i`.
Parameters
----------
in_feat : int
Input feature size.
out_feat : int
Output feature size.
batch_norm : bool
Whether to include batch normalization on messages.
"""
def __init__(self,
in_feat,
out_feat,
batch_norm=False):
super(EdgeConv, self).__init__()
self.batch_norm = batch_norm
with self.name_scope():
self.theta = nn.Dense(out_feat, in_units=in_feat,
weight_initializer=mx.init.Xavier())
self.phi = nn.Dense(out_feat, in_units=in_feat,
weight_initializer=mx.init.Xavier())
if batch_norm:
self.bn = nn.BatchNorm(in_channels=out_feat)
def message(self, edges):
r"""The message computation function
"""
theta_x = self.theta(edges.dst['x'] - edges.src['x'])
phi_x = self.phi(edges.src['x'])
return {'e': theta_x + phi_x}
def forward(self, g, h):
r"""Forward computation
Parameters
----------
g : DGLGraph
The graph.
h : mxnet.NDArray
:math:`(N, D)` where :math:`N` is the number of nodes and
:math:`D` is the number of feature dimensions.
Returns
-------
mxnet.NDArray
New node features.
"""
with g.local_scope():
g.ndata['x'] = h
if not self.batch_norm:
g.update_all(self.message, fn.max('e', 'x'))
else:
g.apply_edges(self.message)
g.edata['e'] = self.bn(g.edata['e'])
g.update_all(fn.copy_e('e', 'm'), fn.max('m', 'x'))
return g.ndata['x']
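A usage sketch for EdgeConv (MXNet backend assumed). In practice the graph would be a k-NN graph over point coordinates; a toy graph stands in here and the sizes are illustrative:

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3, 0], [1, 2, 3, 0, 2])
h = mx.nd.random.uniform(shape=(4, 3))   # e.g. 3-D point coordinates

edge_conv = EdgeConv(in_feat=3, out_feat=16)
edge_conv.initialize(ctx=mx.cpu())
out = edge_conv(g, h)                    # shape (4, 16), max-aggregated over in-edges
```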
"""MXNet modules for graph attention networks(GAT)."""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet.gluon import nn
from mxnet.gluon.contrib.nn import Identity
from .... import function as fn
from ..softmax import edge_softmax
#pylint: enable=W0235
class GATConv(nn.Block):
r"""Apply `Graph Attention Network <https://arxiv.org/pdf/1710.10903.pdf>`__
over an input signal.
.. math::
h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} \alpha_{i,j} W^{(l)} h_j^{(l)}
where :math:`\alpha_{ij}` is the attention score between node :math:`i` and
node :math:`j`:
.. math::
\alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l})
e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right)
Parameters
----------
in_feats : int
Input feature size.
out_feats : int
Output feature size.
num_heads : int
Number of heads in Multi-Head Attention.
feat_drop : float, optional
Dropout rate on features. Default: ``0``.
attn_drop : float, optional
Dropout rate on attention weights. Default: ``0``.
negative_slope : float, optional
Negative slope of the LeakyReLU activation. Default: ``0.2``.
residual : bool, optional
If True, use residual connection.
activation : callable activation function/layer or None, optional.
If not None, applies an activation function to the updated node features.
Default: ``None``.
"""
def __init__(self,
in_feats,
out_feats,
num_heads,
feat_drop=0.,
attn_drop=0.,
negative_slope=0.2,
residual=False,
activation=None):
super(GATConv, self).__init__()
self._num_heads = num_heads
self._in_feats = in_feats
self._out_feats = out_feats
with self.name_scope():
self.fc = nn.Dense(out_feats * num_heads, use_bias=False,
weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)),
in_units=in_feats)
self.attn_l = self.params.get('attn_l',
shape=(1, num_heads, out_feats),
init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
self.attn_r = self.params.get('attn_r',
shape=(1, num_heads, out_feats),
init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
self.feat_drop = nn.Dropout(feat_drop)
self.attn_drop = nn.Dropout(attn_drop)
self.leaky_relu = nn.LeakyReLU(negative_slope)
if residual:
if in_feats != out_feats:
self.res_fc = nn.Dense(out_feats * num_heads, use_bias=False,
weight_initializer=mx.init.Xavier(
magnitude=math.sqrt(2.0)),
in_units=in_feats)
else:
self.res_fc = Identity()
else:
self.res_fc = None
self.activation = activation
def forward(self, graph, feat):
r"""Compute graph attention network layer.
Parameters
----------
graph : DGLGraph
The graph.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
is the number of heads, and :math:`D_{out}` is size of output feature.
"""
graph = graph.local_var()
h = self.feat_drop(feat)
feat = self.fc(h).reshape(-1, self._num_heads, self._out_feats)
el = (feat * self.attn_l.data(feat.context)).sum(axis=-1).expand_dims(-1)
er = (feat * self.attn_r.data(feat.context)).sum(axis=-1).expand_dims(-1)
graph.ndata.update({'ft': feat, 'el': el, 'er': er})
# compute edge attention
graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
e = self.leaky_relu(graph.edata.pop('e'))
# compute softmax
graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
graph.update_all(fn.u_mul_e('ft', 'a', 'm'),
fn.sum('m', 'ft'))
rst = graph.ndata['ft']
# residual
if self.res_fc is not None:
resval = self.res_fc(h).reshape(h.shape[0], -1, self._out_feats)
rst = rst + resval
# activation
if self.activation:
rst = self.activation(rst)
return rst
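Finally, a usage sketch for the MXNet GATConv above (MXNet backend assumed; toy graph and sizes illustrative):

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3], [1, 2, 3, 0])
feat = mx.nd.random.uniform(shape=(4, 8))

gat = GATConv(in_feats=8, out_feats=16, num_heads=2)
gat.initialize(ctx=mx.cpu())
out = gat(g, feat)                       # shape (4, 2, 16): one slice per head
```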