Commit 9a0511c8 authored by Zihao Ye, committed by Minjie Wang

[NN] nn modules & examples update (#890)

* upd

* damn it

* fuck

* fuck pylint

* fudge

* remove some comments about MXNet

* upd

* upd

* damn it

* damn it

* fuck

* fuck

* upd

* upd

* pylint bastard

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd
parent 7f65199a
@@ -23,16 +23,16 @@ def train(args, net, trainloader, optimizer, criterion, epoch):
for pos, (graphs, labels) in zip(bar, trainloader):
# batch graphs will be shipped to device in forward part of model
labels = labels.to(args.device)
outputs = net(graphs)
feat = graphs.ndata['attr'].to(args.device)
outputs = net(graphs, feat)
loss = criterion(outputs, labels)
running_loss += loss.item()
# backprop
if optimizer is not None:
optimizer.zero_grad()
loss.backward()
optimizer.step()
optimizer.zero_grad()
loss.backward()
optimizer.step()
# report
bar.set_description('epoch-{}'.format(epoch))
@@ -50,15 +50,12 @@ def eval_net(args, net, dataloader, criterion):
total_loss = 0
total_correct = 0
# total_iters = len(dataloader)
for data in dataloader:
graphs, labels = data
feat = graphs.ndata['attr'].to(args.device)
labels = labels.to(args.device)
total += len(labels)
outputs = net(graphs)
outputs = net(graphs, feat)
_, predicted = torch.max(outputs.data, 1)
total_correct += (predicted == labels.data).sum().item()
@@ -99,8 +96,7 @@ def main(args):
args.num_layers, args.num_mlp_layers,
dataset.dim_nfeats, args.hidden_dim, dataset.gclasses,
args.final_dropout, args.learn_eps,
args.graph_pooling_type, args.neighbor_pooling_type,
args.device).to(args.device)
args.graph_pooling_type, args.neighbor_pooling_type).to(args.device)
criterion = nn.CrossEntropyLoss()  # default reduce is True
optimizer = optim.Adam(model.parameters(), lr=args.lr)
......
@@ -6,7 +6,6 @@ Simple reference implementation of GraphSAGE.
"""
import argparse
import time
import abc
import numpy as np
import networkx as nx
import torch
......
Geometric Deep Learning models
=========
This example shows how to use geometric deep learning models defined in `dgl.nn.pytorch.conv` for
graph classification.
Currently we support the following models:
- [ChebNet](https://arxiv.org/pdf/1606.09375.pdf)
- [MoNet](https://arxiv.org/pdf/1611.08402.pdf)
## Image Classification on MNIST
By transforming images into graphs, graph classification algorithms can
be applied to image classification problems.
### Usage
```bash
python mnist.py --model chebnet --gpu 0
python mnist.py --model monet --gpu 0
```
### Acknowledgement
We thank [Xavier Bresson](https://github.com/xbresson) for providing
code for graph coarsening algorithm and grid graph building in
[CE7454_2019 Labs](https://github.com/xbresson/CE7454_2019/tree/master/codes/labs_lecture14/lab01_ChebGCNs).
# author: xbresson
# code link: https://github.com/xbresson/CE7454_2019/blob/master/codes/labs_lecture14/lab01_ChebGCNs/lib/coarsening.py
import numpy as np
import scipy.sparse
import scipy.sparse.linalg
import sklearn.metrics
def laplacian(W, normalized=True):
"""Return graph Laplacian"""
# Degree matrix.
d = W.sum(axis=0)
# Laplacian matrix.
if not normalized:
D = scipy.sparse.diags(d.A.squeeze(), 0)
L = D - W
else:
d += np.spacing(np.array(0, W.dtype))
d = 1 / np.sqrt(d)
D = scipy.sparse.diags(d.A.squeeze(), 0)
I = scipy.sparse.identity(d.size, dtype=W.dtype)
L = I - D * W * D
assert np.abs(L - L.T).mean() < 1e-9
assert type(L) is scipy.sparse.csr.csr_matrix
return L
def rescale_L(L, lmax=2):
"""Rescale Laplacian eigenvalues to [-1,1]"""
M, M = L.shape
I = scipy.sparse.identity(M, format='csr', dtype=L.dtype)
L /= lmax / 2
L -= I
return L
def lmax_L(L):
"""Compute largest Laplacian eigenvalue"""
return scipy.sparse.linalg.eigsh(L, k=1, which='LM', return_eigenvectors=False)[0]
# graph coarsening with Heavy Edge Matching
def coarsen(A, levels):
graphs, parents = HEM(A, levels)
perms = compute_perm(parents)
laplacians = []
for i, A in enumerate(graphs):
M, M = A.shape
if i < levels:
A = perm_adjacency(A, perms[i])
A = A.tocsr()
A.eliminate_zeros()
Mnew, Mnew = A.shape
print('Layer {0}: M_{0} = |V| = {1} nodes ({2} added), |E| = {3} edges'.format(i, Mnew, Mnew - M, A.nnz // 2))
L = laplacian(A, normalized=True)
laplacians.append(L)
return laplacians, perms[0] if len(perms) > 0 else None
def HEM(W, levels, rid=None):
"""
Coarsen a graph multiple times using the Heavy Edge Matching (HEM).
Input
W: symmetric sparse weight (adjacency) matrix
levels: the number of coarsened graphs
Output
graph[0]: original graph of size N_1
graph[1]: coarser graph of size N_2 < N_1
graph[levels]: coarsest graph of Size N_levels < ... < N_2 < N_1
parents[i] is a vector of size N_i with entries ranging from 1 to N_{i+1}
which indicate the parents in the coarser graph[i+1]
nd_sz{i} is a vector of size N_i that contains the size of the supernode in the graph{i}
Note
if "graph" is a list of length k, then "parents" will be a list of length k-1
"""
N, N = W.shape
if rid is None:
rid = np.random.permutation(range(N))
ss = np.array(W.sum(axis=0)).squeeze()
rid = np.argsort(ss)
parents = []
degree = W.sum(axis=0) - W.diagonal()
graphs = []
graphs.append(W)
print('Heavy Edge Matching coarsening with Xavier version')
for _ in range(levels):
# CHOOSE THE WEIGHTS FOR THE PAIRING
# weights = ones(N,1) # metis weights
weights = degree # graclus weights
# weights = supernode_size # other possibility
weights = np.array(weights).squeeze()
# PAIR THE VERTICES AND CONSTRUCT THE ROOT VECTOR
idx_row, idx_col, val = scipy.sparse.find(W)
cc = idx_row
rr = idx_col
vv = val
# TO BE SPEEDUP
if not (list(cc) == list(np.sort(cc))):
tmp = cc
cc = rr
rr = tmp
cluster_id = HEM_one_level(cc, rr, vv, rid, weights) # cc is ordered
parents.append(cluster_id)
# COMPUTE THE EDGES WEIGHTS FOR THE NEW GRAPH
nrr = cluster_id[rr]
ncc = cluster_id[cc]
nvv = vv
Nnew = cluster_id.max() + 1
# CSR is more appropriate: row,val pairs appear multiple times
W = scipy.sparse.csr_matrix((nvv, (nrr, ncc)), shape=(Nnew, Nnew))
W.eliminate_zeros()
# Add new graph to the list of all coarsened graphs
graphs.append(W)
N, N = W.shape
# COMPUTE THE DEGREE (OMIT OR NOT SELF LOOPS)
degree = W.sum(axis=0)
# degree = W.sum(axis=0) - W.diagonal()
# CHOOSE THE ORDER IN WHICH VERTICES WILL BE VISTED AT THE NEXT PASS
# [~, rid]=sort(ss); # arthur strategy
# [~, rid]=sort(supernode_size); # thomas strategy
# rid=randperm(N); # metis/graclus strategy
ss = np.array(W.sum(axis=0)).squeeze()
rid = np.argsort(ss)
return graphs, parents
# Coarsen a graph given by rr,cc,vv. rr is assumed to be ordered
def HEM_one_level(rr, cc, vv, rid, weights):
nnz = rr.shape[0]
N = rr[nnz - 1] + 1
marked = np.zeros(N, np.bool)
rowstart = np.zeros(N, np.int32)
rowlength = np.zeros(N, np.int32)
cluster_id = np.zeros(N, np.int32)
oldval = rr[0]
count = 0
clustercount = 0
for ii in range(nnz):
rowlength[count] = rowlength[count] + 1
if rr[ii] > oldval:
oldval = rr[ii]
rowstart[count + 1] = ii
count = count + 1
for ii in range(N):
tid = rid[ii]
if not marked[tid]:
wmax = 0.0
rs = rowstart[tid]
marked[tid] = True
bestneighbor = -1
for jj in range(rowlength[tid]):
nid = cc[rs + jj]
if marked[nid]:
tval = 0.0
else:
# First approach
if 2 == 1:
tval = vv[rs + jj] * (1.0 / weights[tid] + 1.0 / weights[nid])
# Second approach
if 1 == 1:
Wij = vv[rs + jj]
Wii = vv[rowstart[tid]]
Wjj = vv[rowstart[nid]]
di = weights[tid]
dj = weights[nid]
tval = (2. * Wij + Wii + Wjj) * 1. / (di + dj + 1e-9)
if tval > wmax:
wmax = tval
bestneighbor = nid
cluster_id[tid] = clustercount
if bestneighbor > -1:
cluster_id[bestneighbor] = clustercount
marked[bestneighbor] = True
clustercount += 1
return cluster_id
def compute_perm(parents):
"""
Return a list of indices to reorder the adjacency and data matrices so
that the union of two neighbors from layer to layer forms a binary tree.
"""
# Order of last layer is random (chosen by the clustering algorithm).
indices = []
if len(parents) > 0:
M_last = max(parents[-1]) + 1
indices.append(list(range(M_last)))
for parent in parents[::-1]:
# Fake nodes go after real ones.
pool_singeltons = len(parent)
indices_layer = []
for i in indices[-1]:
indices_node = list(np.where(parent == i)[0])
assert 0 <= len(indices_node) <= 2
# Add a node to go with a singleton.
if len(indices_node) == 1:
indices_node.append(pool_singeltons)
pool_singeltons += 1
# Add two nodes as children of a singleton in the parent.
elif len(indices_node) == 0:
indices_node.append(pool_singeltons + 0)
indices_node.append(pool_singeltons + 1)
pool_singeltons += 2
indices_layer.extend(indices_node)
indices.append(indices_layer)
# Sanity checks.
for i, indices_layer in enumerate(indices):
M = M_last * 2 ** i
# Reduction by 2 at each layer (binary tree).
assert len(indices_layer) == M
# The new ordering does not omit any index.
assert sorted(indices_layer) == list(range(M))
return indices[::-1]
assert (compute_perm([np.array([4, 1, 1, 2, 2, 3, 0, 0, 3]), np.array([2, 1, 0, 1, 0])])
== [[3, 4, 0, 9, 1, 2, 5, 8, 6, 7, 10, 11], [2, 4, 1, 3, 0, 5], [0, 1, 2]])
def perm_adjacency(A, indices):
"""
Permute adjacency matrix, i.e. exchange node ids,
so that binary unions form the clustering tree.
"""
if indices is None:
return A
M, M = A.shape
Mnew = len(indices)
A = A.tocoo()
# Add Mnew - M isolated vertices.
rows = scipy.sparse.coo_matrix((Mnew - M, M), dtype=np.float32)
cols = scipy.sparse.coo_matrix((Mnew, Mnew - M), dtype=np.float32)
A = scipy.sparse.vstack([A, rows])
A = scipy.sparse.hstack([A, cols])
# Permute the rows and the columns.
perm = np.argsort(indices)
A.row = np.array(perm)[A.row]
A.col = np.array(perm)[A.col]
assert np.abs(A - A.T).mean() < 1e-8 # 1e-9
assert type(A) is scipy.sparse.coo.coo_matrix
return A
def perm_data(x, indices):
"""
Permute data matrix, i.e. exchange node ids,
so that binary unions form the clustering tree.
"""
if indices is None:
return x
N, M = x.shape
Mnew = len(indices)
assert Mnew >= M
xnew = np.empty((N, Mnew))
for i, j in enumerate(indices):
# Existing vertex, i.e. real data.
if j < M:
xnew[:, i] = x[:, j]
# Fake vertex because of singletons.
# They will stay 0 so that max pooling chooses the singleton.
# Or -infty ?
else:
xnew[:, i] = np.zeros(N)
return xnew
import torch as th
"""Compute x,y coordinate for nodes in the graph"""
eps = 1e-8
def get_coordinates(graphs, grid_side, coarsening_levels, perm):
rst = []
for l in range(coarsening_levels + 1):
xs, ys = [], []
for i in range(graphs[l].number_of_nodes()):
cnt = eps
x_accum = 0
y_accum = 0
for j in range(i * 2 ** l, (i + 1) * 2 ** l):
if perm[j] < grid_side ** 2:
x_accum += (perm[j] // grid_side)
y_accum += (perm[j] % grid_side)
cnt += 1
xs.append(x_accum / cnt)
ys.append(y_accum / cnt)
rst.append(th.cat([th.tensor(xs).view(-1, 1), th.tensor(ys).view(-1, 1)], -1))
return rst
"""Cartesian coordinate to polar coordinate"""
def z2polar(edges):
z = edges.dst['xy'] - edges.src['xy']
rho = th.norm(z, dim=-1, p=2)
x, y = z.unbind(dim=-1)
phi = th.atan2(y, x)
return {'u': th.cat([rho.unsqueeze(-1), phi.unsqueeze(-1)], -1)}
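A toy sketch of `z2polar`, assuming DGL's PyTorch backend is active (matching the `DGLGraph` usage in `mnist.py` below); the 3-node graph and coordinates are illustrative:

```python
import dgl
import torch as th
from coordinate import z2polar  # this file, saved as coordinate.py

g = dgl.DGLGraph()
g.add_nodes(3)
g.add_edges([0, 1, 2], [1, 2, 0])
g.ndata['xy'] = th.tensor([[0., 0.], [1., 0.], [0., 1.]])

g.apply_edges(z2polar)          # computes per-edge polar pseudo-coordinates
print(g.edata['u'])             # shape (3, 2): (rho, phi) for each edge
```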
# author: xbresson
# code link: https://github.com/xbresson/CE7454_2019/blob/master/codes/labs_lecture14/lab01_ChebGCNs/lib/grid_graph.py
import sklearn
import sklearn.metrics
import scipy.sparse, scipy.sparse.linalg # scipy.spatial.distance
import numpy as np
def grid_graph(grid_side,number_edges,metric):
"""Generate graph of a grid"""
z = grid(grid_side)
dist, idx = distance_sklearn_metrics(z, k=number_edges, metric=metric)
A = adjacency(dist, idx)
print("nb edges: ",A.nnz)
return A
def grid(m, dtype=np.float32):
"""Return coordinates of grid points"""
M = m**2
x = np.linspace(0,1,m, dtype=dtype)
y = np.linspace(0,1,m, dtype=dtype)
xx, yy = np.meshgrid(x, y)
z = np.empty((M,2), dtype)
z[:,0] = xx.reshape(M)
z[:,1] = yy.reshape(M)
return z
def distance_sklearn_metrics(z, k=4, metric='euclidean'):
"""Compute pairwise distances"""
#d = sklearn.metrics.pairwise.pairwise_distances(z, metric=metric, n_jobs=-2)
d = sklearn.metrics.pairwise.pairwise_distances(z, metric=metric, n_jobs=1)
# k-NN
idx = np.argsort(d)[:,1:k+1]
d.sort()
d = d[:,1:k+1]
return d, idx
def adjacency(dist, idx):
"""Return adjacency matrix of a kNN graph"""
M, k = dist.shape
assert idx.shape == (M, k)
assert dist.min() >= 0
assert dist.max() <= 1
# Pairwise distances
sigma2 = np.mean(dist[:,-1])**2
dist = np.exp(- dist**2 / sigma2)
# Weight matrix
I = np.arange(0, M).repeat(k)
J = idx.reshape(M*k)
V = dist.reshape(M*k)
W = scipy.sparse.coo_matrix((V, (I, J)), shape=(M, M))
# No self-connections
W.setdiag(0)
# Undirected graph
bigger = W.T > W
W = W - W.multiply(bigger) + W.T.multiply(bigger)
assert W.nnz % 2 == 0
assert np.abs(W - W.T).mean() < 1e-10
assert type(W) is scipy.sparse.csr.csr_matrix
return W
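A quick sketch of the grid-graph helper above (the grid size and number of neighbours are illustrative; `mnist.py` below uses a 28x28 grid with 8 neighbours):

```python
from grid_graph import grid_graph  # this file, saved as grid_graph.py

A = grid_graph(8, 4, 'euclidean')  # 8x8 grid, 4 nearest neighbours per node
print(A.shape)                     # (64, 64), symmetric kNN weight matrix
print(A.nnz)                       # number of stored edge entries (undirected graph)
```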
import argparse
import time
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from dgl.nn.pytorch.conv import ChebConv, GMMConv
from dgl.nn.pytorch.glob import MaxPooling
from grid_graph import grid_graph
from coarsening import coarsen
from coordinate import get_coordinates, z2polar
argparser = argparse.ArgumentParser("MNIST")
argparser.add_argument("--gpu", type=int, default=-1,
help="gpu id, use cpu if set to -1")
argparser.add_argument("--model", type=str, default="chebnet",
help="model to use, chebnet/monet")
argparser.add_argument("--batch-size", type=int, default=100,
help="batch size")
args = argparser.parse_args()
grid_side = 28
number_edges = 8
metric = 'euclidean'
A = grid_graph(28, 8, metric)
coarsening_levels = 4
L, perm = coarsen(A, coarsening_levels)
g_arr = [DGLGraph(csr) for csr in L]
coordinate_arr = get_coordinates(g_arr, grid_side, coarsening_levels, perm)
for g, coordinate_arr in zip(g_arr, coordinate_arr):
g.ndata['xy'] = coordinate_arr
g.apply_edges(z2polar)
def batcher(batch):
g_batch = [[] for _ in range(coarsening_levels + 1)]
x_batch = []
y_batch = []
for x, y in batch:
x = torch.cat([x.view(-1), x.new_zeros(928 - 28 ** 2)], 0)
x = x[perm]
x_batch.append(x)
y_batch.append(y)
for i in range(coarsening_levels + 1):
g_batch[i].append(g_arr[i])
x_batch = torch.cat(x_batch).unsqueeze(-1)
y_batch = torch.LongTensor(y_batch)
g_batch = [dgl.batch(g) for g in g_batch]
return g_batch, x_batch, y_batch
trainset = datasets.MNIST(root='.', train=True, download=True, transform=transforms.ToTensor())
testset = datasets.MNIST(root='.', train=False, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(trainset,
batch_size=args.batch_size,
shuffle=True,
collate_fn=batcher,
num_workers=6)
test_loader = DataLoader(testset,
batch_size=args.batch_size,
shuffle=False,
collate_fn=batcher,
num_workers=6)
class MoNet(nn.Module):
def __init__(self,
n_kernels,
in_feats,
hiddens,
out_feats):
super(MoNet, self).__init__()
self.pool = nn.MaxPool1d(2)
self.layers = nn.ModuleList()
self.readout = MaxPooling()
# Input layer
self.layers.append(
GMMConv(in_feats, hiddens[0], 2, n_kernels))
# Hidden layer
for i in range(1, len(hiddens)):
self.layers.append(GMMConv(hiddens[i - 1], hiddens[i], 2, n_kernels))
self.cls = nn.Sequential(
nn.Linear(hiddens[-1], out_feats),
nn.LogSoftmax()
)
def forward(self, g_arr, feat):
for g, layer in zip(g_arr, self.layers):
u = g.edata['u'].to(feat.device)
feat = self.pool(layer(g, feat, u).transpose(-1, -2).unsqueeze(0))\
.squeeze(0).transpose(-1, -2)
return self.cls(self.readout(g_arr[-1], feat))
class ChebNet(nn.Module):
def __init__(self,
k,
in_feats,
hiddens,
out_feats):
super(ChebNet, self).__init__()
self.pool = nn.MaxPool1d(2)
self.layers = nn.ModuleList()
self.readout = MaxPooling()
# Input layer
self.layers.append(
ChebConv(in_feats, hiddens[0], k))
for i in range(1, len(hiddens)):
self.layers.append(
ChebConv(hiddens[i - 1], hiddens[i], k))
self.cls = nn.Sequential(
nn.Linear(hiddens[-1], out_feats),
nn.LogSoftmax()
)
def forward(self, g_arr, feat):
for g, layer in zip(g_arr, self.layers):
feat = self.pool(layer(g, feat, [2] * g.batch_size).transpose(-1, -2).unsqueeze(0))\
.squeeze(0).transpose(-1, -2)
return self.cls(self.readout(g_arr[-1], feat))
if args.gpu == -1:
device = torch.device('cpu')
else:
device = torch.device(args.gpu)
if args.model == 'chebnet':
model = ChebNet(2, 1, [32, 64, 128, 256], 10)
else:
model = MoNet(10, 1, [32, 64, 128, 256], 10)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
log_interval = 50
for epoch in range(10):
print('epoch {} starts'.format(epoch))
model.train()
hit, tot = 0, 0
loss_accum = 0
for i, (g, x, y) in enumerate(train_loader):
x = x.to(device)
y = y.to(device)
out = model(g, x)
hit += (out.max(-1)[1] == y).sum().item()
tot += len(y)
loss = F.nll_loss(out, y)
loss_accum += loss.item()
if (i + 1) % log_interval == 0:
print('loss: {}, acc: {}'.format(loss_accum / log_interval, hit / tot))
hit, tot = 0, 0
loss_accum = 0
optimizer.zero_grad()
loss.backward()
optimizer.step()
model.eval()
hit, tot = 0, 0
for g, x, y in test_loader:
x = x.to(device)
y = y.to(device)
out = model(g, x)
hit += (out.max(-1)[1] == y).sum().item()
tot += len(y)
print('test acc: ', hit / tot)
MoNet
=====
- paper link: [Geometric deep learning on graphs and manifolds using mixture model CNNs](https://arxiv.org/pdf/1611.08402.pdf)
Dependencies
============
- pytorch 1.1+
Results
=======
Node classification on citation networks:
- Cora: ~0.816
- Pubmed: ~0.763
Image classification on MNIST:
- please refer to [model_zoo/geometric](../model_zoo/geometric).
\ No newline at end of file
import argparse
import time
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from dgl.nn.pytorch.conv import GMMConv
class MoNet(nn.Module):
def __init__(self,
g,
in_feats,
n_hidden,
out_feats,
n_layers,
dim,
n_kernels,
dropout):
super(MoNet, self).__init__()
self.g = g
self.layers = nn.ModuleList()
self.pseudo_proj = nn.ModuleList()
# Input layer
self.layers.append(
GMMConv(in_feats, n_hidden, dim, n_kernels))
self.pseudo_proj.append(
nn.Sequential(nn.Linear(2, dim), nn.Tanh()))
# Hidden layer
for _ in range(n_layers - 1):
self.layers.append(GMMConv(n_hidden, n_hidden, dim, n_kernels))
self.pseudo_proj.append(
nn.Sequential(nn.Linear(2, dim), nn.Tanh()))
# Output layer
self.layers.append(GMMConv(n_hidden, out_feats, dim, n_kernels))
self.pseudo_proj.append(
nn.Sequential(nn.Linear(2, dim), nn.Tanh()))
self.dropout = nn.Dropout(dropout)
def forward(self, feat, pseudo):
h = feat
for i in range(len(self.layers)):
if i != 0:
h = self.dropout(h)
h = self.layers[i](
self.g, h, self.pseudo_proj[i](pseudo))
return h
def evaluate(model, features, pseudo, labels, mask):
model.eval()
with torch.no_grad():
logits = model(features, pseudo)
logits = logits[mask]
labels = labels[mask]
_, indices = torch.max(logits, dim=1)
correct = torch.sum(indices == labels)
return correct.item() * 1.0 / len(labels)
def main(args):
# load and preprocess dataset
data = load_data(args)
features = torch.FloatTensor(data.features)
labels = torch.LongTensor(data.labels)
if False: #hasattr(torch, 'BoolTensor'):
train_mask = torch.BoolTensor(data.train_mask)
val_mask = torch.BoolTensor(data.val_mask)
test_mask = torch.BoolTensor(data.test_mask)
else:
train_mask = torch.ByteTensor(data.train_mask)
val_mask = torch.ByteTensor(data.val_mask)
test_mask = torch.ByteTensor(data.test_mask)
in_feats = features.shape[1]
n_classes = data.num_labels
n_edges = data.graph.number_of_edges()
print("""----Data statistics------'
#Edges %d
#Classes %d
#Train samples %d
#Val samples %d
#Test samples %d""" %
(n_edges, n_classes,
train_mask.sum().item(),
val_mask.sum().item(),
test_mask.sum().item()))
if args.gpu < 0:
cuda = False
else:
cuda = True
torch.cuda.set_device(args.gpu)
features = features.cuda()
labels = labels.cuda()
train_mask = train_mask.cuda()
val_mask = val_mask.cuda()
test_mask = test_mask.cuda()
print("use cuda:", args.gpu)
# graph preprocess and calculate normalization factor
g = data.graph
g.remove_edges_from(nx.selfloop_edges(g))
g = DGLGraph(g)
n_edges = g.number_of_edges()
us, vs = g.edges()
pseudo = []
for i in range(g.number_of_edges()):
pseudo.append([
1 / np.sqrt(g.in_degree(us[i])),
1 / np.sqrt(g.in_degree(vs[i]))
])
pseudo = torch.Tensor(pseudo)
if cuda:
pseudo = pseudo.cuda()
# create MoNet model
model = MoNet(g,
in_feats,
args.n_hidden,
n_classes,
args.n_layers,
args.pseudo_dim,
args.n_kernels,
args.dropout
)
if cuda:
model.cuda()
loss_fcn = torch.nn.CrossEntropyLoss()
# use optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
# initialize graph
dur = []
for epoch in range(args.n_epochs):
model.train()
if epoch >= 3:
t0 = time.time()
# forward
logits = model(features, pseudo)
loss = loss_fcn(logits[train_mask], labels[train_mask])
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch >= 3:
dur.append(time.time() - t0)
acc = evaluate(model, features, pseudo, labels, val_mask)
print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
"ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
acc, n_edges / np.mean(dur) / 1000))
print()
acc = evaluate(model, features, pseudo, labels, test_mask)
print("Test Accuracy {:.4f}".format(acc))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='MoNet on citation network')
register_data_args(parser)
parser.add_argument("--dropout", type=float, default=0.5,
help="dropout probability")
parser.add_argument("--gpu", type=int, default=-1,
help="gpu")
parser.add_argument("--lr", type=float, default=1e-2,
help="learning rate")
parser.add_argument("--n-epochs", type=int, default=200,
help="number of training epochs")
parser.add_argument("--n-hidden", type=int, default=16,
help="number of hidden gcn units")
parser.add_argument("--n-layers", type=int, default=1,
help="number of hidden gcn layers")
parser.add_argument("--pseudo-dim", type=int, default=2,
help="Pseudo coordinate dimensions in GMMConv, 2 for cora and 3 for pubmed")
parser.add_argument("--n-kernels", type=int, default=3,
help="Number of kernels in GMMConv layer")
parser.add_argument("--weight-decay", type=float, default=5e-4,
help="Weight for L2 loss")
args = parser.parse_args()
print(args)
main(args)
@@ -10,6 +10,8 @@ https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip
import os
import numpy as np
from .. import backend as F
from .utils import download, extract_archive, get_download_dir, _get_dgl_url
from ..graph import DGLGraph
@@ -235,8 +237,7 @@ class GINDataset(object):
for g in self.graphs:
g.ndata['attr'] = np.zeros((
g.number_of_nodes(), len(label2idx)))
g.ndata['attr'][range(g.number_of_nodes(
)), [label2idx[nl.item()] for nl in g.ndata['label']]] = 1
g.ndata['attr'][:, [label2idx[F.as_scalar(nl)] for nl in g.ndata['label']]] = 1
# after load, get the #classes and #dim
self.gclasses = len(self.glabel_dict)
......
@@ -4,5 +4,22 @@
from .graphconv import GraphConv
from .relgraphconv import RelGraphConv
from .tagconv import TAGConv
from .gatconv import GATConv
from .sageconv import SAGEConv
from .gatedgraphconv import GatedGraphConv
from .chebconv import ChebConv
from .agnnconv import AGNNConv
from .appnpconv import APPNPConv
from .densegraphconv import DenseGraphConv
from .densesageconv import DenseSAGEConv
from .densechebconv import DenseChebConv
from .edgeconv import EdgeConv
from .ginconv import GINConv
from .gmmconv import GMMConv
from .nnconv import NNConv
from .sgconv import SGConv
__all__ = ['GraphConv', 'TAGConv', 'RelGraphConv']
__all__ = ['GraphConv', 'TAGConv', 'RelGraphConv', 'GATConv',
'SAGEConv', 'GatedGraphConv', 'ChebConv', 'AGNNConv',
'APPNPConv', 'DenseGraphConv', 'DenseSAGEConv', 'DenseChebConv',
'EdgeConv', 'GINConv', 'GMMConv', 'NNConv', 'SGConv']
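The newly exported layers can be used directly from the `conv` namespace; a quick smoke test with the PyTorch backend (the toy graph and feature sizes below are illustrative) might look like:

```python
import torch
import dgl
from dgl.nn.pytorch.conv import GATConv, SAGEConv

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3], [1, 2, 3, 0])   # a directed 4-cycle
feat = torch.randn(4, 8)

gat = GATConv(8, 16, num_heads=2)
print(gat(g, feat).shape)                 # (4, 2, 16): one slice per attention head

sage = SAGEConv(8, 16, aggregator_type='mean')
print(sage(g, feat).shape)                # (4, 16)
```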
"""MXNet Module for Attention-based Graph Neural Network layer"""
# pylint: disable= no-member, arguments-differ, invalid-name
import mxnet as mx
from mxnet.gluon import nn
from .... import function as fn
from ..softmax import edge_softmax
from ..utils import normalize
class AGNNConv(nn.Block):
r"""Attention-based Graph Neural Network layer from paper `Attention-based
Graph Neural Network for Semi-Supervised Learning
<https://arxiv.org/abs/1803.03735>`__.
.. math::
H^{l+1} = P H^{l}
where :math:`P` is computed as:
.. math::
P_{ij} = \mathrm{softmax}_i ( \beta \cdot \cos(h_i^l, h_j^l))
Parameters
----------
init_beta : float, optional
The :math:`\beta` in the formula.
learn_beta : bool, optional
If True, :math:`\beta` will be a learnable parameter.
"""
def __init__(self,
init_beta=1.,
learn_beta=True):
super(AGNNConv, self).__init__()
with self.name_scope():
self.beta = self.params.get('beta',
shape=(1,),
grad_req='write' if learn_beta else 'null',
init=mx.init.Constant(init_beta))
def forward(self, graph, feat):
r"""Compute AGNN Layer.
Parameters
----------
graph : DGLGraph
The graph.
feat : mxnet.NDArray
The input feature of shape :math:`(N, *)`, where :math:`N` is the
number of nodes and :math:`*` can be any shape.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, *)` where :math:`*`
should be the same as input shape.
"""
graph = graph.local_var()
graph.ndata['h'] = feat
graph.ndata['norm_h'] = normalize(feat, p=2, axis=-1)
# compute cosine distance
graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
cos = graph.edata.pop('cos')
e = self.beta.data(feat.context) * cos
graph.edata['p'] = edge_softmax(graph, e)
graph.update_all(fn.u_mul_e('h', 'p', 'm'), fn.sum('m', 'h'))
return graph.ndata.pop('h')
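A minimal usage sketch of the AGNNConv block above, assuming DGL's MXNet backend is active; the toy graph and feature size are illustrative:

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(3)
g.add_edges([0, 1, 2], [1, 2, 0])
feat = mx.nd.random.uniform(shape=(3, 5))

conv = AGNNConv(init_beta=1., learn_beta=True)
conv.initialize(ctx=mx.cpu())
out = conv(g, feat)   # shape (3, 5): attention-weighted propagation keeps the feature size
```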
"""MXNet Module for APPNPConv"""
# pylint: disable= no-member, arguments-differ, invalid-name
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
from .... import function as fn
class APPNPConv(nn.Block):
r"""Approximate Personalized Propagation of Neural Predictions
layer from paper `Predict then Propagate: Graph Neural Networks
meet Personalized PageRank <https://arxiv.org/pdf/1810.05997.pdf>`__.
.. math::
H^{0} & = X
H^{t+1} & = (1-\alpha)\left(\hat{D}^{-1/2}
\hat{A} \hat{D}^{-1/2} H^{t} + \alpha H^{0}\right)
Parameters
----------
k : int
Number of iterations :math:`K`.
alpha : float
The teleport probability :math:`\alpha`.
edge_drop : float, optional
Dropout rate on edges that controls the
messages received by each node. Default: ``0``.
"""
def __init__(self,
k,
alpha,
edge_drop=0.):
super(APPNPConv, self).__init__()
self._k = k
self._alpha = alpha
with self.name_scope():
self.edge_drop = nn.Dropout(edge_drop)
def forward(self, graph, feat):
r"""Compute APPNP layer.
Parameters
----------
graph : DGLGraph
The graph.
feat : mx.NDArray
The input feature of shape :math:`(N, *)`, where :math:`N` is the
number of nodes and :math:`*` can be any shape.
Returns
-------
mx.NDArray
The output feature of shape :math:`(N, *)` where :math:`*`
should be the same as input shape.
"""
graph = graph.local_var()
norm = mx.nd.power(mx.nd.clip(
graph.in_degrees().astype(feat.dtype), a_min=1, a_max=float("inf")), -0.5)
shp = norm.shape + (1,) * (feat.ndim - 1)
norm = norm.reshape(shp).as_in_context(feat.context)
feat_0 = feat
for _ in range(self._k):
# normalization by src node
feat = feat * norm
graph.ndata['h'] = feat
graph.edata['w'] = self.edge_drop(
nd.ones((graph.number_of_edges(), 1), ctx=feat.context))
graph.update_all(fn.u_mul_e('h', 'w', 'm'),
fn.sum('m', 'h'))
feat = graph.ndata.pop('h')
# normalization by dst node
feat = feat * norm
feat = (1 - self._alpha) * feat + self._alpha * feat_0
return feat
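A small sketch of APPNPConv under the same assumptions (MXNet backend, illustrative sizes); note that the output has the same shape as the input feature:

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3], [1, 2, 3, 0])
feat = mx.nd.random.uniform(shape=(4, 8))

appnp = APPNPConv(k=3, alpha=0.1)
appnp.initialize(ctx=mx.cpu())
out = appnp(g, feat)   # shape (4, 8): k propagation steps mixed with the initial features
```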
"""MXNet Module for Chebyshev Spectral Graph Convolution layer"""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
from .... import laplacian_lambda_max, broadcast_nodes, function as fn
class ChebConv(nn.Block):
r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
Neural Networks on Graphs with Fast Localized Spectral Filtering
<https://arxiv.org/pdf/1606.09375.pdf>`__.
.. math::
h_i^{l+1} &= \sum_{k=0}^{K-1} W^{k, l}z_i^{k, l}
Z^{0, l} &= H^{l}
Z^{1, l} &= \hat{L} \cdot H^{l}
Z^{k, l} &= 2 \cdot \hat{L} \cdot Z^{k-1, l} - Z^{k-2, l}
\hat{L} &= 2\left(I - \hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2}\right)/\lambda_{max} - I
Parameters
----------
in_feats: int
Number of input features.
out_feats: int
Number of output features.
k : int
Chebyshev filter size.
bias : bool, optional
If True, adds a learnable bias to the output. Default: ``True``.
"""
def __init__(self,
in_feats,
out_feats,
k,
bias=True):
super(ChebConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
self._k = k
with self.name_scope():
self.fc = nn.Sequential()
for _ in range(k):
self.fc.add(
nn.Dense(out_feats, use_bias=False,
weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)),
in_units=in_feats)
)
if bias:
self.bias = self.params.get('bias', shape=(out_feats,),
init=mx.init.Zero())
else:
self.bias = None
def forward(self, graph, feat, lambda_max=None):
r"""Compute ChebNet layer.
Parameters
----------
graph : DGLGraph or BatchedDGLGraph
The graph.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
lambda_max : list or mxnet.NDArray or None, optional
A list (or tensor) of length :math:`B`, storing the largest eigenvalue
of the normalized Laplacian of each individual graph in ``graph``,
where :math:`B` is the batch size of the input graph. Default: None.
If None, this method computes the list by calling
``dgl.laplacian_lambda_max``.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
"""
with graph.local_scope():
degs = graph.in_degrees().astype('float32')
norm = mx.nd.power(mx.nd.clip(degs, a_min=1, a_max=float("inf")), -0.5)
norm = norm.expand_dims(-1).as_in_context(feat.context)
if lambda_max is None:
lambda_max = laplacian_lambda_max(graph)
if isinstance(lambda_max, list):
lambda_max = nd.array(lambda_max).as_in_context(feat.context)
if lambda_max.ndim == 1:
lambda_max = lambda_max.expand_dims(-1)
# broadcast from (B, 1) to (N, 1)
lambda_max = broadcast_nodes(graph, lambda_max)
# T0(X)
Tx_0 = feat
rst = self.fc[0](Tx_0)
# T1(X)
if self._k > 1:
graph.ndata['h'] = Tx_0 * norm
graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
h = graph.ndata.pop('h') * norm
# Λ = 2 * (I - D ^ -1/2 A D ^ -1/2) / lambda_max - I
# = - 2(D ^ -1/2 A D ^ -1/2) / lambda_max + (2 / lambda_max - 1) I
Tx_1 = -2. * h / lambda_max + Tx_0 * (2. / lambda_max - 1)
rst = rst + self.fc[1](Tx_1)
# Ti(x), i = 2...k
for i in range(2, self._k):
graph.ndata['h'] = Tx_1 * norm
graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
h = graph.ndata.pop('h') * norm
# Tx_k = 2 * Λ * Tx_(k-1) - Tx_(k-2)
# = - 4(D ^ -1/2 A D ^ -1/2) / lambda_max Tx_(k-1) +
# (4 / lambda_max - 2) Tx_(k-1) -
# Tx_(k-2)
Tx_2 = -4. * h / lambda_max + Tx_1 * (4. / lambda_max - 2) - Tx_0
rst = rst + self.fc[i](Tx_2)
Tx_1, Tx_0 = Tx_2, Tx_1
# add bias
if self.bias is not None:
rst = rst + self.bias.data(feat.context)
return rst
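A toy forward pass through the MXNet ChebConv above (MXNet backend assumed; sizes illustrative). Passing the largest eigenvalue explicitly skips the call to `dgl.laplacian_lambda_max`:

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3], [1, 2, 3, 0])
feat = mx.nd.random.uniform(shape=(4, 8))

cheb = ChebConv(in_feats=8, out_feats=16, k=3)
cheb.initialize(ctx=mx.cpu())
out = cheb(g, feat, [2.0])   # lambda_max passed positionally; output shape (4, 16)
```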
"""MXNet Module for DenseChebConv"""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
class DenseChebConv(nn.Block):
r"""Chebyshev Spectral Graph Convolution layer from paper `Convolutional
Neural Networks on Graphs with Fast Localized Spectral Filtering
<https://arxiv.org/pdf/1606.09375.pdf>`__.
We recommend using this module when applying ChebConv operations on dense
graphs / k-hop graphs.
Parameters
----------
in_feats: int
Number of input features.
out_feats: int
Number of output features.
k : int
Chebyshev filter size.
bias : bool, optional
If True, adds a learnable bias to the output. Default: ``True``.
See also
--------
ChebConv
"""
def __init__(self,
in_feats,
out_feats,
k,
bias=True):
super(DenseChebConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
self._k = k
with self.name_scope():
self.fc = nn.Sequential()
for _ in range(k):
self.fc.add(
nn.Dense(out_feats, in_units=in_feats, use_bias=False,
weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)))
)
if bias:
self.bias = self.params.get('bias', shape=(out_feats,),
init=mx.init.Zero())
else:
self.bias = None
def forward(self, adj, feat, lambda_max=None):
r"""Compute (Dense) Chebyshev Spectral Graph Convolution layer.
Parameters
----------
adj : mxnet.NDArray
The adjacency matrix of the graph to apply Graph Convolution on,
should be of shape :math:`(N, N)`, where a row represents the destination
and a column represents the source.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
lambda_max : float or None, optional
A float value indicating the largest eigenvalue of the given graph.
Default: None.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
"""
A = adj.astype(feat.dtype).as_in_context(feat.context)
num_nodes = A.shape[0]
in_degree = 1. / nd.clip(A.sum(axis=1), 1, float('inf')).sqrt()
D_invsqrt = nd.diag(in_degree)
I = nd.eye(num_nodes, ctx=A.context)
L = I - nd.dot(D_invsqrt, nd.dot(A, D_invsqrt))
if lambda_max is None:
# NOTE(zihao): this only works for undirected graphs, since syevd assumes a symmetric matrix.
lambda_max = (nd.linalg.syevd(L)[1]).max()
L_hat = 2 * L / lambda_max - I
Z = [nd.eye(num_nodes, ctx=A.context)]
Zh = self.fc[0](feat)
for i in range(1, self._k):
if i == 1:
Z.append(L_hat)
else:
Z.append(2 * nd.dot(L_hat, Z[-1]) - Z[-2])
Zh = Zh + nd.dot(Z[i], self.fc[i](feat))
if self.bias is not None:
Zh = Zh + self.bias.data(feat.context)
return Zh
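A sketch of DenseChebConv on a small dense adjacency matrix (MXNet backend assumed; the graph and sizes are illustrative). The largest eigenvalue is passed positionally here; if omitted, it is computed with `syevd` on the dense Laplacian:

```python
import mxnet as mx
from mxnet import nd

adj = nd.array([[0, 1, 1, 0],
                [1, 0, 1, 0],
                [1, 1, 0, 1],
                [0, 0, 1, 0]])        # symmetric adjacency of a 4-node graph
feat = nd.random.uniform(shape=(4, 8))

dense_cheb = DenseChebConv(in_feats=8, out_feats=16, k=3)
dense_cheb.initialize(ctx=mx.cpu())
out = dense_cheb(adj, feat, 2.0)      # shape (4, 16)
```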
"""MXNet Module for DenseGraphConv"""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
class DenseGraphConv(nn.Block):
"""Graph Convolutional Network layer where the graph structure
is given by an adjacency matrix.
We recommend using this module when applying graph convolution
on dense graphs / k-hop graphs.
Parameters
----------
in_feats : int
Input feature size.
out_feats : int
Output feature size.
norm : bool
If True, the normalizer :math:`c_{ij}` is applied. Default: ``True``.
bias : bool
If True, adds a learnable bias to the output. Default: ``True``.
activation : callable activation function/layer or None, optional
If not None, applies an activation function to the updated node features.
Default: ``None``.
See also
--------
GraphConv
"""
def __init__(self,
in_feats,
out_feats,
norm=True,
bias=True,
activation=None):
super(DenseGraphConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
self._norm = norm
with self.name_scope():
self.weight = self.params.get('weight', shape=(in_feats, out_feats),
init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
if bias:
self.bias = self.params.get('bias', shape=(out_feats,),
init=mx.init.Zero())
else:
self.bias = None
self._activation = activation
def forward(self, adj, feat):
r"""Compute (Dense) Graph Convolution layer.
Parameters
----------
adj : mxnet.NDArray
The adjacency matrix of the graph to apply Graph Convolution on,
should be of shape :math:`(N, N)`, where a row represents the destination
and a column represents the source.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
"""
adj = adj.astype(feat.dtype).as_in_context(feat.context)
if self._norm:
in_degrees = adj.sum(axis=1)
norm = nd.power(in_degrees, -0.5)
shp = norm.shape + (1,) * (feat.ndim - 1)
norm = norm.reshape(shp).as_in_context(feat.context)
feat = feat * norm
if self._in_feats > self._out_feats:
# mult W first to reduce the feature size for aggregation.
feat = nd.dot(feat, self.weight.data(feat.context))
rst = nd.dot(adj, feat)
else:
# aggregate first then mult W
rst = nd.dot(adj, feat)
rst = nd.dot(rst, self.weight.data(feat.context))
if self._norm:
rst = rst * norm
if self.bias is not None:
rst = rst + self.bias.data(feat.context)
if self._activation is not None:
rst = self._activation(rst)
return rst
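DenseGraphConv follows the same calling convention; a brief sketch (MXNet backend, illustrative sizes):

```python
import mxnet as mx
from mxnet import nd

adj = nd.array([[0, 1, 0],
                [1, 0, 1],
                [0, 1, 0]])           # a 3-node path graph
feat = nd.random.uniform(shape=(3, 4))

dense_gcn = DenseGraphConv(in_feats=4, out_feats=2)
dense_gcn.initialize(ctx=mx.cpu())
out = dense_gcn(adj, feat)            # shape (3, 2)
```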
"""MXNet Module for DenseGraphSAGE"""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
class DenseSAGEConv(nn.Block):
"""GraphSAGE layer where the graph structure is given by an
adjacency matrix.
We recommend using this module when applying GraphSAGE operations
on dense graphs / k-hop graphs.
Note that only the gcn aggregator is supported in DenseSAGEConv.
Parameters
----------
in_feats : int
Input feature size.
out_feats : int
Output feature size.
feat_drop : float, optional
Dropout rate on features. Default: 0.
bias : bool
If True, adds a learnable bias to the output. Default: ``True``.
norm : callable activation function/layer or None, optional
If not None, applies normalization to the updated node features.
activation : callable activation function/layer or None, optional
If not None, applies an activation function to the updated node features.
Default: ``None``.
See also
--------
SAGEConv
"""
def __init__(self,
in_feats,
out_feats,
feat_drop=0.,
bias=True,
norm=None,
activation=None):
super(DenseSAGEConv, self).__init__()
self._in_feats = in_feats
self._out_feats = out_feats
self._norm = norm
with self.name_scope():
self.feat_drop = nn.Dropout(feat_drop)
self.activation = activation
self.fc = nn.Dense(out_feats, in_units=in_feats, use_bias=bias,
weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)))
def forward(self, adj, feat):
r"""Compute (Dense) Graph SAGE layer.
Parameters
----------
adj : mxnet.NDArray
The adjacency matrix of the graph to apply Graph Convolution on,
should be of shape :math:`(N, N)`, where a row represents the destination
and a column represents the source.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
is size of output feature.
"""
adj = adj.astype(feat.dtype).as_in_context(feat.context)
feat = self.feat_drop(feat)
in_degrees = adj.sum(axis=1, keepdims=True)
h_neigh = (nd.dot(adj, feat) + feat) / (in_degrees + 1)
rst = self.fc(h_neigh)
# activation
if self.activation is not None:
rst = self.activation(rst)
# normalization
if self._norm is not None:
rst = self._norm(rst)
return rst
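And likewise for DenseSAGEConv, which uses gcn-style mean aggregation over the dense adjacency (MXNet backend, illustrative sizes):

```python
import mxnet as mx
from mxnet import nd

adj = nd.array([[0, 1, 1],
                [1, 0, 0],
                [1, 0, 0]])           # a 3-node star graph centred at node 0
feat = nd.random.uniform(shape=(3, 4))

dense_sage = DenseSAGEConv(in_feats=4, out_feats=2)
dense_sage.initialize(ctx=mx.cpu())
out = dense_sage(adj, feat)           # shape (3, 2)
```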
"""MXNet Module for EdgeConv Layer"""
# pylint: disable= no-member, arguments-differ, invalid-name
import mxnet as mx
from mxnet.gluon import nn
from .... import function as fn
class EdgeConv(nn.Block):
r"""EdgeConv layer.
Introduced in "`Dynamic Graph CNN for Learning on Point Clouds
<https://arxiv.org/pdf/1801.07829>`__". Can be described as follows:
.. math::
x_i^{(l+1)} = \max_{j \in \mathcal{N}(i)} \mathrm{ReLU}(
\Theta \cdot (x_j^{(l)} - x_i^{(l)}) + \Phi \cdot x_i^{(l)})
where :math:`\mathcal{N}(i)` is the set of neighbors of :math:`i`.
Parameters
----------
in_feat : int
Input feature size.
out_feat : int
Output feature size.
batch_norm : bool
Whether to include batch normalization on messages.
"""
def __init__(self,
in_feat,
out_feat,
batch_norm=False):
super(EdgeConv, self).__init__()
self.batch_norm = batch_norm
with self.name_scope():
self.theta = nn.Dense(out_feat, in_units=in_feat,
weight_initializer=mx.init.Xavier())
self.phi = nn.Dense(out_feat, in_units=in_feat,
weight_initializer=mx.init.Xavier())
if batch_norm:
self.bn = nn.BatchNorm(in_channels=out_feat)
def message(self, edges):
r"""The message computation function
"""
theta_x = self.theta(edges.dst['x'] - edges.src['x'])
phi_x = self.phi(edges.src['x'])
return {'e': theta_x + phi_x}
def forward(self, g, h):
r"""Forward computation
Parameters
----------
g : DGLGraph
The graph.
h : mxnet.NDArray
:math:`(N, D)` where :math:`N` is the number of nodes and
:math:`D` is the number of feature dimensions.
Returns
-------
mxnet.NDArray
New node features.
"""
with g.local_scope():
g.ndata['x'] = h
if not self.batch_norm:
g.update_all(self.message, fn.max('e', 'x'))
else:
g.apply_edges(self.message)
g.edata['e'] = self.bn(g.edata['e'])
g.update_all(fn.copy_e('e', 'm'), fn.max('m', 'x'))
return g.ndata['x']
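A usage sketch for EdgeConv (MXNet backend assumed). In practice the graph would be a k-NN graph over point coordinates; a toy graph stands in here and the sizes are illustrative:

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3, 0], [1, 2, 3, 0, 2])
h = mx.nd.random.uniform(shape=(4, 3))   # e.g. 3-D point coordinates

edge_conv = EdgeConv(in_feat=3, out_feat=16)
edge_conv.initialize(ctx=mx.cpu())
out = edge_conv(g, h)                    # shape (4, 16), max-aggregated over in-edges
```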
"""MXNet modules for graph attention networks(GAT)."""
# pylint: disable= no-member, arguments-differ, invalid-name
import math
import mxnet as mx
from mxnet.gluon import nn
from mxnet.gluon.contrib.nn import Identity
from .... import function as fn
from ..softmax import edge_softmax
#pylint: enable=W0235
class GATConv(nn.Block):
r"""Apply `Graph Attention Network <https://arxiv.org/pdf/1710.10903.pdf>`__
over an input signal.
.. math::
h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} \alpha_{i,j} W^{(l)} h_j^{(l)}
where :math:`\alpha_{ij}` is the attention score between node :math:`i` and
node :math:`j`:
.. math::
\alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l})
e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right)
Parameters
----------
in_feats : int
Input feature size.
out_feats : int
Output feature size.
num_heads : int
Number of heads in Multi-Head Attention.
feat_drop : float, optional
Dropout rate on features. Default: ``0``.
attn_drop : float, optional
Dropout rate on attention weights. Default: ``0``.
negative_slope : float, optional
Negative slope of the LeakyReLU activation. Default: ``0.2``.
residual : bool, optional
If True, use residual connection.
activation : callable activation function/layer or None, optional.
If not None, applies an activation function to the updated node features.
Default: ``None``.
"""
def __init__(self,
in_feats,
out_feats,
num_heads,
feat_drop=0.,
attn_drop=0.,
negative_slope=0.2,
residual=False,
activation=None):
super(GATConv, self).__init__()
self._num_heads = num_heads
self._in_feats = in_feats
self._out_feats = out_feats
with self.name_scope():
self.fc = nn.Dense(out_feats * num_heads, use_bias=False,
weight_initializer=mx.init.Xavier(magnitude=math.sqrt(2.0)),
in_units=in_feats)
self.attn_l = self.params.get('attn_l',
shape=(1, num_heads, out_feats),
init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
self.attn_r = self.params.get('attn_r',
shape=(1, num_heads, out_feats),
init=mx.init.Xavier(magnitude=math.sqrt(2.0)))
self.feat_drop = nn.Dropout(feat_drop)
self.attn_drop = nn.Dropout(attn_drop)
self.leaky_relu = nn.LeakyReLU(negative_slope)
if residual:
if in_feats != out_feats:
self.res_fc = nn.Dense(out_feats * num_heads, use_bias=False,
weight_initializer=mx.init.Xavier(
magnitude=math.sqrt(2.0)),
in_units=in_feats)
else:
self.res_fc = Identity()
else:
self.res_fc = None
self.activation = activation
def forward(self, graph, feat):
r"""Compute graph attention network layer.
Parameters
----------
graph : DGLGraph
The graph.
feat : mxnet.NDArray
The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
is size of input feature, :math:`N` is the number of nodes.
Returns
-------
mxnet.NDArray
The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
is the number of heads, and :math:`D_{out}` is size of output feature.
"""
graph = graph.local_var()
h = self.feat_drop(feat)
feat = self.fc(h).reshape(-1, self._num_heads, self._out_feats)
el = (feat * self.attn_l.data(feat.context)).sum(axis=-1).expand_dims(-1)
er = (feat * self.attn_r.data(feat.context)).sum(axis=-1).expand_dims(-1)
graph.ndata.update({'ft': feat, 'el': el, 'er': er})
# compute edge attention
graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
e = self.leaky_relu(graph.edata.pop('e'))
# compute softmax
graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
graph.update_all(fn.u_mul_e('ft', 'a', 'm'),
fn.sum('m', 'ft'))
rst = graph.ndata['ft']
# residual
if self.res_fc is not None:
resval = self.res_fc(h).reshape(h.shape[0], -1, self._out_feats)
rst = rst + resval
# activation
if self.activation:
rst = self.activation(rst)
return rst
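Finally, a usage sketch for the MXNet GATConv above (MXNet backend assumed; toy graph and sizes illustrative):

```python
import mxnet as mx
import dgl

g = dgl.DGLGraph()
g.add_nodes(4)
g.add_edges([0, 1, 2, 3], [1, 2, 3, 0])
feat = mx.nd.random.uniform(shape=(4, 8))

gat = GATConv(in_feats=8, out_feats=16, num_heads=2)
gat.initialize(ctx=mx.cpu())
out = gat(g, feat)                       # shape (4, 2, 16): one slice per head
```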