Unverified commit a00636a0 authored by VoVAllen, committed by GitHub

[Tensorflow] Several nn & example (#1191)

* several nn example

* appnp

* fix lint

* lint

* add dgi

* fix

* fix

* fix

* fff

* docs

* 111

* fix

* change init

* change result

* tune hyperparameters (+1)

* fix

* fix lint

* fix

* fix
parent 31a7d509
@@ -9,3 +9,4 @@ NN Modules
     nn.pytorch
     nn.mxnet
+    nn.tensorflow
.. _apinn-tensorflow:

NN Modules (Tensorflow)
=======================

.. contents:: Contents
    :local:

We welcome your contribution! If you want a model to be implemented in DGL as an NN module,
please `create an issue <https://github.com/dmlc/dgl/issues>`_ starting with "[Feature Request] NN Module XXXModel".
If you want to contribute an NN module yourself, please `create a pull request <https://github.com/dmlc/dgl/pulls>`_ starting
with "[NN] XXXModel in tensorflow NN Modules" and our team members will review it.
Conv Layers
----------------------------------------
.. automodule:: dgl.nn.tensorflow.conv
GraphConv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.conv.GraphConv
:members: weight, bias, forward, reset_parameters
:show-inheritance:
RelGraphConv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.conv.RelGraphConv
:members: forward
:show-inheritance:
GATConv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.conv.GATConv
:members: forward
:show-inheritance:
SAGEConv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.conv.SAGEConv
:members: forward
:show-inheritance:
SGConv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.conv.SGConv
:members: forward
:show-inheritance:
APPNPConv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.conv.APPNPConv
:members: forward
:show-inheritance:
GINConv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.conv.GINConv
:members: forward
:show-inheritance:
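A minimal usage sketch for a convolution module such as ``GraphConv`` (an illustrative snippet, assuming eager execution and a small hand-built ``DGLGraph``; the graph and features below are made up for demonstration):

.. code:: python

    import tensorflow as tf
    import dgl
    from dgl.nn.tensorflow import GraphConv

    # toy graph: 4 nodes, a short chain plus self-loops
    g = dgl.DGLGraph()
    g.add_nodes(4)
    g.add_edges([0, 1, 2, 0, 1, 2, 3], [1, 2, 3, 0, 1, 2, 3])

    feat = tf.random.normal((4, 5))                # 4 nodes, 5 input features
    conv = GraphConv(5, 2, activation=tf.nn.relu)  # 5 -> 2 output features
    out = conv(g, feat)                            # node embeddings of shape (4, 2)
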
Global Pooling Layers
----------------------------------------
.. automodule:: dgl.nn.tensorflow.glob
SumPooling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.glob.SumPooling
:members:
:show-inheritance:
AvgPooling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.glob.AvgPooling
:members:
:show-inheritance:
MaxPooling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.glob.MaxPooling
:members:
:show-inheritance:
SortPooling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.glob.SortPooling
:members:
:show-inheritance:
GlobalAttentionPooling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: dgl.nn.tensorflow.glob.GlobalAttentionPooling
:members:
:show-inheritance:
Utility Modules
----------------------------------------
Edge Softmax
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. automodule:: dgl.nn.tensorflow.softmax
:members: edge_softmax
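A minimal usage sketch for ``edge_softmax`` (an illustrative snippet, assuming one unnormalized score per edge; the scores are normalized over the incoming edges of each destination node):

.. code:: python

    import tensorflow as tf
    import dgl
    from dgl.nn.tensorflow import edge_softmax

    g = dgl.DGLGraph()
    g.add_nodes(3)
    g.add_edges([0, 1, 2, 0], [1, 1, 2, 2])        # two edges into node 1, two into node 2

    scores = tf.random.normal((g.number_of_edges(), 1))
    attn = edge_softmax(g, scores)                 # shape (4, 1); sums to 1 per destination node
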
@@ -31,5 +31,5 @@ class GCN(nn.Module):
         for i, layer in enumerate(self.layers):
             if i != 0:
                 h = self.dropout(h)
-            h = layer(h, self.g)
+            h = layer(self.g, h)
         return h
@@ -41,11 +41,11 @@ python3 train_ppi.py --gpu=0
Results
-------
| Dataset  | Test Accuracy | Time(s) | Baseline#1 times(s) | Baseline#2 times(s) |
| -------- | ------------- | ------- | ------------------- | ------------------- |
| Cora     | 84.02(0.40)   | 0.0113  | 0.0982 (**8.7x**)   | 0.0424 (**3.8x**)   |
| Citeseer | 70.91(0.79)   | 0.0111  | n/a                 | n/a                 |
| Pubmed   | 78.57(0.75)   | 0.0115  | n/a                 | n/a                 |
* All the accuracy numbers are obtained after 300 epochs.
* The time measures how long it takes to train one epoch.
......
Deep Graph Infomax (DGI)
========================
- Paper link: [https://arxiv.org/abs/1809.10341](https://arxiv.org/abs/1809.10341)
- Author's code repo (in Pytorch):
[https://github.com/PetarV-/DGI](https://github.com/PetarV-/DGI)
Dependencies
------------
- tensorflow 2.1+
- requests
```bash
pip install tensorflow requests
```
How to run
----------
Run with the following:
```bash
python3 train.py --dataset=cora --gpu=0 --self-loop
```
```bash
python3 train.py --dataset=citeseer --gpu=0
```
```bash
python3 train.py --dataset=pubmed --gpu=0
```
Results
-------
* cora: ~81.6 (80.9-82.9) (paper: 82.3)
* citeseer: ~70.2 (paper: 71.8)
* pubmed: ~77.2 (paper: 76.8)
"""
Deep Graph Infomax in DGL
References
----------
Paper: https://arxiv.org/abs/1809.10341
Author's code: https://github.com/PetarV-/DGI
"""
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import math
from gcn import GCN
class Encoder(layers.Layer):
def __init__(self, g, in_feats, n_hidden, n_layers, activation, dropout):
super(Encoder, self).__init__()
self.g = g
self.conv = GCN(g, in_feats, n_hidden, n_hidden,
n_layers, activation, dropout)
def call(self, features, corrupt=False):
if corrupt:
perm = np.random.permutation(self.g.number_of_nodes())
features = tf.gather(features, perm)
features = self.conv(features)
return features
class Discriminator(layers.Layer):
def __init__(self, n_hidden):
super(Discriminator, self).__init__()
uinit = tf.keras.initializers.RandomUniform(
-1.0/math.sqrt(n_hidden), 1.0/math.sqrt(n_hidden))
self.weight = tf.Variable(initial_value=uinit(
shape=(n_hidden, n_hidden), dtype='float32'), trainable=True)
def call(self, features, summary):
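        # bilinear score: h_i^T W s, comparing each node embedding against the graph summary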
features = tf.matmul(features, tf.matmul(
self.weight, tf.expand_dims(summary, -1)))
return features
class DGI(tf.keras.Model):
def __init__(self, g, in_feats, n_hidden, n_layers, activation, dropout):
super(DGI, self).__init__()
self.encoder = Encoder(g, in_feats, n_hidden,
n_layers, activation, dropout)
self.discriminator = Discriminator(n_hidden)
self.loss = tf.nn.sigmoid_cross_entropy_with_logits
def call(self, features):
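        # DGI objective: the discriminator should score embeddings of the real graph (positive)
        # higher than embeddings computed from randomly shuffled features (negative)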
positive = self.encoder(features, corrupt=False)
negative = self.encoder(features, corrupt=True)
summary = tf.nn.sigmoid(tf.reduce_mean(positive, axis=0))
positive = self.discriminator(positive, summary)
negative = self.discriminator(negative, summary)
        l1 = self.loss(tf.ones(positive.shape), positive)
l2 = self.loss(tf.zeros(negative.shape), negative)
return tf.reduce_mean(l1) + tf.reduce_mean(l2)
class Classifier(layers.Layer):
def __init__(self, n_hidden, n_classes):
super(Classifier, self).__init__()
self.fc = layers.Dense(n_classes)
def call(self, features):
features = self.fc(features)
return features
"""
This code was copied from the GCN implementation in DGL examples.
"""
import tensorflow as tf
from tensorflow.keras import layers
from dgl.nn.tensorflow import GraphConv
class GCN(layers.Layer):
def __init__(self,
g,
in_feats,
n_hidden,
n_classes,
n_layers,
activation,
dropout):
super(GCN, self).__init__()
self.g = g
        self.layers = []
# input layer
self.layers.append(GraphConv(in_feats, n_hidden, activation=activation))
# hidden layers
for i in range(n_layers - 1):
self.layers.append(GraphConv(n_hidden, n_hidden, activation=activation))
# output layer
self.layers.append(GraphConv(n_hidden, n_classes))
self.dropout = layers.Dropout(dropout)
def call(self, features):
h = features
for i, layer in enumerate(self.layers):
if i != 0:
h = self.dropout(h)
h = layer(self.g, h)
return h
import argparse
import time
import numpy as np
import networkx as nx
import tensorflow as tf
from tensorflow.keras import layers
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from dgi import DGI, Classifier
def evaluate(model, features, labels, mask):
logits = model(features, training=False)
logits = logits[mask]
labels = labels[mask]
indices = tf.math.argmax(logits, axis=1)
acc = tf.reduce_mean(tf.cast(indices == labels, dtype=tf.float32))
return acc.numpy().item()
def main(args):
# load and preprocess dataset
data = load_data(args)
if args.gpu < 0:
device = "/cpu:0"
else:
device = "/gpu:{}".format(args.gpu)
with tf.device(device):
features = tf.convert_to_tensor(data.features, dtype=tf.float32)
labels = tf.convert_to_tensor(data.labels, dtype=tf.int64)
train_mask = tf.convert_to_tensor(data.train_mask, dtype=tf.bool)
val_mask = tf.convert_to_tensor(data.val_mask, dtype=tf.bool)
test_mask = tf.convert_to_tensor(data.test_mask, dtype=tf.bool)
in_feats = features.shape[1]
n_classes = data.num_labels
n_edges = data.graph.number_of_edges()
# graph preprocess
g = data.graph
# add self loop
if args.self_loop:
g.remove_edges_from(nx.selfloop_edges(g))
g.add_edges_from(zip(g.nodes(), g.nodes()))
g = DGLGraph(g)
n_edges = g.number_of_edges()
# create DGI model
dgi = DGI(g,
in_feats,
args.n_hidden,
args.n_layers,
tf.keras.layers.PReLU(alpha_initializer=tf.constant_initializer(0.25)),
args.dropout)
dgi_optimizer = tf.keras.optimizers.Adam(
learning_rate=args.dgi_lr)
# train deep graph infomax
cnt_wait = 0
best = 1e9
best_t = 0
dur = []
for epoch in range(args.n_dgi_epochs):
if epoch >= 3:
t0 = time.time()
with tf.GradientTape() as tape:
loss = dgi(features)
                # Manual weight decay
                # TensorFlow's Adam(W) optimizer implements weight decay differently
                # from PyTorch's, which leads to worse results here.
                # Adding an L2 penalty on the weights to the loss instead solves this.
for weight in dgi.trainable_weights:
loss = loss + \
args.weight_decay * tf.nn.l2_loss(weight)
grads = tape.gradient(loss, dgi.trainable_weights)
dgi_optimizer.apply_gradients(zip(grads, dgi.trainable_weights))
if loss < best:
best = loss
best_t = epoch
cnt_wait = 0
dgi.save_weights('best_dgi.pkl')
else:
cnt_wait += 1
if cnt_wait == args.patience:
print('Early stopping!')
break
if epoch >= 3:
dur.append(time.time() - t0)
print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
"ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.numpy().item(),
n_edges / np.mean(dur) / 1000))
# create classifier model
classifier = Classifier(args.n_hidden, n_classes)
classifier_optimizer = tf.keras.optimizers.Adam(learning_rate=args.classifier_lr)
# train classifier
print('Loading {}th epoch'.format(best_t))
dgi.load_weights('best_dgi.pkl')
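        # freeze the trained DGI encoder: embeddings are computed once and detached,
        # so only the classifier weights are updated below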
embeds = dgi.encoder(features, corrupt=False)
embeds = tf.stop_gradient(embeds)
dur = []
loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True)
for epoch in range(args.n_classifier_epochs):
if epoch >= 3:
t0 = time.time()
with tf.GradientTape() as tape:
preds = classifier(embeds)
loss = loss_fcn(labels[train_mask], preds[train_mask])
                # Manual weight decay
                # TensorFlow's Adam(W) optimizer implements weight decay differently
                # from PyTorch's, which leads to worse results here; adding an L2
                # penalty on the weights to the loss is the usual workaround.
                # The original code applies no weight decay in this stage
                # (link: https://github.com/PetarV-/DGI/blob/master/execute.py#L121),
                # so the penalty is left commented out:
                # for weight in classifier.trainable_weights:
                #     loss = loss + \
                #         args.weight_decay * tf.nn.l2_loss(weight)
grads = tape.gradient(loss, classifier.trainable_weights)
classifier_optimizer.apply_gradients(zip(grads, classifier.trainable_weights))
if epoch >= 3:
dur.append(time.time() - t0)
acc = evaluate(classifier, embeds, labels, val_mask)
print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
"ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.numpy().item(),
acc, n_edges / np.mean(dur) / 1000))
print()
acc = evaluate(classifier, embeds, labels, test_mask)
print("Test Accuracy {:.4f}".format(acc))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='DGI')
register_data_args(parser)
parser.add_argument("--dropout", type=float, default=0.,
help="dropout probability")
parser.add_argument("--gpu", type=int, default=-1,
help="gpu")
parser.add_argument("--dgi-lr", type=float, default=1e-3,
help="dgi learning rate")
parser.add_argument("--classifier-lr", type=float, default=1e-2,
help="classifier learning rate")
parser.add_argument("--n-dgi-epochs", type=int, default=300,
help="number of training epochs")
parser.add_argument("--n-classifier-epochs", type=int, default=300,
help="number of training epochs")
parser.add_argument("--n-hidden", type=int, default=512,
help="number of hidden gcn units")
parser.add_argument("--n-layers", type=int, default=1,
help="number of hidden gcn layers")
parser.add_argument("--weight-decay", type=float, default=0.,
help="Weight for L2 loss")
parser.add_argument("--patience", type=int, default=20,
help="early stop patience condition")
parser.add_argument("--self-loop", action='store_true',
help="graph self-loop (default=False)")
parser.set_defaults(self_loop=False)
args = parser.parse_args()
print(args)
main(args)
Graph Attention Networks (GAT)
============
- Paper link: [https://arxiv.org/abs/1710.10903](https://arxiv.org/abs/1710.10903)
- Author's code repo (in Tensorflow):
[https://github.com/PetarV-/GAT](https://github.com/PetarV-/GAT).
- Popular pytorch implementation:
[https://github.com/Diego999/pyGAT](https://github.com/Diego999/pyGAT).
Dependencies
------------
- tensorflow 2.1.0+
- requests
```bash
pip install tensorflow requests
```
How to run
----------
Run with the following:
```bash
python3 train.py --dataset=cora --gpu=0
```
```bash
python3 train.py --dataset=citeseer --gpu=0 --early-stop
```
```bash
python3 train.py --dataset=pubmed --gpu=0 --num-out-heads=8 --weight-decay=0.001 --early-stop
```
Results
-------
| Dataset | Test Accuracy | Baseline (paper) |
| -------- | ------------- | ---------------- |
| Cora | 84.2 | 83.0(+-0.7) |
| Citeseer | 70.9 | 72.5(+-0.7) |
| Pubmed | 78.5 | 79.0(+-0.3) |
* All the accuracy numbers are obtained after 200 epochs.
* All time is measured on EC2 p3.2xlarge instance w/ V100 GPU.
"""
Graph Attention Networks in DGL using SPMV optimization.
References
----------
Paper: https://arxiv.org/abs/1710.10903
Author's code: https://github.com/PetarV-/GAT
Pytorch implementation: https://github.com/Diego999/pyGAT
"""
import tensorflow as tf
from tensorflow.keras import layers
import dgl.function as fn
from dgl.nn.tensorflow import edge_softmax, GATConv
class GAT(tf.keras.Model):
def __init__(self,
g,
num_layers,
in_dim,
num_hidden,
num_classes,
heads,
activation,
feat_drop,
attn_drop,
negative_slope,
residual):
super(GAT, self).__init__()
self.g = g
self.num_layers = num_layers
self.gat_layers = []
self.activation = activation
# input projection (no residual)
self.gat_layers.append(GATConv(
in_dim, num_hidden, heads[0],
feat_drop, attn_drop, negative_slope, False, self.activation))
# hidden layers
for l in range(1, num_layers):
# due to multi-head, the in_dim = num_hidden * num_heads
self.gat_layers.append(GATConv(
num_hidden * heads[l-1], num_hidden, heads[l],
feat_drop, attn_drop, negative_slope, residual, self.activation))
# output projection
self.gat_layers.append(GATConv(
num_hidden * heads[-2], num_classes, heads[-1],
feat_drop, attn_drop, negative_slope, residual, None))
def call(self, inputs):
h = inputs
for l in range(self.num_layers):
h = self.gat_layers[l](self.g, h)
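            # concatenate attention heads: (N, num_heads, num_hidden) -> (N, num_heads * num_hidden)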
h = tf.reshape(h, (h.shape[0], -1))
# output projection
logits = tf.reduce_mean(self.gat_layers[-1](self.g, h), axis=1)
return logits
"""
Graph Attention Networks in DGL using SPMV optimization.
Multiple heads are also batched together for faster training.
Early stopping on validation accuracy is implemented and can be
enabled with the --early-stop flag.
References
----------
Paper: https://arxiv.org/abs/1710.10903
Author's code: https://github.com/PetarV-/GAT
Pytorch implementation: https://github.com/Diego999/pyGAT
"""
import argparse
import numpy as np
import networkx as nx
import time
import tensorflow as tf
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from gat import GAT
from utils import EarlyStopping
def accuracy(logits, labels):
indices = tf.math.argmax(logits, axis=1)
acc = tf.reduce_mean(tf.cast(indices == labels, dtype=tf.float32))
return acc.numpy().item()
def evaluate(model, features, labels, mask):
logits = model(features, training=False)
logits = logits[mask]
labels = labels[mask]
return accuracy(logits, labels)
def main(args):
# load and preprocess dataset
data = load_data(args)
if args.gpu < 0:
device = "/cpu:0"
else:
device = "/gpu:{}".format(args.gpu)
with tf.device(device):
features = tf.convert_to_tensor(data.features, dtype=tf.float32)
labels = tf.convert_to_tensor(data.labels, dtype=tf.int64)
train_mask = tf.convert_to_tensor(data.train_mask, dtype=tf.bool)
val_mask = tf.convert_to_tensor(data.val_mask, dtype=tf.bool)
test_mask = tf.convert_to_tensor(data.test_mask, dtype=tf.bool)
num_feats = features.shape[1]
n_classes = data.num_labels
n_edges = data.graph.number_of_edges()
print("""----Data statistics------'
#Edges %d
#Classes %d
#Train samples %d
#Val samples %d
#Test samples %d""" %
(n_edges, n_classes,
train_mask.numpy().sum(),
val_mask.numpy().sum(),
test_mask.numpy().sum()))
g = data.graph
# add self loop
g.remove_edges_from(nx.selfloop_edges(g))
g = DGLGraph(g)
g.add_edges(g.nodes(), g.nodes())
n_edges = g.number_of_edges()
# create model
heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads]
model = GAT(g,
args.num_layers,
num_feats,
args.num_hidden,
n_classes,
heads,
tf.nn.elu,
args.in_drop,
args.attn_drop,
args.negative_slope,
args.residual)
print(model)
if args.early_stop:
stopper = EarlyStopping(patience=100)
# loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
# from_logits=False)
loss_fcn = tf.nn.sparse_softmax_cross_entropy_with_logits
# use optimizer
optimizer = tf.keras.optimizers.Adam(
learning_rate=args.lr, epsilon=1e-8)
# initialize graph
dur = []
for epoch in range(args.epochs):
if epoch >= 3:
t0 = time.time()
# forward
with tf.GradientTape() as tape:
tape.watch(model.trainable_weights)
logits = model(features, training=True)
loss_value = tf.reduce_mean(loss_fcn(
labels=labels[train_mask], logits=logits[train_mask]))
                # Manual weight decay
                # TensorFlow's Adam(W) optimizer implements weight decay differently
                # from PyTorch's, which leads to worse results here.
                # Adding an L2 penalty on the weights to the loss instead solves this.
for weight in model.trainable_weights:
loss_value = loss_value + \
args.weight_decay*tf.nn.l2_loss(weight)
grads = tape.gradient(loss_value, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))
if epoch >= 3:
dur.append(time.time() - t0)
train_acc = accuracy(logits[train_mask], labels[train_mask])
if args.fastmode:
val_acc = accuracy(logits[val_mask], labels[val_mask])
else:
val_acc = evaluate(model, features, labels, val_mask)
if args.early_stop:
if stopper.step(val_acc, model):
break
print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | TrainAcc {:.4f} |"
" ValAcc {:.4f} | ETputs(KTEPS) {:.2f}".
format(epoch, np.mean(dur), loss_value.numpy().item(), train_acc,
val_acc, n_edges / np.mean(dur) / 1000))
print()
if args.early_stop:
model.load_weights('es_checkpoint.pb')
acc = evaluate(model, features, labels, test_mask)
print("Test Accuracy {:.4f}".format(acc))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='GAT')
register_data_args(parser)
parser.add_argument("--gpu", type=int, default=-1,
help="which GPU to use. Set -1 to use CPU.")
parser.add_argument("--epochs", type=int, default=200,
help="number of training epochs")
parser.add_argument("--num-heads", type=int, default=8,
help="number of hidden attention heads")
parser.add_argument("--num-out-heads", type=int, default=1,
help="number of output attention heads")
parser.add_argument("--num-layers", type=int, default=1,
help="number of hidden layers")
parser.add_argument("--num-hidden", type=int, default=8,
help="number of hidden units")
parser.add_argument("--residual", action="store_true", default=False,
help="use residual connection")
parser.add_argument("--in-drop", type=float, default=.6,
help="input feature dropout")
parser.add_argument("--attn-drop", type=float, default=.6,
help="attention dropout")
parser.add_argument("--lr", type=float, default=0.005,
help="learning rate")
parser.add_argument('--weight-decay', type=float, default=5e-4,
help="weight decay")
parser.add_argument('--negative-slope', type=float, default=0.2,
help="the negative slope of leaky relu")
parser.add_argument('--early-stop', action='store_true', default=False,
help="indicates whether to use early stop or not")
parser.add_argument('--fastmode', action="store_true", default=False,
help="skip re-evaluate the validation set")
args = parser.parse_args()
print(args)
main(args)
import numpy as np
class EarlyStopping:
def __init__(self, patience=10):
self.patience = patience
self.counter = 0
self.best_score = None
self.early_stop = False
def step(self, acc, model):
score = acc
if self.best_score is None:
self.best_score = score
self.save_checkpoint(model)
elif score < self.best_score:
self.counter += 1
print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
if self.counter >= self.patience:
self.early_stop = True
else:
self.best_score = score
self.save_checkpoint(model)
self.counter = 0
return self.early_stop
def save_checkpoint(self, model):
'''Saves model when validation loss decrease.'''
model.save_weights('es_checkpoint.pb')
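A minimal sketch of how this helper plugs into a training loop (mirroring `train.py` above; `model`, `evaluate`, `features`, `labels`, `val_mask` and `train_one_epoch` are assumed to exist in the surrounding script):

```python
stopper = EarlyStopping(patience=100)
for epoch in range(200):
    train_one_epoch(model)                          # hypothetical training step
    val_acc = evaluate(model, features, labels, val_mask)
    if stopper.step(val_acc, model):                # saves 'es_checkpoint.pb' on improvement
        break                                       # stop once patience is exhausted
model.load_weights('es_checkpoint.pb')              # restore the best checkpoint
```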
Graph Convolutional Networks (GCN)
============
- Paper link: [https://arxiv.org/abs/1609.02907](https://arxiv.org/abs/1609.02907)
- Author's code repo: [https://github.com/tkipf/gcn](https://github.com/tkipf/gcn). Note that the original code for the paper is implemented in Tensorflow.
Dependencies
------------
- Tensorflow 2.1+
- requests
```bash
pip install tensorflow requests
```
Codes
-----
The folder contains three implementations of GCN:
- `gcn.py` uses DGL's predefined graph convolution module.
- `gcn_mp.py` uses user-defined message and reduce functions.
- `gcn_builtin.py` improves on `gcn_mp.py` by using DGL's built-in functions, so the
  SPMV optimization can be applied (see the sketch after this list).
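The last two variants differ only in how a layer gathers neighbor features. A rough sketch of the two styles (not the exact code in `gcn_mp.py` / `gcn_builtin.py`; it assumes a `DGLGraph` `g` whose nodes carry a feature field `'h'`):

```python
import tensorflow as tf
import dgl.function as fn

# gcn_mp.py style: user-defined message and reduce functions
def gcn_msg(edges):
    return {'m': edges.src['h']}                   # send source features along each edge

def gcn_reduce(nodes):
    return {'h': tf.reduce_sum(nodes.mailbox['m'], axis=1)}   # sum incoming messages

g.update_all(gcn_msg, gcn_reduce)

# gcn_builtin.py style: DGL built-in functions, which DGL can fuse into a single
# sparse matrix-vector multiply (SPMV)
g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='h'))
```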
Results
-------
Run with the following (available datasets: "cora", "citeseer", "pubmed"):
```bash
python3 train.py --dataset cora --gpu 0 --self-loop
```
* cora: ~0.810 (0.79-0.83) (paper: 0.815)
* citeseer: 0.707 (paper: 0.703)
* pubmed: 0.792 (paper: 0.790)
"""GCN using DGL nn package
References:
- Semi-Supervised Classification with Graph Convolutional Networks
- Paper: https://arxiv.org/abs/1609.02907
- Code: https://github.com/tkipf/gcn
"""
import tensorflow as tf
from tensorflow.keras import layers
from dgl.nn.tensorflow import GraphConv
class GCN(tf.keras.Model):
def __init__(self,
g,
in_feats,
n_hidden,
n_classes,
n_layers,
activation,
dropout):
super(GCN, self).__init__()
self.g = g
self.layer_list = []
# input layer
self.layer_list.append(GraphConv(in_feats, n_hidden, activation=activation))
# hidden layers
for i in range(n_layers - 1):
self.layer_list.append(GraphConv(n_hidden, n_hidden, activation=activation))
# output layer
self.layer_list.append(GraphConv(n_hidden, n_classes))
self.dropout = layers.Dropout(dropout)
def call(self, features):
h = features
for i, layer in enumerate(self.layer_list):
if i != 0:
h = self.dropout(h)
h = layer(self.g, h)
return h
@@ -21,7 +21,8 @@ class GCNLayer(layers.Layer):
         super(GCNLayer, self).__init__()
         self.g = g
-        w_init = tf.random_normal_initializer()
+        w_init = tf.keras.initializers.VarianceScaling(
+            scale=1.0, mode="fan_out", distribution="uniform")
         self.weight = tf.Variable(initial_value=w_init(shape=(in_feats, out_feats),
                                                        dtype='float32'),
                                   trainable=True)
@@ -144,7 +145,7 @@ def main(args):
                     args.dropout)
         optimizer = tf.keras.optimizers.Adam(
-            learning_rate=args.lr, decay=args.weight_decay)
+            learning_rate=args.lr)
         loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
             from_logits=True)
@@ -157,6 +158,13 @@ def main(args):
             with tf.GradientTape() as tape:
                 logits = model(features)
                 loss_value = loss_fcn(labels[train_mask], logits[train_mask])
+                # Manual weight decay
+                # TensorFlow's Adam(W) optimizer implements weight decay differently
+                # from PyTorch's, which leads to worse results here.
+                # Adding an L2 penalty on the weights to the loss instead solves this.
+                for weight in model.trainable_weights:
+                    loss_value = loss_value + \
+                        args.weight_decay*tf.nn.l2_loss(weight)
             grads = tape.gradient(loss_value, model.trainable_weights)
             optimizer.apply_gradients(zip(grads, model.trainable_weights))
......
@@ -151,7 +151,7 @@ def main(args):
                     args.dropout)
         optimizer = tf.keras.optimizers.Adam(
-            learning_rate=args.lr, decay=args.weight_decay)
+            learning_rate=args.lr)
         loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
             from_logits=True)
@@ -164,7 +164,13 @@ def main(args):
             with tf.GradientTape() as tape:
                 logits = model(features)
                 loss_value = loss_fcn(labels[train_mask], logits[train_mask])
+                # Manual weight decay
+                # TensorFlow's Adam(W) optimizer implements weight decay differently
+                # from PyTorch's, which leads to worse results here.
+                # Adding an L2 penalty on the weights to the loss instead solves this.
+                for weight in model.trainable_weights:
+                    loss_value = loss_value + \
+                        args.weight_decay*tf.nn.l2_loss(weight)
             grads = tape.gradient(loss_value, model.trainable_weights)
             optimizer.apply_gradients(zip(grads, model.trainable_weights))
......
import argparse
import time
import numpy as np
import networkx as nx
import tensorflow as tf
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from gcn import GCN
def evaluate(model, features, labels, mask):
logits = model(features, training=False)
logits = logits[mask]
labels = labels[mask]
indices = tf.math.argmax(logits, axis=1)
acc = tf.reduce_mean(tf.cast(indices == labels, dtype=tf.float32))
return acc.numpy().item()
def main(args):
# load and preprocess dataset
data = load_data(args)
if args.gpu < 0:
device = "/cpu:0"
else:
device = "/gpu:{}".format(args.gpu)
with tf.device(device):
features = tf.convert_to_tensor(data.features, dtype=tf.float32)
labels = tf.convert_to_tensor(data.labels, dtype=tf.int64)
train_mask = tf.convert_to_tensor(data.train_mask, dtype=tf.bool)
val_mask = tf.convert_to_tensor(data.val_mask, dtype=tf.bool)
test_mask = tf.convert_to_tensor(data.test_mask, dtype=tf.bool)
in_feats = features.shape[1]
n_classes = data.num_labels
n_edges = data.graph.number_of_edges()
print("""----Data statistics------'
#Edges %d
#Classes %d
#Train samples %d
#Val samples %d
#Test samples %d""" %
(n_edges, n_classes,
train_mask.numpy().sum(),
val_mask.numpy().sum(),
test_mask.numpy().sum()))
# graph preprocess and calculate normalization factor
g = data.graph
if args.self_loop:
g.remove_edges_from(nx.selfloop_edges(g))
g.add_edges_from(zip(g.nodes(), g.nodes()))
g = DGLGraph(g)
n_edges = g.number_of_edges()
# normalization
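        # per-node factor 1/sqrt(in-degree), i.e. the symmetric D^-1/2 A D^-1/2 normalization term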
degs = tf.cast(tf.identity(g.in_degrees()), dtype=tf.float32)
norm = tf.math.pow(degs, -0.5)
norm = tf.where(tf.math.is_inf(norm), tf.zeros_like(norm), norm)
g.ndata['norm'] = tf.expand_dims(norm, -1)
# create GCN model
model = GCN(g,
in_feats,
args.n_hidden,
n_classes,
args.n_layers,
tf.nn.relu,
args.dropout)
loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True)
# use optimizer
optimizer = tf.keras.optimizers.Adam(
learning_rate=args.lr, epsilon=1e-8)
# initialize graph
dur = []
for epoch in range(args.n_epochs):
if epoch >= 3:
t0 = time.time()
# forward
with tf.GradientTape() as tape:
logits = model(features)
loss_value = loss_fcn(labels[train_mask], logits[train_mask])
                # Manual weight decay
                # TensorFlow's Adam(W) optimizer implements weight decay differently
                # from PyTorch's, which leads to worse results here.
                # Adding an L2 penalty on the weights to the loss instead solves this.
for weight in model.trainable_weights:
loss_value = loss_value + \
args.weight_decay*tf.nn.l2_loss(weight)
grads = tape.gradient(loss_value, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))
if epoch >= 3:
dur.append(time.time() - t0)
acc = evaluate(model, features, labels, val_mask)
print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
"ETputs(KTEPS) {:.2f}". format(epoch, np.mean(dur), loss_value.numpy().item(),
acc, n_edges / np.mean(dur) / 1000))
acc = evaluate(model, features, labels, test_mask)
print("Test Accuracy {:.4f}".format(acc))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='GCN')
register_data_args(parser)
parser.add_argument("--dropout", type=float, default=0.5,
help="dropout probability")
parser.add_argument("--gpu", type=int, default=-1,
help="gpu")
parser.add_argument("--lr", type=float, default=1e-2,
help="learning rate")
parser.add_argument("--n-epochs", type=int, default=200,
help="number of training epochs")
parser.add_argument("--n-hidden", type=int, default=16,
help="number of hidden gcn units")
parser.add_argument("--n-layers", type=int, default=1,
help="number of hidden gcn layers")
parser.add_argument("--weight-decay", type=float, default=5e-4,
help="Weight for L2 loss")
parser.add_argument("--self-loop", action='store_true',
help="graph self-loop (default=False)")
parser.set_defaults(self_loop=False)
args = parser.parse_args()
print(args)
main(args)
# Relational-GCN
* Paper: [https://arxiv.org/abs/1703.06103](https://arxiv.org/abs/1703.06103)
* Author's code for entity classification: [https://github.com/tkipf/relational-gcn](https://github.com/tkipf/relational-gcn)
* Author's code for link prediction: [https://github.com/MichSchli/RelationPrediction](https://github.com/MichSchli/RelationPrediction)
### Dependencies
* Tensorflow 2.1+
* requests
* rdflib
* pandas
```
pip install tensorflow requests rdflib pandas
```
Example code was tested with rdflib 4.2.2 and pandas 0.23.4
### Entity Classification
AIFB: accuracy 97.22% (DGL), 95.83% (paper)
```
python3 entity_classify.py -d aifb --testing --gpu 0
```
MUTAG: accuracy 75% (DGL), 73.23% (paper)
```
python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0
```
BGS: accuracy 79.3% (DGL n-base=25), 83.10% (paper n-base=40)
```
python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 25 --testing --gpu 0 --relabel
```
"""
Modeling Relational Data with Graph Convolutional Networks
Paper: https://arxiv.org/abs/1703.06103
Code: https://github.com/tkipf/relational-gcn
Differences compared to tkipf/relational-gcn:
* l2norm applied to all weights
* remove nodes that won't be touched
"""
import argparse
import numpy as np
import time
import tensorflow as tf
from tensorflow.keras import layers
from dgl import DGLGraph
from dgl.nn.tensorflow import RelGraphConv
from dgl.contrib.data import load_data
from functools import partial
from model import BaseRGCN
class EntityClassify(BaseRGCN):
def create_features(self):
features = tf.range(self.num_nodes)
return features
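    # every layer uses basis decomposition: each relation's weight matrix is a learned
    # linear combination of num_bases shared basis matrices (the "basis" regularizer)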
def build_input_layer(self):
return RelGraphConv(self.num_nodes, self.h_dim, self.num_rels, "basis",
self.num_bases, activation=tf.nn.relu, self_loop=self.use_self_loop,
dropout=self.dropout)
def build_hidden_layer(self, idx):
return RelGraphConv(self.h_dim, self.h_dim, self.num_rels, "basis",
self.num_bases, activation=tf.nn.relu, self_loop=self.use_self_loop,
dropout=self.dropout)
def build_output_layer(self):
return RelGraphConv(self.h_dim, self.out_dim, self.num_rels, "basis",
self.num_bases, activation=partial(tf.nn.softmax, axis=1),
self_loop=self.use_self_loop)
def acc(logits, labels, mask):
logits = tf.gather(logits, mask)
labels = tf.gather(labels, mask)
indices = tf.math.argmax(logits, axis=1)
acc = tf.reduce_mean(tf.cast(indices == labels, dtype=tf.float32))
return acc
def main(args):
# load graph data
data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel)
num_nodes = data.num_nodes
num_rels = data.num_rels
num_classes = data.num_classes
labels = data.labels
train_idx = data.train_idx
test_idx = data.test_idx
# split dataset into train, validate, test
if args.validation:
val_idx = train_idx[:len(train_idx) // 5]
train_idx = train_idx[len(train_idx) // 5:]
else:
val_idx = train_idx
# since the nodes are featureless, the input feature is then the node id.
feats = tf.range(num_nodes, dtype=tf.int64)
# edge type and normalization factor
edge_type = tf.convert_to_tensor(data.edge_type)
edge_norm = tf.expand_dims(tf.convert_to_tensor(data.edge_norm), 1)
labels = tf.reshape(tf.convert_to_tensor(labels), (-1, ))
# check cuda
if args.gpu < 0:
device = "/cpu:0"
use_cuda = False
else:
device = "/gpu:{}".format(args.gpu)
use_cuda = True
with tf.device(device):
# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(data.edge_src, data.edge_dst)
# create model
model = EntityClassify(len(g),
args.n_hidden,
num_classes,
num_rels,
num_bases=args.n_bases,
num_hidden_layers=args.n_layers - 2,
dropout=args.dropout,
use_self_loop=args.use_self_loop,
use_cuda=use_cuda)
# optimizer
optimizer = tf.keras.optimizers.Adam(
learning_rate=args.lr)
# training loop
print("start training...")
forward_time = []
backward_time = []
loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=False)
for epoch in range(args.n_epochs):
t0 = time.time()
with tf.GradientTape() as tape:
logits = model(g, feats, edge_type, edge_norm)
loss = loss_fcn(tf.gather(labels, train_idx), tf.gather(logits, train_idx))
                # Manual weight decay
                # TensorFlow's Adam(W) optimizer implements weight decay differently
                # from PyTorch's, which leads to worse results here.
                # Adding an L2 penalty on the weights to the loss instead solves this.
for weight in model.trainable_weights:
loss = loss + \
args.l2norm * tf.nn.l2_loss(weight)
t1 = time.time()
grads = tape.gradient(loss, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))
t2 = time.time()
forward_time.append(t1 - t0)
backward_time.append(t2 - t1)
print("Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
format(epoch, forward_time[-1], backward_time[-1]))
train_acc = acc(logits, labels, train_idx)
val_loss = loss_fcn(tf.gather(labels, val_idx), tf.gather(logits, val_idx))
val_acc = acc(logits, labels, val_idx)
print("Train Accuracy: {:.4f} | Train Loss: {:.4f} | Validation Accuracy: {:.4f} | Validation loss: {:.4f}".
format(train_acc, loss.numpy().item(), val_acc, val_loss.numpy().item()))
print()
logits = model(g, feats, edge_type, edge_norm)
test_loss = loss_fcn(tf.gather(labels, test_idx), tf.gather(logits, test_idx))
test_acc = acc(logits, labels, test_idx)
print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.numpy().item()))
print()
print("Mean forward time: {:4f}".format(np.mean(forward_time[len(forward_time) // 4:])))
print("Mean backward time: {:4f}".format(np.mean(backward_time[len(backward_time) // 4:])))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='RGCN')
parser.add_argument("--dropout", type=float, default=0,
help="dropout probability")
parser.add_argument("--n-hidden", type=int, default=16,
help="number of hidden units")
parser.add_argument("--gpu", type=int, default=-1,
help="gpu")
parser.add_argument("--lr", type=float, default=1e-2,
help="learning rate")
parser.add_argument("--n-bases", type=int, default=-1,
help="number of filter weight matrices, default: -1 [use all]")
parser.add_argument("--n-layers", type=int, default=2,
help="number of propagation rounds")
parser.add_argument("-e", "--n-epochs", type=int, default=50,
help="number of training epochs")
parser.add_argument("-d", "--dataset", type=str, required=True,
help="dataset to use")
parser.add_argument("--l2norm", type=float, default=0,
help="l2 norm coef")
parser.add_argument("--relabel", default=False, action='store_true',
help="remove untouched nodes and relabel")
parser.add_argument("--use-self-loop", default=False, action='store_true',
help="include self feature as a special relation")
fp = parser.add_mutually_exclusive_group(required=False)
fp.add_argument('--validation', dest='validation', action='store_true')
fp.add_argument('--testing', dest='validation', action='store_false')
parser.set_defaults(validation=True)
args = parser.parse_args()
print(args)
args.bfs_level = args.n_layers + 1 # pruning used nodes for memory
main(args)
import tensorflow as tf
from tensorflow.keras import layers
class BaseRGCN(layers.Layer):
def __init__(self, num_nodes, h_dim, out_dim, num_rels, num_bases,
num_hidden_layers=1, dropout=0,
use_self_loop=False, use_cuda=False):
super(BaseRGCN, self).__init__()
self.num_nodes = num_nodes
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
self.num_bases = None if num_bases < 0 else num_bases
self.num_hidden_layers = num_hidden_layers
self.dropout = dropout
self.use_self_loop = use_self_loop
self.use_cuda = use_cuda
# create rgcn layers
self.build_model()
def build_model(self):
self.layers = []
# i2h
i2h = self.build_input_layer()
if i2h is not None:
self.layers.append(i2h)
# h2h
for idx in range(self.num_hidden_layers):
h2h = self.build_hidden_layer(idx)
self.layers.append(h2h)
# h2o
h2o = self.build_output_layer()
if h2o is not None:
self.layers.append(h2o)
def build_input_layer(self):
return None
def build_hidden_layer(self, idx):
raise NotImplementedError
def build_output_layer(self):
return None
def call(self, g, h, r, norm):
for layer in self.layers:
h = layer(g, h, r, norm)
return h