Unverified Commit aee10679 authored by xiang song(charlie.song), committed by GitHub

[Dataset] RDF dataset with DGL-Dataset template (#1869)



* update rdf builtin dataset

* Fix

* use new dataset

* fix

* rdf dataset using new framework

* tf work

* Fix mxnet

* Fix tensorflow

* Fix mxnet

* Update

* upd

* update some docstring

* clean some dead code
Co-authored-by: Ubuntu <ubuntu@ip-172-31-51-214.ec2.internal>
parent 15411d93
......@@ -5,7 +5,7 @@
* Author's code for link prediction: [https://github.com/MichSchli/RelationPrediction](https://github.com/MichSchli/RelationPrediction)
### Dependencies
Two extra python packages are needed for this example:
- MXNet nightly build
- requests
......@@ -20,17 +20,17 @@ pip install requests rdflib pandas
Example code was tested with rdflib 4.2.2 and pandas 0.23.4
### Entity Classification
AIFB: accuracy 97.22% (DGL), 95.83% (paper)
AIFB: accuracy 97.22% (5 runs, DGL), 95.83% (paper)
```
DGLBACKEND=mxnet python3 entity_classify.py -d aifb --testing --gpu 0
```
MUTAG: accuracy 73.53% (DGL), 73.23% (paper)
MUTAG: accuracy 70.59% (5 runs, DGL), 73.23% (paper)
```
DGLBACKEND=mxnet python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 40 --testing --gpu 0
```
BGS: accuracy 75.86% (DGL, n-bases=20, OOM when >20), 83.10% (paper)
BGS: accuracy 86.21% (5 runs, DGL, n-bases=20), 83.10% (paper)
```
DGLBACKEND=mxnet python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 20 --testing --gpu 0 --relabel
DGLBACKEND=mxnet python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 20 --testing --gpu 0
```
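These commands rely on the new `dgl.data.rdf` loaders added in this change. A minimal usage sketch, restricted to names that appear in the diffs below:

```python
from dgl.data.rdf import AIFBDataset

dataset = AIFBDataset()                 # also: MUTAGDataset, BGSDataset, AMDataset
hg = dataset[0]                         # a heterogeneous graph
category = dataset.predict_category     # node type to classify
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data['train_mask']
labels = hg.nodes[category].data['labels']
```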
......@@ -14,10 +14,11 @@ import time
import mxnet as mx
from mxnet import gluon
import mxnet.ndarray as F
from dgl import DGLGraph
import dgl
from dgl.nn.mxnet import RelGraphConv
from dgl.contrib.data import load_data
from functools import partial
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import BaseRGCN
......@@ -39,13 +40,29 @@ class EntityClassify(BaseRGCN):
def main(args):
# load graph data
data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel)
num_nodes = data.num_nodes
num_rels = data.num_rels
num_classes = data.num_classes
labels = data.labels
train_idx = data.train_idx
test_idx = data.test_idx
if args.dataset == 'aifb':
dataset = AIFBDataset()
elif args.dataset == 'mutag':
dataset = MUTAGDataset()
elif args.dataset == 'bgs':
dataset = BGSDataset()
elif args.dataset == 'am':
dataset = AMDataset()
else:
raise ValueError()
# Load from hetero-graph
hg = dataset[0]
num_rels = len(hg.canonical_etypes)
num_of_ntype = len(hg.ntypes)
category = dataset.predict_category
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data.pop('train_mask')
test_mask = hg.nodes[category].data.pop('test_mask')
train_idx = mx.nd.array(np.nonzero(train_mask.asnumpy())[0], dtype='int64')
test_idx = mx.nd.array(np.nonzero(test_mask.asnumpy())[0], dtype='int64')
labels = mx.nd.array(hg.nodes[category].data.pop('labels'), dtype='int64')
# split dataset into train, validate, test
if args.validation:
......@@ -54,13 +71,35 @@ def main(args):
else:
val_idx = train_idx
train_idx = mx.nd.array(train_idx)
# calculate norm for each edge type and store in edge
for canonical_etype in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
v = v.asnumpy()
_, inverse_index, count = np.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
norm = np.ones(eid.shape[0]) / degrees
hg.edges[canonical_etype].data['norm'] = mx.nd.expand_dims(mx.nd.array(norm), axis=1)
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
if ntype == category:
category_id = i
g = dgl.to_homo(hg)
num_nodes = g.number_of_nodes()
node_ids = mx.nd.arange(num_nodes)
edge_norm = g.edata['norm']
edge_type = g.edata[dgl.ETYPE]
# find out the target node ids in g
node_tids = g.ndata[dgl.NTYPE]
loc = (node_tids == category_id)
loc = mx.nd.array(np.nonzero(loc.asnumpy())[0], dtype='int64')
target_idx = node_ids[loc]
# since the nodes are featureless, the input feature is then the node id.
feats = mx.nd.arange(num_nodes, dtype='int32')
# edge type and normalization factor
edge_type = mx.nd.array(data.edge_type, dtype='int32')
edge_norm = mx.nd.array(data.edge_norm).expand_dims(1)
labels = mx.nd.array(labels).reshape((-1))
# check cuda
use_cuda = args.gpu >= 0
......@@ -71,16 +110,12 @@ def main(args):
edge_norm = edge_norm.as_in_context(ctx)
labels = labels.as_in_context(ctx)
train_idx = train_idx.as_in_context(ctx)
g = g.to(ctx)
else:
ctx = mx.cpu(0)
# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(data.edge_src, data.edge_dst)
# create model
model = EntityClassify(len(g),
model = EntityClassify(num_nodes,
args.n_hidden,
num_classes,
num_rels,
......@@ -103,6 +138,7 @@ def main(args):
t0 = time.time()
with mx.autograd.record():
pred = model(g, feats, edge_type, edge_norm)
pred = pred[target_idx]
loss = loss_fcn(pred[train_idx], labels[train_idx])
t1 = time.time()
loss.backward()
......@@ -113,13 +149,15 @@ def main(args):
backward_time.append(t2 - t1)
print("Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
format(epoch, forward_time[-1], backward_time[-1]))
train_acc = F.sum(pred[train_idx].argmax(axis=1) == labels[train_idx]).asscalar() / train_idx.shape[0]
val_acc = F.sum(pred[val_idx].argmax(axis=1) == labels[val_idx]).asscalar() / len(val_idx)
train_acc = F.sum(mx.nd.cast(pred[train_idx].argmax(axis=1), 'int64') == labels[train_idx]).asscalar() / train_idx.shape[0]
val_acc = F.sum(mx.nd.cast(pred[val_idx].argmax(axis=1), 'int64') == labels[val_idx]).asscalar() / len(val_idx)
print("Train Accuracy: {:.4f} | Validation Accuracy: {:.4f}".format(train_acc, val_acc))
print()
logits = model.forward(g, feats, edge_type, edge_norm)
test_acc = F.sum(logits[test_idx].argmax(axis=1) == labels[test_idx]).asscalar() / len(test_idx)
logits = logits[target_idx]
test_acc = F.sum(mx.nd.cast(logits[test_idx].argmax(axis=1), 'int64') == labels[test_idx]).asscalar() / len(test_idx)
print("Test Accuracy: {:.4f}".format(test_acc))
print()
......@@ -147,8 +185,6 @@ if __name__ == '__main__':
help="dataset to use")
parser.add_argument("--l2norm", type=float, default=0,
help="l2 norm coef")
parser.add_argument("--relabel", default=False, action='store_true',
help="remove untouched nodes and relabel")
parser.add_argument("--use-self-loop", default=False, action='store_true',
help="include self feature as a special relation")
fp = parser.add_mutually_exclusive_group(required=False)
......
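The normalization loop above gives each edge the weight 1/(in-degree of its destination) within its edge type. A standalone NumPy sketch of the same computation (the array values are illustrative):

```python
import numpy as np

v = np.array([0, 1, 1, 2, 2, 2])   # destination node of each edge for one edge type
_, inverse_index, count = np.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]      # in-degree of every edge's destination
norm = np.ones(len(v)) / degrees    # [1.0, 0.5, 0.5, 0.333, 0.333, 0.333]
```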
......@@ -36,46 +36,47 @@ Example code was tested with rdflib 4.2.2 and pandas 0.23.4
All experiments use one-hot encoding as featureless input. Best accuracy reported.
AIFB: accuracy 97.22% (DGL), 95.83% (paper)
AIFB: accuracy 96.11% (5 runs, DGL), 95.83% (paper)
```
python3 entity_classify.py -d aifb --testing --gpu 0
```
MUTAG: accuracy 73.53% (DGL), 73.23% (paper)
MUTAG: accuracy 72.06% (5 runs, DGL), 73.23% (paper)
```
python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0
```
BGS: accuracy 93.10% (DGL), 83.10% (paper)
BGS: accuracy 91.73% (5 runs, DGL), 83.10% (paper)
```
python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0
```
AM: accuracy 91.41% (DGL), 89.29% (paper)
AM: accuracy 88.28% (5 runs, DGL), 89.29% (paper)
```
python3 entity_classify.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0
```
### Entity Classification w/ minibatch training
Accuracy numbers are reported by 10 runs.
Accuracy numbers are reported over 5 runs (best and average).
AIFB: accuracy best=97.22% avg=93.33%
AIFB: accuracy best=97.22% avg=94.44%
```
python3 entity_classify_mb.py -d aifb --testing --gpu 0 --fanout=8
```
MUTAG: accuracy best=76.47% avg=68.38%
MUTAG: accuracy best=76.47% avg=67.37%
```
python3 entity_classify_mb.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size=50 --fanout=8
```
BGS: accuracy best=96.55% avg=92.41%
BGS: accuracy best=96.55% avg=91.04%
```
python3 entity_classify_mb.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0
```
AM: accuracy best=90.91% avg=88.43%
AM: accuracy best=89.39% avg=88.55%
```
python3 entity_classify_mb.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0
```
......
......@@ -9,28 +9,30 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl.data.rdf import AIFB, MUTAG, BGS, AM
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import EntityClassify
def main(args):
# load graph data
if args.dataset == 'aifb':
dataset = AIFB()
dataset = AIFBDataset()
elif args.dataset == 'mutag':
dataset = MUTAG()
dataset = MUTAGDataset()
elif args.dataset == 'bgs':
dataset = BGS()
dataset = BGSDataset()
elif args.dataset == 'am':
dataset = AM()
dataset = AMDataset()
else:
raise ValueError()
g = dataset.graph
g = dataset[0]
category = dataset.predict_category
num_classes = dataset.num_classes
train_idx = dataset.train_idx
test_idx = dataset.test_idx
labels = dataset.labels
train_mask = g.nodes[category].data.pop('train_mask')
test_mask = g.nodes[category].data.pop('test_mask')
train_idx = th.nonzero(train_mask).squeeze()
test_idx = th.nonzero(test_mask).squeeze()
labels = g.nodes[category].data.pop('labels')
category_id = len(g.ntypes)
for i, ntype in enumerate(g.ntypes):
if ntype == category:
......
......@@ -13,7 +13,7 @@ from torch.utils.data import DataLoader
from functools import partial
import dgl
from dgl.data.rdf import AIFB, MUTAG, BGS, AM
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import EntityClassify, RelGraphEmbed
def extract_embed(node_embed, input_nodes):
......@@ -45,22 +45,24 @@ def evaluate(model, loader, node_embed, labels, category, device):
def main(args):
# load graph data
if args.dataset == 'aifb':
dataset = AIFB()
dataset = AIFBDataset()
elif args.dataset == 'mutag':
dataset = MUTAG()
dataset = MUTAGDataset()
elif args.dataset == 'bgs':
dataset = BGS()
dataset = BGSDataset()
elif args.dataset == 'am':
dataset = AM()
dataset = AMDataset()
else:
raise ValueError()
g = dataset.graph
g = dataset[0]
category = dataset.predict_category
num_classes = dataset.num_classes
train_idx = dataset.train_idx
test_idx = dataset.test_idx
labels = dataset.labels
train_mask = g.nodes[category].data.pop('train_mask')
test_mask = g.nodes[category].data.pop('test_mask')
train_idx = th.nonzero(train_mask).squeeze()
test_idx = th.nonzero(test_mask).squeeze()
labels = g.nodes[category].data.pop('labels')
# split dataset into train, validate, test
if args.validation:
......
......@@ -11,21 +11,22 @@ from entity_classify import EntityClassify
def main(args):
# load graph data
if args.dataset == 'aifb':
dataset = AIFB()
dataset = AIFBDataset()
elif args.dataset == 'mutag':
dataset = MUTAG()
dataset = MUTAGDataset()
elif args.dataset == 'bgs':
dataset = BGS()
dataset = BGSDataset()
elif args.dataset == 'am':
dataset = AM()
dataset = AMDataset()
else:
raise ValueError()
g = dataset.graph
g = dataset[0]
category = dataset.predict_category
num_classes = dataset.num_classes
test_idx = dataset.test_idx
labels = dataset.labels
test_mask = g.nodes[category].data.pop('test_mask')
test_idx = th.nonzero(test_mask).squeeze()
labels = g.nodes[category].data.pop('labels')
# check cuda
use_cuda = args.gpu >= 0 and th.cuda.is_available()
......@@ -42,7 +43,6 @@ def main(args):
num_bases=args.n_bases,
num_hidden_layers=args.n_layers - 2,
use_self_loop=args.use_self_loop)
# training loop
model.load_state_dict(th.load(args.model_path))
if use_cuda:
model.cuda()
......@@ -54,7 +54,7 @@ def main(args):
test_acc = th.sum(logits[test_idx].argmax(dim=1) == labels[test_idx]).item() / len(test_idx)
print("Test Acc: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.item()))
print()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='RGCN')
parser.add_argument("--n-hidden", type=int, default=16,
......
......@@ -17,44 +17,44 @@ pip install requests torch rdflib pandas
Example code was tested with rdflib 4.2.2 and pandas 0.23.4
### Entity Classification
AIFB: accuracy 97.22% (DGL), 95.83% (paper)
AIFB: accuracy 92.59% (3 runs, DGL), 95.83% (paper)
```
python3 entity_classify.py -d aifb --testing --gpu 0
```
MUTAG: accuracy 75% (DGL), 73.23% (paper)
MUTAG: accuracy 72.55% (3 runs, DGL), 73.23% (paper)
```
python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0
```
BGS: accuracy 82.76% (DGL), 83.10% (paper)
BGS: accuracy 89.66% (3 runs, DGL), 83.10% (paper)
```
python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --relabel
python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0
```
AM: accuracy 87.37% (DGL), 89.29% (paper)
AM: accuracy 89.73% (3 runs, DGL), 89.29% (paper)
```
python3 entity_classify.py -d am --n-bases=40 --n-hidden=10 --l2norm=5e-4 --testing
```
### Entity Classification with minibatch
AIFB: accuracy avg(5 runs) 94.99%, best 97.22% (DGL)
AIFB: accuracy avg(5 runs) 90.56%, best 94.44% (DGL)
```
python3 entity_classify_mp.py -d aifb --testing --gpu 0 --fanout=20 --batch-size 128
```
MUTAG: accuracy avg(5 runs) 67.06%, best 80.88% (DGL)
MUTAG: accuracy avg(5 runs) 66.77%, best 69.12% (DGL)
```
python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size 256 --use-self-loop --n-epochs 40 --dropout=0.3
python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size 256 --use-self-loop --n-epochs 40
```
BGS: accuracy avg(5 runs) 84.14%, best 89.66% (DGL)
BGS: accuracy avg(5 runs) 91.72%, best 96.55% (DGL)
```
python3 entity_classify_mp.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout 40 --n-epochs=40 --batch-size=128
```
AM: accuracy avg(5 runs) 88.28%, best 90.91% (DGL)
AM: accuracy avg(5 runs) 88.28%, best 90.40% (DGL)
```
python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout 35 --batch-size 256 --lr 1e-2 --n-hidden 16 --use-self-loop --n-epochs=40
```
......
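Per the scripts in this change, `--validation` carves the first fifth of the training indices out as a validation set; with `--testing`, the training indices double as the validation set. Condensed from the code below:

```python
if args.validation:
    val_idx = train_idx[:len(train_idx) // 5]    # first 20% for validation
    train_idx = train_idx[len(train_idx) // 5:]
else:                                            # --testing
    val_idx = train_idx
```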
......@@ -13,10 +13,10 @@ import numpy as np
import time
import torch
import torch.nn.functional as F
from dgl import DGLGraph
import dgl
from dgl.nn.pytorch import RelGraphConv
from dgl.contrib.data import load_data
from functools import partial
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import BaseRGCN
......@@ -44,13 +44,29 @@ class EntityClassify(BaseRGCN):
def main(args):
# load graph data
data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel)
num_nodes = data.num_nodes
num_rels = data.num_rels
num_classes = data.num_classes
labels = data.labels
train_idx = data.train_idx
test_idx = data.test_idx
if args.dataset == 'aifb':
dataset = AIFBDataset()
elif args.dataset == 'mutag':
dataset = MUTAGDataset()
elif args.dataset == 'bgs':
dataset = BGSDataset()
elif args.dataset == 'am':
dataset = AMDataset()
else:
raise ValueError()
# Load from hetero-graph
hg = dataset[0]
num_rels = len(hg.canonical_etypes)
num_of_ntype = len(hg.ntypes)
category = dataset.predict_category
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data.pop('train_mask')
test_mask = hg.nodes[category].data.pop('test_mask')
train_idx = torch.nonzero(train_mask).squeeze()
test_idx = torch.nonzero(test_mask).squeeze()
labels = hg.nodes[category].data.pop('labels')
# split dataset into train, validate, test
if args.validation:
......@@ -59,14 +75,35 @@ def main(args):
else:
val_idx = train_idx
# calculate norm for each edge type and store in edge
for canonical_etype in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
_, inverse_index, count = torch.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
norm = torch.ones(eid.shape[0]).float() / degrees.float()
norm = norm.unsqueeze(1)
hg.edges[canonical_etype].data['norm'] = norm
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
if ntype == category:
category_id = i
g = dgl.to_homo(hg)
num_nodes = g.number_of_nodes()
node_ids = torch.arange(num_nodes)
edge_norm = g.edata['norm']
edge_type = g.edata[dgl.ETYPE].long()
# find out the target node ids in g
node_tids = g.ndata[dgl.NTYPE]
loc = (node_tids == category_id)
target_idx = node_ids[loc]
# since the nodes are featureless, the input feature is then the node id.
feats = torch.arange(num_nodes)
# edge type and normalization factor
edge_type = torch.from_numpy(data.edge_type).long()
edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1).long()
labels = torch.from_numpy(labels).view(-1).long()
# check cuda
use_cuda = args.gpu >= 0 and torch.cuda.is_available()
if use_cuda:
......@@ -76,13 +113,8 @@ def main(args):
edge_norm = edge_norm.cuda()
labels = labels.cuda()
# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(data.edge_src, data.edge_dst)
# create model
model = EntityClassify(len(g),
model = EntityClassify(num_nodes,
args.n_hidden,
num_classes,
num_rels,
......@@ -108,6 +140,7 @@ def main(args):
optimizer.zero_grad()
t0 = time.time()
logits = model(g, feats, edge_type, edge_norm)
logits = logits[target_idx]
loss = F.cross_entropy(logits[train_idx], labels[train_idx])
t1 = time.time()
loss.backward()
......@@ -127,6 +160,7 @@ def main(args):
model.eval()
logits = model.forward(g, feats, edge_type, edge_norm)
logits = logits[target_idx]
test_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
test_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx]).item() / len(test_idx)
print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.item()))
......@@ -156,8 +190,6 @@ if __name__ == '__main__':
help="dataset to use")
parser.add_argument("--l2norm", type=float, default=0,
help="l2 norm coef")
parser.add_argument("--relabel", default=False, action='store_true',
help="remove untouched nodes and relabel")
parser.add_argument("--use-self-loop", default=False, action='store_true',
help="include self feature as a special relation")
fp = parser.add_mutually_exclusive_group(required=False)
......@@ -167,5 +199,4 @@ if __name__ == '__main__':
args = parser.parse_args()
print(args)
args.bfs_level = args.n_layers + 1 # pruning used nodes for memory
main(args)
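The conversion pattern used above: `dgl.to_homo` concatenates the nodes of all types into one graph and records each node's and edge's original type in `g.ndata[dgl.NTYPE]` / `g.edata[dgl.ETYPE]`, so the model-output rows belonging to the target category can be recovered with a boolean mask. A condensed sketch, reusing names defined in the script above:

```python
import dgl
import torch

# hg, category_id, model, feats, edge_type, edge_norm as defined above
g = dgl.to_homo(hg)                        # flatten the heterograph; types kept in ndata/edata
node_tids = g.ndata[dgl.NTYPE]             # original node-type id of every node
target_idx = torch.arange(g.number_of_nodes())[node_tids == category_id]

logits = model(g, feats, edge_type, edge_norm)
logits = logits[target_idx]                # rows for the predict_category nodes only
```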
......@@ -21,7 +21,7 @@ import dgl
from dgl import DGLGraph
from functools import partial
from dgl.data.rdf import AIFB, MUTAG, BGS, AM
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import RelGraphEmbedLayer
from dgl.nn import RelGraphConv
from utils import thread_wrapped_func
......@@ -321,32 +321,34 @@ def run(proc_id, n_gpus, args, devices, dataset):
print("{}/{} Mean forward time: {:4f}".format(proc_id, n_gpus,
np.mean(forward_time[len(forward_time) // 4:])))
print("{}/{} Mean backward time: {:4f}".format(proc_id, n_gpus,
np.mean(backward_time[len(backward_time) // 4:])))
def main(args, devices):
# load graph data
ogb_dataset = False
if args.dataset == 'aifb':
dataset = AIFB()
dataset = AIFBDataset()
elif args.dataset == 'mutag':
dataset = MUTAG()
dataset = MUTAGDataset()
elif args.dataset == 'bgs':
dataset = BGS()
dataset = BGSDataset()
elif args.dataset == 'am':
dataset = AM()
dataset = AMDataset()
else:
raise ValueError()
# Load from hetero-graph
hg = dataset.graph
hg = dataset[0]
num_rels = len(hg.canonical_etypes)
num_of_ntype = len(hg.ntypes)
category = dataset.predict_category
num_classes = dataset.num_classes
train_idx = dataset.train_idx
test_idx = dataset.test_idx
labels = dataset.labels
train_mask = hg.nodes[category].data.pop('train_mask')
test_mask = hg.nodes[category].data.pop('test_mask')
labels = hg.nodes[category].data.pop('labels')
train_idx = th.nonzero(train_mask).squeeze()
test_idx = th.nonzero(test_mask).squeeze()
# split dataset into train, validate, test
if args.validation:
......@@ -356,14 +358,14 @@ def main(args, devices):
val_idx = train_idx
# calculate norm for each edge type and store in edge
for canonical_etypes in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etypes)
for canonical_etype in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
_, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
norm = th.ones(eid.shape[0]) / degrees
norm = norm.unsqueeze(1)
hg.edges[canonical_etypes].data['norm'] = norm
hg.edges[canonical_etype].data['norm'] = norm
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
......@@ -385,7 +387,7 @@ def main(args, devices):
n_gpus = len(devices)
# cpu
if devices[0] == -1:
run(0, 0, args, ['cpu'],
(g, num_of_ntype, num_classes, num_rels, target_idx,
train_idx, val_idx, test_idx, labels))
# gpu
......
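The CPU branch above calls `run` directly; the GPU branch is elided in this hunk. A sketch only, assuming `torch.multiprocessing` and the `thread_wrapped_func` helper imported at the top of the file, of the usual one-worker-per-device pattern:

```python
import torch.multiprocessing as mp

procs = []
for proc_id in range(n_gpus):
    p = mp.Process(target=thread_wrapped_func(run),
                   args=(proc_id, n_gpus, args, devices,
                         (g, num_of_ntype, num_classes, num_rels, target_idx,
                          train_idx, val_idx, test_idx, labels)))
    p.start()                # each worker trains on devices[proc_id]
    procs.append(p)
for p in procs:
    p.join()
```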
......@@ -11,23 +11,24 @@
* pandas
```
pip install requests torch rdflib pandas
pip install requests tensorflow rdflib pandas
export DGLBACKEND=tensorflow
```
Example code was tested with rdflib 4.2.2 and pandas 0.23.4
### Entity Classification
AIFB: accuracy 97.22% (DGL), 95.83% (paper)
AIFB: accuracy 92.78% (5 runs, DGL), 95.83% (paper)
```
python3 entity_classify.py -d aifb --testing --gpu 0
```
MUTAG: accuracy 75% (DGL), 73.23% (paper)
MUTAG: accuracy 71.47% (5 runs, DGL), 73.23% (paper)
```
python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0
```
BGS: accuracy 79.3% (DGL, n-bases=25), 83.10% (paper, n-bases=40)
BGS: accuracy 93.10% (5 runs, DGL, n-bases=25), 83.10% (paper, n-bases=40)
```
python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 25 --testing --gpu 0 --relabel
python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 25 --testing --gpu 0
```
......@@ -13,10 +13,10 @@ import numpy as np
import time
import tensorflow as tf
from tensorflow.keras import layers
from dgl import DGLGraph
import dgl
from dgl.nn.tensorflow import RelGraphConv
from dgl.contrib.data import load_data
from functools import partial
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import BaseRGCN
......@@ -49,28 +49,56 @@ def acc(logits, labels, mask):
def main(args):
# load graph data
data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel)
num_nodes = data.num_nodes
num_rels = data.num_rels
num_classes = data.num_classes
labels = data.labels
train_idx = data.train_idx
test_idx = data.test_idx
# split dataset into train, validate, test
if args.validation:
val_idx = train_idx[:len(train_idx) // 5]
train_idx = train_idx[len(train_idx) // 5:]
if args.dataset == 'aifb':
dataset = AIFBDataset()
elif args.dataset == 'mutag':
dataset = MUTAGDataset()
elif args.dataset == 'bgs':
dataset = BGSDataset()
elif args.dataset == 'am':
dataset = AMDataset()
else:
val_idx = train_idx
# since the nodes are featureless, the input feature is then the node id.
feats = tf.range(num_nodes, dtype=tf.int64)
# edge type and normalization factor
edge_type = tf.convert_to_tensor(data.edge_type)
edge_norm = tf.expand_dims(tf.convert_to_tensor(data.edge_norm), 1)
labels = tf.reshape(tf.convert_to_tensor(labels), (-1, ))
raise ValueError()
# preprocessing in cpu
with tf.device("/cpu:0"):
# Load from hetero-graph
hg = dataset[0]
num_rels = len(hg.canonical_etypes)
num_of_ntype = len(hg.ntypes)
category = dataset.predict_category
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data.pop('train_mask')
test_mask = hg.nodes[category].data.pop('test_mask')
train_idx = tf.squeeze(tf.where(train_mask))
test_idx = tf.squeeze(tf.where(test_mask))
labels = hg.nodes[category].data.pop('labels')
# split dataset into train, validate, test
if args.validation:
val_idx = train_idx[:len(train_idx) // 5]
train_idx = train_idx[len(train_idx) // 5:]
else:
val_idx = train_idx
# calculate norm for each edge type and store in edge
for canonical_etype in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
_, inverse_index, count = tf.unique_with_counts(v)
degrees = tf.gather(count, inverse_index)
norm = tf.ones(eid.shape[0]) / tf.cast(degrees, tf.float32)
norm = tf.expand_dims(norm, 1)
hg.edges[canonical_etype].data['norm'] = norm
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
if ntype == category:
category_id = i
# edge type and normalization factor
g = dgl.to_homo(hg)
# check cuda
if args.gpu < 0:
......@@ -78,25 +106,32 @@ def main(args):
use_cuda = False
else:
device = "/gpu:{}".format(args.gpu)
g = g.to(device)
use_cuda = True
with tf.device(device):
num_nodes = g.number_of_nodes()
node_ids = tf.range(num_nodes, dtype=tf.int64)
edge_norm = g.edata['norm']
edge_type = tf.cast(g.edata[dgl.ETYPE], tf.int64)
# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(data.edge_src, data.edge_dst)
# find out the target node ids in g
node_tids = g.ndata[dgl.NTYPE]
loc = (node_tids == category_id)
target_idx = tf.squeeze(tf.where(loc))
# since the nodes are featureless, the input feature is then the node id.
feats = tf.range(num_nodes, dtype=tf.int64)
with tf.device(device):
# create model
model = EntityClassify(len(g),
args.n_hidden,
num_classes,
num_rels,
num_bases=args.n_bases,
num_hidden_layers=args.n_layers - 2,
dropout=args.dropout,
use_self_loop=args.use_self_loop,
use_cuda=use_cuda)
model = EntityClassify(num_nodes,
args.n_hidden,
num_classes,
num_rels,
num_bases=args.n_bases,
num_hidden_layers=args.n_layers - 2,
dropout=args.dropout,
use_self_loop=args.use_self_loop,
use_cuda=use_cuda)
# optimizer
optimizer = tf.keras.optimizers.Adam(
......@@ -111,9 +146,10 @@ def main(args):
t0 = time.time()
with tf.GradientTape() as tape:
logits = model(g, feats, edge_type, edge_norm)
logits = tf.gather(logits, target_idx)
loss = loss_fcn(tf.gather(labels, train_idx), tf.gather(logits, train_idx))
# Manual weight decay
# TensorFlow's Adam(W) optimizer implements weight decay differently
# from PyTorch's, which leads to worse results here. Manually adding
# the weights' L2 penalty to the loss solves this problem.
for weight in model.trainable_weights:
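The body of this loop falls outside the hunk. A plausible reconstruction of the manual L2 penalty described by the comment (a sketch only; the exact expression is an assumption, reusing the script's `--l2norm` coefficient):

```python
for weight in model.trainable_weights:
    # assumed: add an L2 penalty per trainable weight instead of relying on AdamW
    loss = loss + args.l2norm * tf.nn.l2_loss(weight)
```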
......@@ -136,6 +172,7 @@ def main(args):
print()
logits = model(g, feats, edge_type, edge_norm)
logits = tf.gather(logits, target_idx)
test_loss = loss_fcn(tf.gather(labels, test_idx), tf.gather(logits, test_idx))
test_acc = acc(logits, labels, test_idx)
print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.numpy().item()))
......@@ -165,8 +202,6 @@ if __name__ == '__main__':
help="dataset to use")
parser.add_argument("--l2norm", type=float, default=0,
help="l2 norm coef")
parser.add_argument("--relabel", default=False, action='store_true',
help="remove untouched nodes and relabel")
parser.add_argument("--use-self-loop", default=False, action='store_true',
help="include self feature as a special relation")
fp = parser.add_mutually_exclusive_group(required=False)
......
......@@ -4,6 +4,7 @@
from __future__ import absolute_import
import os, sys
import abc
from .utils import download, extract_archive, get_download_dir, makedirs
from ..utils import retry_method_with_fix
......@@ -37,7 +38,7 @@ class DGLDataset(object):
Default: ~/.dgl/
save_dir : str
Directory to save the processed dataset.
Default: ~/.dgl/
Default: same as raw_dir
force_reload : bool
Whether to reload the dataset. Default: False
verbose : bool
......@@ -190,14 +191,16 @@ class DGLDataset(object):
"""
return self._verbose
@abc.abstractmethod
def __getitem__(self, idx):
r"""Gets the data object at index.
"""
raise NotImplementedError
pass
@abc.abstractmethod
def __len__(self):
r"""The number of examples in the dataset."""
raise NotImplementedError
pass
class DGLBuiltinDataset(DGLDataset):
r"""The Basic DGL Builtin Dataset.
......
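For reference, the template above leaves `__getitem__` and `__len__` abstract. A minimal hypothetical subclass (the constructor keywords follow the docstring fragments above; the `process` hook is an assumption and not shown in this diff):

```python
from dgl.data import DGLDataset  # assumed public import path

class ToyDataset(DGLDataset):
    """Hypothetical dataset used only to illustrate the template."""

    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        self._graphs = []
        super(ToyDataset, self).__init__(name='toy', raw_dir=raw_dir,
                                         force_reload=force_reload,
                                         verbose=verbose)

    def process(self):
        # assumed hook: populate self._graphs from files under the raw directory
        pass

    def __getitem__(self, idx):
        return self._graphs[idx]   # the data object at the given index

    def __len__(self):
        return len(self._graphs)   # number of examples
```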
......@@ -5,13 +5,16 @@ import os
import sys
import hashlib
import warnings
import numpy as np
import warnings
import requests
import pickle
import errno
import numpy as np
from .graph_serialize import save_graphs, load_graphs, load_labels
from .tensor_serialize import save_tensors, load_tensors
from .. import backend as F
__all__ = ['loadtxt','download', 'check_sha1', 'extract_archive',
'get_download_dir', 'Subset', 'split_dataset',
'save_graphs', "load_graphs", "load_labels", "save_tensors", "load_tensors"]
......@@ -237,6 +240,13 @@ def get_download_dir():
os.makedirs(dirname)
return dirname
def makedirs(path):
    try:
        os.makedirs(os.path.expanduser(os.path.normpath(path)))
    except OSError as e:
        # ignore the error only if the directory already exists
        if e.errno != errno.EEXIST or not os.path.isdir(path):
            raise e
def save_info(path, info):
""" Save dataset related information into disk.
......@@ -268,6 +278,39 @@ def load_info(path):
info = pickle.load(pf)
return info
def deprecate_property(old, new):
warnings.warn('Property {} will be deprecated, please use {} instead.'.format(old, new))
def deprecate_function(old, new):
warnings.warn('Function {} will be deprecated, please use {} instead.'.format(old, new))
def deprecate_class(old, new):
warnings.warn('Class {} will be deprecated, please use {} instead.'.format(old, new))
def idx2mask(idx, len):
    """Create a 0/1 mask array of the given length with ones at the positions in idx."""
    mask = np.zeros(len)
    mask[idx] = 1
    return mask
def generate_mask_tensor(mask):
    """Generate a mask tensor for the backend in use.

    For PyTorch and TensorFlow this creates a bool tensor;
    for MXNet it creates a float32 tensor.

    Parameters
    ----------
    mask: numpy ndarray
        input mask array
    """
    assert isinstance(mask, np.ndarray), "input for generate_mask_tensor " \
        "should be a numpy ndarray"
    if F.backend_name == 'mxnet':
        return F.tensor(mask, dtype=F.data_type_dict['float32'])
    else:
        return F.tensor(mask, dtype=F.data_type_dict['bool'])
class Subset(object):
"""Subset of a dataset at specified indices
......
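A usage sketch for the two mask helpers added above (import path assumed; values illustrative):

```python
import numpy as np
from dgl.data.utils import idx2mask, generate_mask_tensor  # assumed import path

train_idx = np.array([0, 2, 5])
mask = idx2mask(train_idx, 8)             # array([1., 0., 1., 0., 0., 1., 0., 0.])
train_mask = generate_mask_tensor(mask)   # bool tensor; float32 under MXNet
```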