"vscode:/vscode.git/clone" did not exist on "8088cc94f2155403f6b09cd54edadafa68daa977"
Unverified commit aee10679 authored by xiang song (charlie.song), committed by GitHub

[Dataset] RDF dataset with DGL-Dataset template (#1869)



* update rdf builtin dataset

* Fix

* use new dataset

* fix

* rdf dataset using new framework

* tf work

* Fix mxnet

* Fix tensorflow

* Fix mxnet

* Update

* upd

* update some docstring

* clean some dead code
Co-authored-by: Ubuntu <ubuntu@ip-172-31-51-214.ec2.internal>
parent 15411d93
@@ -5,7 +5,7 @@
 * Author's code for link prediction: [https://github.com/MichSchli/RelationPrediction](https://github.com/MichSchli/RelationPrediction)

 ### Dependencies
 Two extra python packages are needed for this example:
 - MXNet nightly build
 - requests
@@ -20,17 +20,17 @@ pip install requests rdflib pandas
 Example code was tested with rdflib 4.2.2 and pandas 0.23.4

 ### Entity Classification
-AIFB: accuracy 97.22% (DGL), 95.83% (paper)
+AIFB: accuracy 97.22% (5 runs, DGL), 95.83% (paper)
 ```
 DGLBACKEND=mxnet python3 entity_classify.py -d aifb --testing --gpu 0
 ```
-MUTAG: accuracy 73.53% (DGL), 73.23% (paper)
+MUTAG: accuracy 70.59% (5 runs, DGL), 73.23% (paper)
 ```
 DGLBACKEND=mxnet python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 40 --testing --gpu 0
 ```
-BGS: accuracy 75.86% (DGL, n-basese=20, OOM when >20), 83.10% (paper)
+BGS: accuracy 86.21% (5 runs, DGL, n-bases=20), 83.10% (paper)
 ```
-DGLBACKEND=mxnet python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 20 --testing --gpu 0 --relabel
+DGLBACKEND=mxnet python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 20 --testing --gpu 0
 ```
...@@ -14,10 +14,11 @@ import time ...@@ -14,10 +14,11 @@ import time
import mxnet as mx import mxnet as mx
from mxnet import gluon from mxnet import gluon
import mxnet.ndarray as F import mxnet.ndarray as F
from dgl import DGLGraph import dgl
from dgl.nn.mxnet import RelGraphConv from dgl.nn.mxnet import RelGraphConv
from dgl.contrib.data import load_data from dgl.contrib.data import load_data
from functools import partial from functools import partial
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import BaseRGCN from model import BaseRGCN
...@@ -39,13 +40,29 @@ class EntityClassify(BaseRGCN): ...@@ -39,13 +40,29 @@ class EntityClassify(BaseRGCN):
def main(args): def main(args):
# load graph data # load graph data
data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel) if args.dataset == 'aifb':
num_nodes = data.num_nodes dataset = AIFBDataset()
num_rels = data.num_rels elif args.dataset == 'mutag':
num_classes = data.num_classes dataset = MUTAGDataset()
labels = data.labels elif args.dataset == 'bgs':
train_idx = data.train_idx dataset = BGSDataset()
test_idx = data.test_idx elif args.dataset == 'am':
dataset = AMDataset()
else:
raise ValueError()
# Load from hetero-graph
hg = dataset[0]
num_rels = len(hg.canonical_etypes)
num_of_ntype = len(hg.ntypes)
category = dataset.predict_category
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data.pop('train_mask')
test_mask = hg.nodes[category].data.pop('test_mask')
train_idx = mx.nd.array(np.nonzero(train_mask.asnumpy())[0], dtype='int64')
test_idx = mx.nd.array(np.nonzero(test_mask.asnumpy())[0], dtype='int64')
labels = mx.nd.array(hg.nodes[category].data.pop('labels'), dtype='int64')
# split dataset into train, validate, test # split dataset into train, validate, test
if args.validation: if args.validation:
...@@ -54,13 +71,35 @@ def main(args): ...@@ -54,13 +71,35 @@ def main(args):
else: else:
val_idx = train_idx val_idx = train_idx
train_idx = mx.nd.array(train_idx) # calculate norm for each edge type and store in edge
for canonical_etype in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
v = v.asnumpy()
_, inverse_index, count = np.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
norm = np.ones(eid.shape[0]) / degrees
hg.edges[canonical_etype].data['norm'] = mx.nd.expand_dims(mx.nd.array(norm), axis=1)
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
if ntype == category:
category_id = i
g = dgl.to_homo(hg)
num_nodes = g.number_of_nodes()
node_ids = mx.nd.arange(num_nodes)
edge_norm = g.edata['norm']
edge_type = g.edata[dgl.ETYPE]
# find out the target node ids in g
node_tids = g.ndata[dgl.NTYPE]
loc = (node_tids == category_id)
loc = mx.nd.array(np.nonzero(loc.asnumpy())[0], dtype='int64')
target_idx = node_ids[loc]
# since the nodes are featureless, the input feature is then the node id. # since the nodes are featureless, the input feature is then the node id.
feats = mx.nd.arange(num_nodes, dtype='int32') feats = mx.nd.arange(num_nodes, dtype='int32')
# edge type and normalization factor
edge_type = mx.nd.array(data.edge_type, dtype='int32')
edge_norm = mx.nd.array(data.edge_norm).expand_dims(1)
labels = mx.nd.array(labels).reshape((-1))
# check cuda # check cuda
use_cuda = args.gpu >= 0 use_cuda = args.gpu >= 0
...@@ -71,16 +110,12 @@ def main(args): ...@@ -71,16 +110,12 @@ def main(args):
edge_norm = edge_norm.as_in_context(ctx) edge_norm = edge_norm.as_in_context(ctx)
labels = labels.as_in_context(ctx) labels = labels.as_in_context(ctx)
train_idx = train_idx.as_in_context(ctx) train_idx = train_idx.as_in_context(ctx)
g = g.to(ctx)
else: else:
ctx = mx.cpu(0) ctx = mx.cpu(0)
# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(data.edge_src, data.edge_dst)
# create model # create model
model = EntityClassify(len(g), model = EntityClassify(num_nodes,
args.n_hidden, args.n_hidden,
num_classes, num_classes,
num_rels, num_rels,
...@@ -103,6 +138,7 @@ def main(args): ...@@ -103,6 +138,7 @@ def main(args):
t0 = time.time() t0 = time.time()
with mx.autograd.record(): with mx.autograd.record():
pred = model(g, feats, edge_type, edge_norm) pred = model(g, feats, edge_type, edge_norm)
pred = pred[target_idx]
loss = loss_fcn(pred[train_idx], labels[train_idx]) loss = loss_fcn(pred[train_idx], labels[train_idx])
t1 = time.time() t1 = time.time()
loss.backward() loss.backward()
...@@ -113,13 +149,15 @@ def main(args): ...@@ -113,13 +149,15 @@ def main(args):
backward_time.append(t2 - t1) backward_time.append(t2 - t1)
print("Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}". print("Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
format(epoch, forward_time[-1], backward_time[-1])) format(epoch, forward_time[-1], backward_time[-1]))
train_acc = F.sum(pred[train_idx].argmax(axis=1) == labels[train_idx]).asscalar() / train_idx.shape[0]
val_acc = F.sum(pred[val_idx].argmax(axis=1) == labels[val_idx]).asscalar() / len(val_idx) train_acc = F.sum(mx.nd.cast(pred[train_idx].argmax(axis=1), 'int64') == labels[train_idx]).asscalar() / train_idx.shape[0]
val_acc = F.sum(mx.nd.cast(pred[val_idx].argmax(axis=1), 'int64') == labels[val_idx]).asscalar() / len(val_idx)
print("Train Accuracy: {:.4f} | Validation Accuracy: {:.4f}".format(train_acc, val_acc)) print("Train Accuracy: {:.4f} | Validation Accuracy: {:.4f}".format(train_acc, val_acc))
print() print()
logits = model.forward(g, feats, edge_type, edge_norm) logits = model.forward(g, feats, edge_type, edge_norm)
test_acc = F.sum(logits[test_idx].argmax(axis=1) == labels[test_idx]).asscalar() / len(test_idx) logits = logits[target_idx]
test_acc = F.sum(mx.nd.cast(logits[test_idx].argmax(axis=1), 'int64') == labels[test_idx]).asscalar() / len(test_idx)
print("Test Accuracy: {:.4f}".format(test_acc)) print("Test Accuracy: {:.4f}".format(test_acc))
print() print()
...@@ -147,8 +185,6 @@ if __name__ == '__main__': ...@@ -147,8 +185,6 @@ if __name__ == '__main__':
help="dataset to use") help="dataset to use")
parser.add_argument("--l2norm", type=float, default=0, parser.add_argument("--l2norm", type=float, default=0,
help="l2 norm coef") help="l2 norm coef")
parser.add_argument("--relabel", default=False, action='store_true',
help="remove untouched nodes and relabel")
parser.add_argument("--use-self-loop", default=False, action='store_true', parser.add_argument("--use-self-loop", default=False, action='store_true',
help="include self feature as a special relation") help="include self feature as a special relation")
fp = parser.add_mutually_exclusive_group(required=False) fp = parser.add_mutually_exclusive_group(required=False)
......
@@ -36,46 +36,47 @@ Example code was tested with rdflib 4.2.2 and pandas 0.23.4
 All experiments use one-hot encoding as featureless input. Best accuracy reported.

-AIFB: accuracy 97.22% (DGL), 95.83% (paper)
+AIFB: accuracy 96.11% (5 runs, DGL), 95.83% (paper)
 ```
 python3 entity_classify.py -d aifb --testing --gpu 0
 ```
-MUTAG: accuracy 73.53% (DGL), 73.23% (paper)
+MUTAG: accuracy 72.06% (5 runs, DGL), 73.23% (paper)
 ```
 python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0
 ```
-BGS: accuracy 93.10% (DGL), 83.10% (paper)
+BGS: accuracy 91.73% (5 runs, DGL), 83.10% (paper)
 ```
 python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0
 ```
-AM: accuracy 91.41% (DGL), 89.29% (paper)
+AM: accuracy 88.28% (5 runs, DGL), 89.29% (paper)
 ```
 python3 entity_classify.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0
 ```

 ### Entity Classification w/ minibatch training
-Accuracy numbers are reported by 10 runs.
+Accuracy numbers are reported over 5 runs.

-AIFB: accuracy best=97.22% avg=93.33%
+AIFB: accuracy best=97.22% avg=94.44%
 ```
 python3 entity_classify_mb.py -d aifb --testing --gpu 0 --fanout=8
 ```
-MUTAG: accuracy best=76.47% avg=68.38%
+MUTAG: accuracy best=76.47% avg=67.37%
 ```
 python3 entity_classify_mb.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size=50 --fanout=8
 ```
-BGS: accuracy best=96.55% avg=92.41%
+BGS: accuracy best=96.55% avg=91.04%
 ```
 python3 entity_classify_mb.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0
 ```
-AM: accuracy best=90.91% avg=88.43%
+AM: accuracy best=89.39% avg=88.55%
 ```
 python3 entity_classify_mb.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0
 ```
......
@@ -9,28 +9,30 @@ import torch as th
 import torch.nn as nn
 import torch.nn.functional as F

-from dgl.data.rdf import AIFB, MUTAG, BGS, AM
+from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
 from model import EntityClassify

 def main(args):
     # load graph data
     if args.dataset == 'aifb':
-        dataset = AIFB()
+        dataset = AIFBDataset()
     elif args.dataset == 'mutag':
-        dataset = MUTAG()
+        dataset = MUTAGDataset()
     elif args.dataset == 'bgs':
-        dataset = BGS()
+        dataset = BGSDataset()
     elif args.dataset == 'am':
-        dataset = AM()
+        dataset = AMDataset()
     else:
         raise ValueError()

-    g = dataset.graph
+    g = dataset[0]
     category = dataset.predict_category
     num_classes = dataset.num_classes
-    train_idx = dataset.train_idx
-    test_idx = dataset.test_idx
-    labels = dataset.labels
+    train_mask = g.nodes[category].data.pop('train_mask')
+    test_mask = g.nodes[category].data.pop('test_mask')
+    train_idx = th.nonzero(train_mask).squeeze()
+    test_idx = th.nonzero(test_mask).squeeze()
+    labels = g.nodes[category].data.pop('labels')

     category_id = len(g.ntypes)
     for i, ntype in enumerate(g.ntypes):
         if ntype == category:
......
...@@ -13,7 +13,7 @@ from torch.utils.data import DataLoader ...@@ -13,7 +13,7 @@ from torch.utils.data import DataLoader
from functools import partial from functools import partial
import dgl import dgl
from dgl.data.rdf import AIFB, MUTAG, BGS, AM from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import EntityClassify, RelGraphEmbed from model import EntityClassify, RelGraphEmbed
def extract_embed(node_embed, input_nodes): def extract_embed(node_embed, input_nodes):
...@@ -45,22 +45,24 @@ def evaluate(model, loader, node_embed, labels, category, device): ...@@ -45,22 +45,24 @@ def evaluate(model, loader, node_embed, labels, category, device):
def main(args): def main(args):
# load graph data # load graph data
if args.dataset == 'aifb': if args.dataset == 'aifb':
dataset = AIFB() dataset = AIFBDataset()
elif args.dataset == 'mutag': elif args.dataset == 'mutag':
dataset = MUTAG() dataset = MUTAGDataset()
elif args.dataset == 'bgs': elif args.dataset == 'bgs':
dataset = BGS() dataset = BGSDataset()
elif args.dataset == 'am': elif args.dataset == 'am':
dataset = AM() dataset = AMDataset()
else: else:
raise ValueError() raise ValueError()
g = dataset.graph g = dataset[0]
category = dataset.predict_category category = dataset.predict_category
num_classes = dataset.num_classes num_classes = dataset.num_classes
train_idx = dataset.train_idx train_mask = g.nodes[category].data.pop('train_mask')
test_idx = dataset.test_idx test_mask = g.nodes[category].data.pop('test_mask')
labels = dataset.labels train_idx = th.nonzero(train_mask).squeeze()
test_idx = th.nonzero(test_mask).squeeze()
labels = g.nodes[category].data.pop('labels')
# split dataset into train, validate, test # split dataset into train, validate, test
if args.validation: if args.validation:
......
...@@ -11,21 +11,22 @@ from entity_classify import EntityClassify ...@@ -11,21 +11,22 @@ from entity_classify import EntityClassify
def main(args): def main(args):
# load graph data # load graph data
if args.dataset == 'aifb': if args.dataset == 'aifb':
dataset = AIFB() dataset = AIFBDataset()
elif args.dataset == 'mutag': elif args.dataset == 'mutag':
dataset = MUTAG() dataset = MUTAGDataset()
elif args.dataset == 'bgs': elif args.dataset == 'bgs':
dataset = BGS() dataset = BGSDataset()
elif args.dataset == 'am': elif args.dataset == 'am':
dataset = AM() dataset = AMDataset()
else: else:
raise ValueError() raise ValueError()
g = dataset.graph g = dataset[0]
category = dataset.predict_category category = dataset.predict_category
num_classes = dataset.num_classes num_classes = dataset.num_classes
test_idx = dataset.test_idx test_mask = g.nodes[category].data.pop('test_mask')
labels = dataset.labels test_idx = th.nonzero(test_mask).squeeze()
labels = g.nodes[category].data.pop('labels')
# check cuda # check cuda
use_cuda = args.gpu >= 0 and th.cuda.is_available() use_cuda = args.gpu >= 0 and th.cuda.is_available()
...@@ -42,7 +43,6 @@ def main(args): ...@@ -42,7 +43,6 @@ def main(args):
num_bases=args.n_bases, num_bases=args.n_bases,
num_hidden_layers=args.n_layers - 2, num_hidden_layers=args.n_layers - 2,
use_self_loop=args.use_self_loop) use_self_loop=args.use_self_loop)
# training loop
model.load_state_dict(th.load(args.model_path)) model.load_state_dict(th.load(args.model_path))
if use_cuda: if use_cuda:
model.cuda() model.cuda()
...@@ -54,7 +54,7 @@ def main(args): ...@@ -54,7 +54,7 @@ def main(args):
test_acc = th.sum(logits[test_idx].argmax(dim=1) == labels[test_idx]).item() / len(test_idx) test_acc = th.sum(logits[test_idx].argmax(dim=1) == labels[test_idx]).item() / len(test_idx)
print("Test Acc: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.item())) print("Test Acc: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.item()))
print() print()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description='RGCN') parser = argparse.ArgumentParser(description='RGCN')
parser.add_argument("--n-hidden", type=int, default=16, parser.add_argument("--n-hidden", type=int, default=16,
......
@@ -17,44 +17,44 @@ pip install requests torch rdflib pandas
 Example code was tested with rdflib 4.2.2 and pandas 0.23.4

 ### Entity Classification
-AIFB: accuracy 97.22% (DGL), 95.83% (paper)
+AIFB: accuracy 92.59% (3 runs, DGL), 95.83% (paper)
 ```
 python3 entity_classify.py -d aifb --testing --gpu 0
 ```
-MUTAG: accuracy 75% (DGL), 73.23% (paper)
+MUTAG: accuracy 72.55% (3 runs, DGL), 73.23% (paper)
 ```
 python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0
 ```
-BGS: accuracy 82.76% (DGL), 83.10% (paper)
+BGS: accuracy 89.66% (3 runs, DGL), 83.10% (paper)
 ```
-python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --relabel
+python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0
 ```
-AM: accuracy 87.37% (DGL), 89.29% (paper)
+AM: accuracy 89.73% (3 runs, DGL), 89.29% (paper)
 ```
 python3 entity_classify.py -d am --n-bases=40 --n-hidden=10 --l2norm=5e-4 --testing
 ```

 ### Entity Classification with minibatch
-AIFB: accuracy avg(5 runs) 94.99%, best 97.22% (DGL)
+AIFB: accuracy avg(5 runs) 90.56%, best 94.44% (DGL)
 ```
 python3 entity_classify_mp.py -d aifb --testing --gpu 0 --fanout=20 --batch-size 128
 ```
-MUTAG: accuracy avg(5 runs) 67.06%, best 80.88% (DGL)
+MUTAG: accuracy avg(5 runs) 66.77%, best 69.12% (DGL)
 ```
-python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size 256 --use-self-loop --n-epochs 40 --dropout=0.3
+python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size 256 --use-self-loop --n-epochs 40
 ```
-BGS: accuracy avg(5 runs) 84.14%, best 89.66% (DGL)
+BGS: accuracy avg(5 runs) 91.72%, best 96.55% (DGL)
 ```
 python3 entity_classify_mp.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout 40 --n-epochs=40 --batch-size=128
 ```
-AM: accuracy avg(5 runs) 88.28%, best 90.91% (DGL)
+AM: accuracy avg(5 runs) 88.28%, best 90.40% (DGL)
 ```
 python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout 35 --batch-size 256 --lr 1e-2 --n-hidden 16 --use-self-loop --n-epochs=40
 ```
......
...@@ -13,10 +13,10 @@ import numpy as np ...@@ -13,10 +13,10 @@ import numpy as np
import time import time
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from dgl import DGLGraph import dgl
from dgl.nn.pytorch import RelGraphConv from dgl.nn.pytorch import RelGraphConv
from dgl.contrib.data import load_data
from functools import partial from functools import partial
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import BaseRGCN from model import BaseRGCN
...@@ -44,13 +44,29 @@ class EntityClassify(BaseRGCN): ...@@ -44,13 +44,29 @@ class EntityClassify(BaseRGCN):
def main(args): def main(args):
# load graph data # load graph data
data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel) if args.dataset == 'aifb':
num_nodes = data.num_nodes dataset = AIFBDataset()
num_rels = data.num_rels elif args.dataset == 'mutag':
num_classes = data.num_classes dataset = MUTAGDataset()
labels = data.labels elif args.dataset == 'bgs':
train_idx = data.train_idx dataset = BGSDataset()
test_idx = data.test_idx elif args.dataset == 'am':
dataset = AMDataset()
else:
raise ValueError()
# Load from hetero-graph
hg = dataset[0]
num_rels = len(hg.canonical_etypes)
num_of_ntype = len(hg.ntypes)
category = dataset.predict_category
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data.pop('train_mask')
test_mask = hg.nodes[category].data.pop('test_mask')
train_idx = torch.nonzero(train_mask).squeeze()
test_idx = torch.nonzero(test_mask).squeeze()
labels = hg.nodes[category].data.pop('labels')
# split dataset into train, validate, test # split dataset into train, validate, test
if args.validation: if args.validation:
...@@ -59,14 +75,35 @@ def main(args): ...@@ -59,14 +75,35 @@ def main(args):
else: else:
val_idx = train_idx val_idx = train_idx
# calculate norm for each edge type and store in edge
for canonical_etype in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
_, inverse_index, count = torch.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
norm = torch.ones(eid.shape[0]).float() / degrees.float()
norm = norm.unsqueeze(1)
hg.edges[canonical_etype].data['norm'] = norm
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
if ntype == category:
category_id = i
g = dgl.to_homo(hg)
num_nodes = g.number_of_nodes()
node_ids = torch.arange(num_nodes)
edge_norm = g.edata['norm']
edge_type = g.edata[dgl.ETYPE].long()
# find out the target node ids in g
node_tids = g.ndata[dgl.NTYPE]
loc = (node_tids == category_id)
target_idx = node_ids[loc]
# since the nodes are featureless, the input feature is then the node id. # since the nodes are featureless, the input feature is then the node id.
feats = torch.arange(num_nodes) feats = torch.arange(num_nodes)
# edge type and normalization factor
edge_type = torch.from_numpy(data.edge_type).long()
edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1).long()
labels = torch.from_numpy(labels).view(-1).long()
# check cuda # check cuda
use_cuda = args.gpu >= 0 and torch.cuda.is_available() use_cuda = args.gpu >= 0 and torch.cuda.is_available()
if use_cuda: if use_cuda:
...@@ -76,13 +113,8 @@ def main(args): ...@@ -76,13 +113,8 @@ def main(args):
edge_norm = edge_norm.cuda() edge_norm = edge_norm.cuda()
labels = labels.cuda() labels = labels.cuda()
# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(data.edge_src, data.edge_dst)
# create model # create model
model = EntityClassify(len(g), model = EntityClassify(num_nodes,
args.n_hidden, args.n_hidden,
num_classes, num_classes,
num_rels, num_rels,
...@@ -108,6 +140,7 @@ def main(args): ...@@ -108,6 +140,7 @@ def main(args):
optimizer.zero_grad() optimizer.zero_grad()
t0 = time.time() t0 = time.time()
logits = model(g, feats, edge_type, edge_norm) logits = model(g, feats, edge_type, edge_norm)
logits = logits[target_idx]
loss = F.cross_entropy(logits[train_idx], labels[train_idx]) loss = F.cross_entropy(logits[train_idx], labels[train_idx])
t1 = time.time() t1 = time.time()
loss.backward() loss.backward()
...@@ -127,6 +160,7 @@ def main(args): ...@@ -127,6 +160,7 @@ def main(args):
model.eval() model.eval()
logits = model.forward(g, feats, edge_type, edge_norm) logits = model.forward(g, feats, edge_type, edge_norm)
logits = logits[target_idx]
test_loss = F.cross_entropy(logits[test_idx], labels[test_idx]) test_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
test_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx]).item() / len(test_idx) test_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx]).item() / len(test_idx)
print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.item())) print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.item()))
...@@ -156,8 +190,6 @@ if __name__ == '__main__': ...@@ -156,8 +190,6 @@ if __name__ == '__main__':
help="dataset to use") help="dataset to use")
parser.add_argument("--l2norm", type=float, default=0, parser.add_argument("--l2norm", type=float, default=0,
help="l2 norm coef") help="l2 norm coef")
parser.add_argument("--relabel", default=False, action='store_true',
help="remove untouched nodes and relabel")
parser.add_argument("--use-self-loop", default=False, action='store_true', parser.add_argument("--use-self-loop", default=False, action='store_true',
help="include self feature as a special relation") help="include self feature as a special relation")
fp = parser.add_mutually_exclusive_group(required=False) fp = parser.add_mutually_exclusive_group(required=False)
...@@ -167,5 +199,4 @@ if __name__ == '__main__': ...@@ -167,5 +199,4 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
print(args) print(args)
args.bfs_level = args.n_layers + 1 # pruning used nodes for memory
main(args) main(args)
...@@ -21,7 +21,7 @@ import dgl ...@@ -21,7 +21,7 @@ import dgl
from dgl import DGLGraph from dgl import DGLGraph
from functools import partial from functools import partial
from dgl.data.rdf import AIFB, MUTAG, BGS, AM from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import RelGraphEmbedLayer from model import RelGraphEmbedLayer
from dgl.nn import RelGraphConv from dgl.nn import RelGraphConv
from utils import thread_wrapped_func from utils import thread_wrapped_func
...@@ -321,32 +321,34 @@ def run(proc_id, n_gpus, args, devices, dataset): ...@@ -321,32 +321,34 @@ def run(proc_id, n_gpus, args, devices, dataset):
print("{}/{} Mean forward time: {:4f}".format(proc_id, n_gpus, print("{}/{} Mean forward time: {:4f}".format(proc_id, n_gpus,
np.mean(forward_time[len(forward_time) // 4:]))) np.mean(forward_time[len(forward_time) // 4:])))
print("{}/{} Mean backward time: {:4f}".format(proc_id, n_gpus, print("{}/{} Mean backward time: {:4f}".format(proc_id, n_gpus,
np.mean(backward_time[len(backward_time) // 4:]))) np.mean(backward_time[len(backward_time) // 4:])))
def main(args, devices): def main(args, devices):
# load graph data # load graph data
ogb_dataset = False ogb_dataset = False
if args.dataset == 'aifb': if args.dataset == 'aifb':
dataset = AIFB() dataset = AIFBDataset()
elif args.dataset == 'mutag': elif args.dataset == 'mutag':
dataset = MUTAG() dataset = MUTAGDataset()
elif args.dataset == 'bgs': elif args.dataset == 'bgs':
dataset = BGS() dataset = BGSDataset()
elif args.dataset == 'am': elif args.dataset == 'am':
dataset = AM() dataset = AMDataset()
else: else:
raise ValueError() raise ValueError()
# Load from hetero-graph # Load from hetero-graph
hg = dataset.graph hg = dataset[0]
num_rels = len(hg.canonical_etypes) num_rels = len(hg.canonical_etypes)
num_of_ntype = len(hg.ntypes) num_of_ntype = len(hg.ntypes)
category = dataset.predict_category category = dataset.predict_category
num_classes = dataset.num_classes num_classes = dataset.num_classes
train_idx = dataset.train_idx train_mask = hg.nodes[category].data.pop('train_mask')
test_idx = dataset.test_idx test_mask = hg.nodes[category].data.pop('test_mask')
labels = dataset.labels labels = hg.nodes[category].data.pop('labels')
train_idx = th.nonzero(train_mask).squeeze()
test_idx = th.nonzero(test_mask).squeeze()
# split dataset into train, validate, test # split dataset into train, validate, test
if args.validation: if args.validation:
...@@ -356,14 +358,14 @@ def main(args, devices): ...@@ -356,14 +358,14 @@ def main(args, devices):
val_idx = train_idx val_idx = train_idx
# calculate norm for each edge type and store in edge # calculate norm for each edge type and store in edge
for canonical_etypes in hg.canonical_etypes: for canonical_etype in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etypes) u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
_, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True) _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index] degrees = count[inverse_index]
norm = th.ones(eid.shape[0]) / degrees norm = th.ones(eid.shape[0]) / degrees
norm = norm.unsqueeze(1) norm = norm.unsqueeze(1)
hg.edges[canonical_etypes].data['norm'] = norm hg.edges[canonical_etype].data['norm'] = norm
# get target category id # get target category id
category_id = len(hg.ntypes) category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes): for i, ntype in enumerate(hg.ntypes):
...@@ -385,7 +387,7 @@ def main(args, devices): ...@@ -385,7 +387,7 @@ def main(args, devices):
n_gpus = len(devices) n_gpus = len(devices)
# cpu # cpu
if devices[0] == -1: if devices[0] == -1:
run(0, 0, args, ['cpu'], run(0, 0, args, ['cpu'],
(g, num_of_ntype, num_classes, num_rels, target_idx, (g, num_of_ntype, num_classes, num_rels, target_idx,
train_idx, val_idx, test_idx, labels)) train_idx, val_idx, test_idx, labels))
# gpu # gpu
......
@@ -11,23 +11,24 @@
 * pandas
 ```
-pip install requests torch rdflib pandas
+pip install requests tensorflow rdflib pandas
+export DGLBACKEND=tensorflow
 ```
 Example code was tested with rdflib 4.2.2 and pandas 0.23.4

 ### Entity Classification
-AIFB: accuracy 97.22% (DGL), 95.83% (paper)
+AIFB: accuracy 92.78% (5 runs, DGL), 95.83% (paper)
 ```
 python3 entity_classify.py -d aifb --testing --gpu 0
 ```
-MUTAG: accuracy 75% (DGL), 73.23% (paper)
+MUTAG: accuracy 71.47% (5 runs, DGL), 73.23% (paper)
 ```
 python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0
 ```
-BGS: accuracy 79.3% (DGL, n-bases=25), 83.10% (paper, n-bases=40)
+BGS: accuracy 93.10% (5 runs, DGL, n-bases=25), 83.10% (paper, n-bases=40)
 ```
-python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 25 --testing --gpu 0 --relabel
+python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 25 --testing --gpu 0
 ```
...@@ -13,10 +13,10 @@ import numpy as np ...@@ -13,10 +13,10 @@ import numpy as np
import time import time
import tensorflow as tf import tensorflow as tf
from tensorflow.keras import layers from tensorflow.keras import layers
from dgl import DGLGraph import dgl
from dgl.nn.tensorflow import RelGraphConv from dgl.nn.tensorflow import RelGraphConv
from dgl.contrib.data import load_data
from functools import partial from functools import partial
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from model import BaseRGCN from model import BaseRGCN
...@@ -49,28 +49,56 @@ def acc(logits, labels, mask): ...@@ -49,28 +49,56 @@ def acc(logits, labels, mask):
def main(args): def main(args):
# load graph data # load graph data
data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel) if args.dataset == 'aifb':
num_nodes = data.num_nodes dataset = AIFBDataset()
num_rels = data.num_rels elif args.dataset == 'mutag':
num_classes = data.num_classes dataset = MUTAGDataset()
labels = data.labels elif args.dataset == 'bgs':
train_idx = data.train_idx dataset = BGSDataset()
test_idx = data.test_idx elif args.dataset == 'am':
dataset = AMDataset()
# split dataset into train, validate, test
if args.validation:
val_idx = train_idx[:len(train_idx) // 5]
train_idx = train_idx[len(train_idx) // 5:]
else: else:
val_idx = train_idx raise ValueError()
# since the nodes are featureless, the input feature is then the node id. # preprocessing in cpu
feats = tf.range(num_nodes, dtype=tf.int64) with tf.device("/cpu:0"):
# Load from hetero-graph
# edge type and normalization factor hg = dataset[0]
edge_type = tf.convert_to_tensor(data.edge_type)
edge_norm = tf.expand_dims(tf.convert_to_tensor(data.edge_norm), 1) num_rels = len(hg.canonical_etypes)
labels = tf.reshape(tf.convert_to_tensor(labels), (-1, )) num_of_ntype = len(hg.ntypes)
category = dataset.predict_category
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data.pop('train_mask')
test_mask = hg.nodes[category].data.pop('test_mask')
train_idx = tf.squeeze(tf.where(train_mask))
test_idx = tf.squeeze(tf.where(test_mask))
labels = hg.nodes[category].data.pop('labels')
# split dataset into train, validate, test
if args.validation:
val_idx = train_idx[:len(train_idx) // 5]
train_idx = train_idx[len(train_idx) // 5:]
else:
val_idx = train_idx
# calculate norm for each edge type and store in edge
for canonical_etype in hg.canonical_etypes:
u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
_, inverse_index, count = tf.unique_with_counts(v)
degrees = tf.gather(count, inverse_index)
norm = tf.ones(eid.shape[0]) / tf.cast(degrees, tf.float32)
norm = tf.expand_dims(norm, 1)
hg.edges[canonical_etype].data['norm'] = norm
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
if ntype == category:
category_id = i
# edge type and normalization factor
g = dgl.to_homo(hg)
# check cuda # check cuda
if args.gpu < 0: if args.gpu < 0:
...@@ -78,25 +106,32 @@ def main(args): ...@@ -78,25 +106,32 @@ def main(args):
use_cuda = False use_cuda = False
else: else:
device = "/gpu:{}".format(args.gpu) device = "/gpu:{}".format(args.gpu)
g = g.to(device)
use_cuda = True use_cuda = True
num_nodes = g.number_of_nodes()
with tf.device(device): node_ids = tf.range(num_nodes, dtype=tf.int64)
edge_norm = g.edata['norm']
edge_type = tf.cast(g.edata[dgl.ETYPE], tf.int64)
# create graph # find out the target node ids in g
g = DGLGraph() node_tids = g.ndata[dgl.NTYPE]
g.add_nodes(num_nodes) loc = (node_tids == category_id)
g.add_edges(data.edge_src, data.edge_dst) target_idx = tf.squeeze(tf.where(loc))
# since the nodes are featureless, the input feature is then the node id.
feats = tf.range(num_nodes, dtype=tf.int64)
with tf.device(device):
# create model # create model
model = EntityClassify(len(g), model = EntityClassify(num_nodes,
args.n_hidden, args.n_hidden,
num_classes, num_classes,
num_rels, num_rels,
num_bases=args.n_bases, num_bases=args.n_bases,
num_hidden_layers=args.n_layers - 2, num_hidden_layers=args.n_layers - 2,
dropout=args.dropout, dropout=args.dropout,
use_self_loop=args.use_self_loop, use_self_loop=args.use_self_loop,
use_cuda=use_cuda) use_cuda=use_cuda)
# optimizer # optimizer
optimizer = tf.keras.optimizers.Adam( optimizer = tf.keras.optimizers.Adam(
...@@ -111,9 +146,10 @@ def main(args): ...@@ -111,9 +146,10 @@ def main(args):
t0 = time.time() t0 = time.time()
with tf.GradientTape() as tape: with tf.GradientTape() as tape:
logits = model(g, feats, edge_type, edge_norm) logits = model(g, feats, edge_type, edge_norm)
logits = tf.gather(logits, target_idx)
loss = loss_fcn(tf.gather(labels, train_idx), tf.gather(logits, train_idx)) loss = loss_fcn(tf.gather(labels, train_idx), tf.gather(logits, train_idx))
# Manually Weight Decay # Manually Weight Decay
# We found Tensorflow has a different implementation on weight decay # We found Tensorflow has a different implementation on weight decay
# of Adam(W) optimizer with PyTorch. And this results in worse results. # of Adam(W) optimizer with PyTorch. And this results in worse results.
# Manually adding weights to the loss to do weight decay solves this problem. # Manually adding weights to the loss to do weight decay solves this problem.
for weight in model.trainable_weights: for weight in model.trainable_weights:
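The comment above explains why the TensorFlow example adds weight decay to the loss by hand instead of relying on the optimizer. A minimal sketch of that idea (the helper and coefficient name are illustrative, not part of this patch):

```python
import tensorflow as tf

def add_weight_decay(loss, trainable_weights, l2_coef=5e-4):
    # Hedged sketch: emulate weight decay by adding an explicit L2 penalty,
    # since Adam's built-in decay behaves differently here than in PyTorch.
    for w in trainable_weights:
        loss = loss + l2_coef * tf.nn.l2_loss(w)
    return loss
```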
...@@ -136,6 +172,7 @@ def main(args): ...@@ -136,6 +172,7 @@ def main(args):
print() print()
logits = model(g, feats, edge_type, edge_norm) logits = model(g, feats, edge_type, edge_norm)
logits = tf.gather(logits, target_idx)
test_loss = loss_fcn(tf.gather(labels, test_idx), tf.gather(logits, test_idx)) test_loss = loss_fcn(tf.gather(labels, test_idx), tf.gather(logits, test_idx))
test_acc = acc(logits, labels, test_idx) test_acc = acc(logits, labels, test_idx)
print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.numpy().item())) print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.numpy().item()))
...@@ -165,8 +202,6 @@ if __name__ == '__main__': ...@@ -165,8 +202,6 @@ if __name__ == '__main__':
help="dataset to use") help="dataset to use")
parser.add_argument("--l2norm", type=float, default=0, parser.add_argument("--l2norm", type=float, default=0,
help="l2 norm coef") help="l2 norm coef")
parser.add_argument("--relabel", default=False, action='store_true',
help="remove untouched nodes and relabel")
parser.add_argument("--use-self-loop", default=False, action='store_true', parser.add_argument("--use-self-loop", default=False, action='store_true',
help="include self feature as a special relation") help="include self feature as a special relation")
fp = parser.add_mutually_exclusive_group(required=False) fp = parser.add_mutually_exclusive_group(required=False)
......
@@ -4,6 +4,7 @@
 from __future__ import absolute_import

 import os, sys
+import abc

 from .utils import download, extract_archive, get_download_dir, makedirs
 from ..utils import retry_method_with_fix
@@ -37,7 +38,7 @@ class DGLDataset(object):
         Default: ~/.dgl/
     save_dir : str
         Directory to save the processed dataset.
-        Default: ~/.dgl/
+        Default: same as raw_dir
     force_reload : bool
         Whether to reload the dataset. Default: False
     verbose : bool
@@ -190,14 +191,16 @@
         """
         return self._verbose

+    @abc.abstractmethod
     def __getitem__(self, idx):
         r"""Gets the data object at index.
         """
-        raise NotImplementedError
+        pass

+    @abc.abstractmethod
     def __len__(self):
         r"""The number of examples in the dataset."""
-        raise NotImplementedError
+        pass

 class DGLBuiltinDataset(DGLDataset):
     r"""The Basic DGL Builtin Dataset.
......
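For reference, the template that DGLBuiltinDataset (and the RDF datasets below) now follow is the abstract DGLDataset interface above: subclasses implement process(), __getitem__() and __len__(), and may additionally override has_cache()/save()/load() for caching. A minimal, hypothetical subclass (the class name and graph contents are made up; only the method contract comes from the patch):

```python
import dgl
from dgl.data import DGLDataset  # assumption: DGLDataset is re-exported by dgl.data

class MyToyDataset(DGLDataset):
    """Hypothetical example following the DGLDataset template."""
    def __init__(self, raw_dir=None, force_reload=False, verbose=True):
        super(MyToyDataset, self).__init__(name='my_toy',
                                           raw_dir=raw_dir,
                                           force_reload=force_reload,
                                           verbose=verbose)

    def process(self):
        # Build one small graph; a real dataset would parse its raw files here.
        self._g = dgl.graph(([0, 1, 2], [1, 2, 0]))

    def __getitem__(self, idx):
        return self._g

    def __len__(self):
        return 1
```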
"""RDF datasets """RDF datasets
Datasets from "A Collection of Benchmark Datasets for Datasets from "A Collection of Benchmark Datasets for
Systematic Evaluations of Machine Learning on Systematic Evaluations of Machine Learning on
the Semantic Web" the Semantic Web"
...@@ -7,19 +6,20 @@ the Semantic Web" ...@@ -7,19 +6,20 @@ the Semantic Web"
import os import os
from collections import OrderedDict from collections import OrderedDict
import itertools import itertools
import rdflib as rdf
import abc import abc
import re import re
import rdflib as rdf
import networkx as nx import networkx as nx
import numpy as np import numpy as np
import dgl import dgl
import dgl.backend as F import dgl.backend as F
from .utils import download, extract_archive, get_download_dir, _get_dgl_url from .dgl_dataset import DGLBuiltinDataset
from ..utils import retry_method_with_fix from .utils import save_graphs, load_graphs, save_info, load_info, _get_dgl_url
from .utils import generate_mask_tensor, idx2mask, deprecate_property, deprecate_class
__all__ = ['AIFB', 'MUTAG', 'BGS', 'AM'] __all__ = ['AIFB', 'MUTAG', 'BGS', 'AM', 'AIFBDataset', 'MUTAGDataset', 'BGSDataset', 'AMDataset']
# Dictionary for renaming reserved node/edge type names to the ones # Dictionary for renaming reserved node/edge type names to the ones
# that are allowed by nn.Module. # that are allowed by nn.Module.
...@@ -30,7 +30,6 @@ RENAME_DICT = { ...@@ -30,7 +30,6 @@ RENAME_DICT = {
class Entity: class Entity:
"""Class for entities """Class for entities
Parameters Parameters
---------- ----------
id : str id : str
...@@ -38,8 +37,8 @@ class Entity: ...@@ -38,8 +37,8 @@ class Entity:
cls : str cls : str
Type of this entity Type of this entity
""" """
def __init__(self, id, cls): def __init__(self, e_id, cls):
self.id = id self.id = e_id
self.cls = cls self.cls = cls
def __str__(self): def __str__(self):
...@@ -47,7 +46,6 @@ class Entity: ...@@ -47,7 +46,6 @@ class Entity:
class Relation: class Relation:
"""Class for relations """Class for relations
Parameters Parameters
---------- ----------
cls : str cls : str
...@@ -59,7 +57,7 @@ class Relation: ...@@ -59,7 +57,7 @@ class Relation:
def __str__(self): def __str__(self):
return str(self.cls) return str(self.cls)
class RDFGraphDataset: class RDFGraphDataset(DGLBuiltinDataset):
"""Base graph dataset class from RDF tuples. """Base graph dataset class from RDF tuples.
To derive from this, implement the following abstract methods: To derive from this, implement the following abstract methods:
...@@ -68,10 +66,8 @@ class RDFGraphDataset: ...@@ -68,10 +66,8 @@ class RDFGraphDataset:
* ``process_tuple`` * ``process_tuple``
* ``process_idx_file_line`` * ``process_idx_file_line``
* ``predict_category`` * ``predict_category``
Preprocessed graph and other data will be cached in the download folder
to speed up data loading.
The dataset should contain a "trainingSet.tsv" and a "testSet.tsv" file
for training and testing samples.
@@ -92,50 +88,57 @@
     Parameters
     ----------
-    url : str or path
-        URL to download the raw dataset.
     name : str
         Name of the dataset
-    force_reload : bool, optional
-        If true, force load and process from raw data. Ignore cached pre-processed data.
+    url : str or path
+        URL to download the raw dataset.
+    predict_category : str
+        Predict category.
     print_every : int, optional
-        Log for every X tuples.
+        Preprocessing log for every X tuples.
     insert_reverse : bool, optional
         If true, add reverse edge and reverse relations to the final graph.
+    raw_dir : str
+        Raw file directory to download/contains the input data directory.
+        Default: ~/.dgl/
+    force_reload : bool, optional
+        If true, force load and process from raw data. Ignore cached pre-processed data.
+    verbose: bool
+        Whether to print out progress information. Default: True.
     """
def __init__(self, url, name, def __init__(self, name, url, predict_category,
force_reload=False,
print_every=10000, print_every=10000,
insert_reverse=True): insert_reverse=True,
download_dir = get_download_dir() raw_dir=None,
zip_file_path = os.path.join(download_dir, '{}.zip'.format(name)) force_reload=False,
self._dir = os.path.join(download_dir, name) verbose=True):
self._url = url
self._zip_file_path = zip_file_path
self._load(print_every, insert_reverse, force_reload)
def _download(self):
download(self._url, path=self._zip_file_path)
extract_archive(self._zip_file_path, self._dir)
@retry_method_with_fix(_download)
def _load(self, print_every, insert_reverse, force_reload):
self._print_every = print_every
self._insert_reverse = insert_reverse self._insert_reverse = insert_reverse
if not force_reload and self.has_cache(): self._print_every = print_every
print('Found cached graph. Load cache ...') self._predict_category = predict_category
self.load_cache()
else: super(RDFGraphDataset, self).__init__(name, url,
raw_tuples = self.load_raw_tuples() raw_dir=raw_dir,
self.process_raw_tuples(raw_tuples) force_reload=force_reload,
print('#Training samples:', len(self.train_idx)) verbose=verbose)
print('#Testing samples:', len(self.test_idx))
print('#Classes:', self.num_classes) def process(self):
print('Predict category:', self.predict_category) raw_tuples = self.load_raw_tuples(self.raw_path)
self.process_raw_tuples(raw_tuples, self.raw_path)
def load_raw_tuples(self):
def load_raw_tuples(self, root_path):
"""Loading raw RDF dataset
Parameters
----------
root_path : str
Root path containing the data
Returns
-------
Loaded rdf data
"""
raw_rdf_graphs = [] raw_rdf_graphs = []
for i, filename in enumerate(os.listdir(self._dir)): for _, filename in enumerate(os.listdir(root_path)):
fmt = None fmt = None
if filename.endswith('nt'): if filename.endswith('nt'):
fmt = 'nt' fmt = 'nt'
...@@ -145,11 +148,20 @@ class RDFGraphDataset: ...@@ -145,11 +148,20 @@ class RDFGraphDataset:
continue continue
g = rdf.Graph() g = rdf.Graph()
print('Parsing file %s ...' % filename) print('Parsing file %s ...' % filename)
g.parse(os.path.join(self._dir, filename), format=fmt) g.parse(os.path.join(root_path, filename), format=fmt)
raw_rdf_graphs.append(g) raw_rdf_graphs.append(g)
return itertools.chain(*raw_rdf_graphs) return itertools.chain(*raw_rdf_graphs)
def process_raw_tuples(self, raw_tuples): def process_raw_tuples(self, raw_tuples, root_path):
"""Processing raw RDF dataset
Parameters
----------
raw_tuples:
Raw rdf tuples
root_path: str
Root path containing the data
"""
mg = nx.MultiDiGraph() mg = nx.MultiDiGraph()
ent_classes = OrderedDict() ent_classes = OrderedDict()
rel_classes = OrderedDict() rel_classes = OrderedDict()
...@@ -164,7 +176,7 @@ class RDFGraphDataset: ...@@ -164,7 +176,7 @@ class RDFGraphDataset:
sorted_tuples.sort() sorted_tuples.sort()
for i, (sbj, pred, obj) in enumerate(sorted_tuples): for i, (sbj, pred, obj) in enumerate(sorted_tuples):
if i % self._print_every == 0: if self.verbose and i % self._print_every == 0:
print('Processed %d tuples, found %d valid tuples.' % (i, len(src))) print('Processed %d tuples, found %d valid tuples.' % (i, len(src)))
sbjent = self.parse_entity(sbj) sbjent = self.parse_entity(sbj)
rel = self.parse_relation(pred) rel = self.parse_relation(pred)
...@@ -200,7 +212,8 @@ class RDFGraphDataset: ...@@ -200,7 +212,8 @@ class RDFGraphDataset:
# add reverse edge with reverse relation # add reverse edge with reverse relation
if self._insert_reverse: if self._insert_reverse:
print('Adding reverse edges ...') if self.verbose:
print('Adding reverse edges ...')
newsrc = np.hstack([src, dst]) newsrc = np.hstack([src, dst])
newdst = np.hstack([dst, src]) newdst = np.hstack([dst, src])
src = newsrc src = newsrc
...@@ -208,28 +221,68 @@ class RDFGraphDataset: ...@@ -208,28 +221,68 @@ class RDFGraphDataset:
etid = np.hstack([etid, etid + len(etypes)]) etid = np.hstack([etid, etid + len(etypes)])
etypes.extend(['rev-%s' % t for t in etypes]) etypes.extend(['rev-%s' % t for t in etypes])
self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes) hg = self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes)
print('Load training/validation/testing split ...') if self.verbose:
idmap = F.asnumpy(self.graph.nodes[self.predict_category].data[dgl.NID]) print('Load training/validation/testing split ...')
idmap = F.asnumpy(hg.nodes[self.predict_category].data[dgl.NID])
glb2lcl = {glbid : lclid for lclid, glbid in enumerate(idmap)} glb2lcl = {glbid : lclid for lclid, glbid in enumerate(idmap)}
def findidfn(ent): def findidfn(ent):
if ent not in entities: if ent not in entities:
return None return None
else: else:
return glb2lcl[entities[ent]] return glb2lcl[entities[ent]]
self.load_data_split(findidfn) self._hg = hg
train_idx, test_idx, labels, num_classes = self.load_data_split(findidfn, root_path)
train_mask = idx2mask(train_idx, self._hg.number_of_nodes(self.predict_category))
test_mask = idx2mask(test_idx, self._hg.number_of_nodes(self.predict_category))
labels = F.tensor(labels, F.data_type_dict['int64'])
train_mask = generate_mask_tensor(train_mask)
test_mask = generate_mask_tensor(test_mask)
self._hg.nodes[self.predict_category].data['train_mask'] = train_mask
self._hg.nodes[self.predict_category].data['test_mask'] = test_mask
self._hg.nodes[self.predict_category].data['labels'] = labels
self._num_classes = num_classes
self.save_cache(mg, src, dst, ntid, etid, ntypes, etypes) # save for compatability
self._train_idx = F.tensor(train_idx)
self._test_idx = F.tensor(test_idx)
def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes): def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes):
"""Build the graphs
Parameters
----------
mg: MultiDiGraph
Input graph
src: Numpy array
Source nodes
dst: Numpy array
Destination nodes
ntid: Numpy array
Node types for each node
etid: Numpy array
Edge types for each edge
ntypes: list
Node types
etypes: list
Edge types
Returns
-------
g: DGLGraph
"""
# create homo graph # create homo graph
print('Creating one whole graph ...') if self.verbose:
print('Creating one whole graph ...')
g = dgl.graph((src, dst)) g = dgl.graph((src, dst))
g.ndata[dgl.NTYPE] = F.tensor(ntid) g.ndata[dgl.NTYPE] = F.tensor(ntid)
g.edata[dgl.ETYPE] = F.tensor(etid) g.edata[dgl.ETYPE] = F.tensor(etid)
print('Total #nodes:', g.number_of_nodes()) if self.verbose:
print('Total #edges:', g.number_of_edges()) print('Total #nodes:', g.number_of_nodes())
print('Total #edges:', g.number_of_edges())
# rename names such as 'type' so that they can be used as keys
# to nn.ModuleDict
...@@ -240,71 +293,72 @@ class RDFGraphDataset: ...@@ -240,71 +293,72 @@ class RDFGraphDataset:
mg.add_edge(sty, dty, key=RENAME_DICT.get(ety, ety)) mg.add_edge(sty, dty, key=RENAME_DICT.get(ety, ety))
# convert to heterograph # convert to heterograph
print('Convert to heterograph ...') if self.verbose:
print('Convert to heterograph ...')
hg = dgl.to_hetero(g, hg = dgl.to_hetero(g,
ntypes, ntypes,
etypes, etypes,
metagraph=mg) metagraph=mg)
print('#Node types:', len(hg.ntypes)) if self.verbose:
print('#Canonical edge types:', len(hg.etypes)) print('#Node types:', len(hg.ntypes))
print('#Unique edge type names:', len(set(hg.etypes))) print('#Canonical edge types:', len(hg.etypes))
self.graph = hg print('#Unique edge type names:', len(set(hg.etypes)))
return hg
def save_cache(self, mg, src, dst, ntid, etid, ntypes, etypes):
nx.write_gpickle(mg, os.path.join(self._dir, 'cached_mg.gpickle'))
np.save(os.path.join(self._dir, 'cached_src.npy'), src)
np.save(os.path.join(self._dir, 'cached_dst.npy'), dst)
np.save(os.path.join(self._dir, 'cached_ntid.npy'), ntid)
np.save(os.path.join(self._dir, 'cached_etid.npy'), etid)
save_strlist(os.path.join(self._dir, 'cached_ntypes.txt'), ntypes)
save_strlist(os.path.join(self._dir, 'cached_etypes.txt'), etypes)
np.save(os.path.join(self._dir, 'cached_train_idx.npy'), F.asnumpy(self.train_idx))
np.save(os.path.join(self._dir, 'cached_test_idx.npy'), F.asnumpy(self.test_idx))
np.save(os.path.join(self._dir, 'cached_labels.npy'), F.asnumpy(self.labels))
def has_cache(self): def load_data_split(self, ent2id, root_path):
return (os.path.exists(os.path.join(self._dir, 'cached_mg.gpickle')) """Load data split
and os.path.exists(os.path.join(self._dir, 'cached_src.npy'))
and os.path.exists(os.path.join(self._dir, 'cached_dst.npy')) Parameters
and os.path.exists(os.path.join(self._dir, 'cached_ntid.npy')) ----------
and os.path.exists(os.path.join(self._dir, 'cached_etid.npy')) ent2id: func
and os.path.exists(os.path.join(self._dir, 'cached_ntypes.txt')) A function mapping entity to id
and os.path.exists(os.path.join(self._dir, 'cached_etypes.txt')) root_path: str
and os.path.exists(os.path.join(self._dir, 'cached_train_idx.npy')) Root path containing the data
and os.path.exists(os.path.join(self._dir, 'cached_test_idx.npy'))
and os.path.exists(os.path.join(self._dir, 'cached_labels.npy'))) Return
------
def load_cache(self): train_idx: Numpy array
mg = nx.read_gpickle(os.path.join(self._dir, 'cached_mg.gpickle')) Training set
src = np.load(os.path.join(self._dir, 'cached_src.npy')) test_idx: Numpy array
dst = np.load(os.path.join(self._dir, 'cached_dst.npy')) Testing set
ntid = np.load(os.path.join(self._dir, 'cached_ntid.npy')) labels: Numpy array
etid = np.load(os.path.join(self._dir, 'cached_etid.npy')) Labels
ntypes = load_strlist(os.path.join(self._dir, 'cached_ntypes.txt')) num_classes: int
etypes = load_strlist(os.path.join(self._dir, 'cached_etypes.txt')) Number of classes
self.train_idx = F.tensor(np.load(os.path.join(self._dir, 'cached_train_idx.npy'))) """
self.test_idx = F.tensor(np.load(os.path.join(self._dir, 'cached_test_idx.npy')))
labels = np.load(os.path.join(self._dir, 'cached_labels.npy'))
self.num_classes = labels.max() + 1
self.labels = F.tensor(labels)
self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes)
def load_data_split(self, ent2id):
label_dict = {} label_dict = {}
labels = np.zeros((self.graph.number_of_nodes(self.predict_category),)) - 1 labels = np.zeros((self._hg.number_of_nodes(self.predict_category),)) - 1
train_idx = self.parse_idx_file( train_idx = self.parse_idx_file(
os.path.join(self._dir, 'trainingSet.tsv'), os.path.join(root_path, 'trainingSet.tsv'),
ent2id, label_dict, labels) ent2id, label_dict, labels)
test_idx = self.parse_idx_file( test_idx = self.parse_idx_file(
os.path.join(self._dir, 'testSet.tsv'), os.path.join(root_path, 'testSet.tsv'),
ent2id, label_dict, labels) ent2id, label_dict, labels)
self.train_idx = F.tensor(train_idx) train_idx = np.array(train_idx)
self.test_idx = F.tensor(test_idx) test_idx = np.array(test_idx)
self.labels = F.tensor(labels).long() labels = np.array(labels)
self.num_classes = len(label_dict) num_classes = len(label_dict)
return train_idx, test_idx, labels, num_classes
def parse_idx_file(self, filename, ent2id, label_dict, labels): def parse_idx_file(self, filename, ent2id, label_dict, labels):
"""Parse idx files
Parameters
----------
filename: str
File to parse
ent2id: func
A function mapping entity to id
label_dict: dict
Map label to label id
labels: dict
Map entity id to label id
Return
------
idx: list
Entity ids
"""
idx = [] idx = []
with open(filename, 'r') as f: with open(filename, 'r') as f:
for i, line in enumerate(f): for i, line in enumerate(f):
...@@ -322,18 +376,100 @@ class RDFGraphDataset: ...@@ -322,18 +376,100 @@ class RDFGraphDataset:
labels[entid] = lblid labels[entid] = lblid
return idx return idx
def has_cache(self):
"""check if there is a processed data"""
graph_path = os.path.join(self.save_path,
self.save_name + '.bin')
info_path = os.path.join(self.save_path,
self.save_name + '.pkl')
if os.path.exists(graph_path) and \
os.path.exists(info_path):
return True
return False
def save(self):
"""save the graph list and the labels"""
graph_path = os.path.join(self.save_path,
self.save_name + '.bin')
info_path = os.path.join(self.save_path,
self.save_name + '.pkl')
save_graphs(str(graph_path), self._hg)
save_info(str(info_path), {'num_classes': self.num_classes,
'predict_category': self.predict_category})
def load(self):
"""load the graph list and the labels from disk"""
graph_path = os.path.join(self.save_path,
self.save_name + '.bin')
info_path = os.path.join(self.save_path,
self.save_name + '.pkl')
graphs, _ = load_graphs(str(graph_path))
info = load_info(str(info_path))
self._num_classes = info['num_classes']
self._predict_category = info['predict_category']
self._hg = graphs[0]
train_mask = self._hg.nodes[self.predict_category].data['train_mask']
test_mask = self._hg.nodes[self.predict_category].data['test_mask']
self._labels = self._hg.nodes[self.predict_category].data['labels']
train_idx = F.nonzero_1d(train_mask)
test_idx = F.nonzero_1d(test_mask)
self._train_idx = train_idx
self._test_idx = test_idx
def __getitem__(self, idx):
r"""Gets the graph object
"""
g = self._hg
return g
def __len__(self):
r"""The number of examples in the dataset."""
return 1
@property
def save_name(self):
return self.name + '_dgl_graph'
@property
def graph(self):
deprecate_property('dataset.graph', 'hg = dataset[0]')
return self._hg
@property
def predict_category(self):
return self._predict_category
@property
def num_classes(self):
return self._num_classes
@property
def train_idx(self):
deprecate_property('dataset.train_idx', 'train_mask = g.ndata[\'train_mask\']')
return self._train_idx
@property
def test_idx(self):
deprecate_property('dataset.test_idx', 'test_mask = g.ndata[\'test_mask\']')
return self._test_idx
@property
def labels(self):
deprecate_property('dataset.labels', 'labels = g.ndata[\'labels\']')
return self._labels
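Downstream code that previously relied on the deprecated train_idx/test_idx/labels properties can recover the same arrays from the node masks. A hedged sketch, assuming the PyTorch backend (where node data tensors expose `.numpy()`); `dataset` stands for any of the RDF datasets defined below.

```
import numpy as np

def masks_to_indices(dataset):
    # Mirrors what load() does with F.nonzero_1d, but in plain NumPy.
    hg = dataset[0]
    category = dataset.predict_category
    train_mask = hg.nodes[category].data['train_mask'].numpy().astype(bool)
    test_mask = hg.nodes[category].data['test_mask'].numpy().astype(bool)
    labels = hg.nodes[category].data['labels'].numpy()
    return np.nonzero(train_mask)[0], np.nonzero(test_mask)[0], labels
```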
@abc.abstractmethod
def parse_entity(self, term):
"""Parse one entity from an RDF term.
Return None if the term does not represent a valid entity and the
whole tuple should be ignored.
Parameters
----------
term : rdflib.term.Identifier
RDF term
Returns
-------
Entity or None
...@@ -344,15 +480,12 @@ class RDFGraphDataset:
@abc.abstractmethod
def parse_relation(self, term):
"""Parse one relation from an RDF term.
Return None if the term does not represent a valid relation and the
whole tuple should be ignored.
Parameters
----------
term : rdflib.term.Identifier
RDF term
Returns
-------
Relation or None
...@@ -363,10 +496,9 @@ class RDFGraphDataset:
@abc.abstractmethod
def process_tuple(self, raw_tuple, sbj, rel, obj):
"""Process the tuple.
Return an (Entity, Relation, Entity) tuple as the final tuple.
Return None if the tuple should be ignored.
Parameters
----------
raw_tuple : tuple of rdflib.term.Identifier
...@@ -377,7 +509,6 @@ class RDFGraphDataset:
Relation
obj : Entity
Object entity
Returns
-------
(Entity, Relation, Entity)
...@@ -388,12 +519,10 @@ class RDFGraphDataset: ...@@ -388,12 +519,10 @@ class RDFGraphDataset:
@abc.abstractmethod
def process_idx_file_line(self, line):
"""Process one line of ``trainingSet.tsv`` or ``testSet.tsv``.
Parameters
----------
line : str
One line of the file
Returns
-------
(str, str)
...@@ -401,12 +530,6 @@
"""
pass
@property
@abc.abstractmethod
def predict_category(self):
"""Return the category name that has labels."""
pass
def _get_id(dict, key):
id = dict.get(key, None)
if id is None:
...@@ -414,25 +537,54 @@ def _get_id(dict, key):
dict[key] = id
return id
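`_get_id` assigns integer ids to entities and relations on first sight (the body of the branch is elided in this hunk). An independent sketch of the idiom, with the dense-id assumption made explicit:

```
# Not the elided library code: an illustration of the get-or-assign-id idiom.
def get_id(mapping, key):
    idx = mapping.get(key)
    if idx is None:
        idx = len(mapping)   # assumption: ids are handed out densely, 0, 1, 2, ...
        mapping[key] = idx
    return idx

ent2id = {}
print([get_id(ent2id, e) for e in ['a', 'b', 'a', 'c']])   # [0, 1, 0, 2]
```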
class AIFBDataset(RDFGraphDataset):
r"""AIFB dataset.
AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in
data mining. It records the organizational structure of AIFB at the
University of Karlsruhe.
Statistics
----------
Nodes: 7262
Edges: 48810 (including reverse edges)
Target Category: Personen
Number of Classes: 4
Label Split: Train: 140, Test: 36
Parameters
-----------
print_every: int
Preprocessing log for every X tuples. Default: 10000.
insert_reverse: bool
If true, add reverse edge and reverse relations to the final graph. Default: True.
raw_dir : str
Directory to download and store the raw data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Returns
-------
AIFBDataset object with three properties:
graph: A heterogeneous graph containing the
graph structure, node features and labels.
- ndata['train_mask']: mask for the training node set
- ndata['test_mask']: mask for the testing node set
- ndata['labels']: node labels
predict_category: The category name on which to run the node
classification prediction.
num_classes: number of classes for the node
classification task.
Examples
--------
>>> dataset = dgl.data.rdf.AIFBDataset()
>>> graph = dataset[0]
>>> category = dataset.predict_category
>>> num_classes = dataset.num_classes
>>>
>>> train_mask = graph.nodes[category].data.pop('train_mask')
>>> test_mask = graph.nodes[category].data.pop('test_mask')
>>> labels = graph.nodes[category].data.pop('labels')
"""
employs = rdf.term.URIRef("http://swrc.ontoware.org/ontology#employs")
...@@ -441,15 +593,20 @@ class AIFB(RDFGraphDataset):
relation_prefix = 'http://swrc.ontoware.org/'
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
url = _get_dgl_url('dataset/rdf/aifb-hetero.zip')
name = 'aifb-hetero'
predict_category = 'Personen'
super(AIFBDataset, self).__init__(name, url, predict_category,
print_every=print_every,
insert_reverse=insert_reverse,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
...@@ -482,17 +639,68 @@ class AIFB(RDFGraphDataset):
person, _, label = line.strip().split('\t')
return person, label
class AIFB(AIFBDataset):
"""AIFB dataset. Same as AIFBDataset.
"""
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
deprecate_class('AIFB', 'AIFBDataset')
super(AIFB, self).__init__(print_every,
insert_reverse,
raw_dir,
force_reload,
verbose)
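A hedged migration sketch: the AIFB alias above still constructs the dataset but emits a deprecation warning, so new code should prefer AIFBDataset and read the splits from node masks. The printed summary assumes the PyTorch backend.

```
from dgl.data.rdf import AIFB, AIFBDataset

old = AIFB()                 # still works, warns "Class AIFB will be deprecated ..."
dataset = AIFBDataset()      # preferred spelling after this change
hg = dataset[0]
category = dataset.predict_category
train_mask = hg.nodes[category].data['train_mask']
print(dataset.num_classes, int(train_mask.sum()))
```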
class MUTAGDataset(RDFGraphDataset):
r"""MUTAG dataset.
Statistics
----------
Nodes: 27163
Edges: 148100 (including reverse edges)
Target Category: d
Number of Classes: 2
Label Split: Train: 272, Test: 68
Parameters
-----------
print_every: int
Preprocessing log for every X tuples. Default: 10000.
insert_reverse: bool
If true, add reverse edge and reverse relations to the final graph. Default: True.
raw_dir : str
Directory to download and store the raw data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Returns
-------
MUTAGDataset object with three properties:
graph: A heterogeneous graph containing the
graph structure, node features and labels.
- ndata['train_mask']: mask for the training node set
- ndata['test_mask']: mask for the testing node set
- ndata['labels']: node labels
predict_category: The category name on which to run the node
classification prediction.
num_classes: number of classes for the node
classification task.
Examples
--------
>>> dataset = dgl.data.rdf.MUTAGDataset()
>>> graph = dataset[0]
>>> category = dataset.predict_category
>>> num_classes = dataset.num_classes
>>>
>>> train_mask = graph.nodes[category].data.pop('train_mask')
>>> test_mask = graph.nodes[category].data.pop('test_mask')
>>> labels = graph.nodes[category].data.pop('labels')
"""
d_entity = re.compile("d[0-9]")
...@@ -507,15 +715,20 @@ class MUTAG(RDFGraphDataset):
relation_prefix = entity_prefix
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
url = _get_dgl_url('dataset/rdf/mutag-hetero.zip')
name = 'mutag-hetero'
predict_category = 'd'
super(MUTAGDataset, self).__init__(name, url, predict_category,
print_every=print_every,
insert_reverse=insert_reverse,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
...@@ -558,31 +771,78 @@ class MUTAG(RDFGraphDataset):
obj.cls = rel.cls
assert sbj.cls is not None and obj.cls is not None
return (sbj, rel, obj)
def process_idx_file_line(self, line):
bond, _, label = line.strip().split('\t')
return bond, label
class MUTAG(MUTAGDataset):
"""MUTAG dataset. Same as MUTAGDataset.
"""
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
deprecate_class('MUTAG', 'MUTAGDataset')
super(MUTAG, self).__init__(print_every,
insert_reverse,
raw_dir,
force_reload,
verbose)
class BGSDataset(RDFGraphDataset):
"""BGS dataset. """BGS dataset.
BGS namespace convention: BGS namespace convention:
http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE
We ignored all literal nodes and the relations connecting them in the We ignored all literal nodes and the relations connecting them in the
output graph. We also ignored the relation used to mark whether a output graph. We also ignored the relation used to mark whether a
term is CURRENT or DEPRECATED. term is CURRENT or DEPRECATED.
Statistics
----------
Nodes: 94806
Edges: 672884 (including reverse edges)
Target Category: Lexicon/NamedRockUnit
Number of Classes: 2
Label Split: Train: 117, Test: 29
Parameters
-----------
print_every: int
Preprocessing log for every X tuples. Default: 10000.
insert_reverse: bool
If true, add reverse edge and reverse relations to the final graph. Default: True.
raw_dir : str
Directory to download and store the raw data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Returns
-------
BGSDataset object with three properties:
graph: A heterogeneous graph containing the
graph structure, node features and labels.
- ndata['train_mask']: mask for the training node set
- ndata['test_mask']: mask for the testing node set
- ndata['labels']: node labels
predict_category: The category name on which to run the node
classification prediction.
num_classes: number of classes for the node
classification task.
Examples
--------
>>> dataset = dgl.data.rdf.BGSDataset()
>>> graph = dataset[0]
>>> category = dataset.predict_category
>>> num_classes = dataset.num_classes
>>>
>>> train_mask = graph.nodes[category].data.pop('train_mask')
>>> test_mask = graph.nodes[category].data.pop('test_mask')
>>> labels = graph.nodes[category].data.pop('labels')
"""
lith = rdf.term.URIRef("http://data.bgs.ac.uk/ref/Lexicon/hasLithogenesis")
...@@ -591,15 +851,20 @@ class BGS(RDFGraphDataset):
relation_prefix = 'http://data.bgs.ac.uk/ref'
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
url = _get_dgl_url('dataset/rdf/bgs-hetero.zip')
name = 'bgs-hetero'
predict_category = 'Lexicon/NamedRockUnit'
super(BGSDataset, self).__init__(name, url, predict_category,
print_every=print_every,
insert_reverse=insert_reverse,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
...@@ -644,24 +909,76 @@ class BGS(RDFGraphDataset):
_, rock, label = line.strip().split('\t')
return rock, label
class BGS(BGSDataset):
"""BGS dataset. Same as BGSDataset.
"""
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
deprecate_class('BGS', 'BGSDataset')
super(BGS, self).__init__(print_every,
insert_reverse,
raw_dir,
force_reload,
verbose)
class AMDataset(RDFGraphDataset):
"""AM dataset.
Namespace convention:
Instance: http://purl.org/collections/nl/am/<type>-<id>
Relation: http://purl.org/collections/nl/am/<name>
We ignored all literal nodes and the relations connecting them in the
output graph.
Statistics
----------
Nodes: 881680
Edges: 5668682 (including reverse edges)
Target Category: proxy
Number of Classes: 11
Label Split: Train: 802, Test: 198
Parameters
-----------
print_every: int
Preprocessing log for every X tuples. Default: 10000.
insert_reverse: bool
If true, add reverse edge and reverse relations to the final graph. Default: True.
raw_dir : str
Directory to download and store the raw data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Returns
-------
AMDataset object with three properties:
graph: A heterogeneous graph containing the
graph structure, node features and labels.
- ndata['train_mask']: mask for the training node set
- ndata['test_mask']: mask for the testing node set
- ndata['labels']: node labels
predict_category: The category name on which to run the node
classification prediction.
num_classes: number of classes for the node
classification task.
Examples
--------
>>> dataset = dgl.data.rdf.AMDataset()
>>> graph = dataset[0]
>>> category = dataset.predict_category
>>> num_classes = dataset.num_classes
>>>
>>> train_mask = graph.nodes[category].data.pop('train_mask')
>>> test_mask = graph.nodes[category].data.pop('test_mask')
>>> labels = graph.nodes[category].data.pop('labels')
"""
objectCategory = rdf.term.URIRef("http://purl.org/collections/nl/am/objectCategory")
...@@ -670,15 +987,20 @@ class AM(RDFGraphDataset):
relation_prefix = entity_prefix
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
url = _get_dgl_url('dataset/rdf/am-hetero.zip')
name = 'am-hetero'
predict_category = 'proxy'
super(AMDataset, self).__init__(name, url, predict_category,
print_every=print_every,
insert_reverse=insert_reverse,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
...@@ -722,9 +1044,21 @@ class AM(RDFGraphDataset):
proxy, _, label = line.strip().split('\t')
return proxy, label
class AM(AMDataset):
"""AM dataset. Same as AMDataset.
"""
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
deprecate_class('AM', 'AMDataset')
super(AM, self).__init__(print_every,
insert_reverse,
raw_dir,
force_reload,
verbose)
if __name__ == '__main__':
dataset = AIFB()
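To connect this file to the entity-classification use case, here is a rough end-to-end sketch of consuming one of the datasets: flatten the heterograph for an RGCN-style model while keeping track of which homogeneous nodes belong to the target category. It assumes `dgl.to_homogeneous` and the `dgl.NTYPE` field are available in the installed DGL and that the backend is PyTorch; it is illustrative, not the example script shipped with this change.

```
import torch
import dgl
from dgl.data.rdf import AIFBDataset

dataset = AIFBDataset()
hg = dataset[0]
category = dataset.predict_category

num_rels = len(hg.canonical_etypes)
category_id = hg.ntypes.index(category)

g = dgl.to_homogeneous(hg)                    # dgl.to_homo in older releases
node_types = g.ndata[dgl.NTYPE]
# positions of the target-category nodes inside the homogeneous graph
target_nodes = torch.nonzero(node_types == category_id, as_tuple=False).squeeze(1)

labels = hg.nodes[category].data['labels']
train_mask = hg.nodes[category].data['train_mask'].bool()
print(g.number_of_nodes(), num_rels, dataset.num_classes,
      int(train_mask.sum()), tuple(labels.shape))
```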
...@@ -5,13 +5,16 @@ import os
import sys
import hashlib
import warnings
import requests
import pickle
import errno
import numpy as np
from .graph_serialize import save_graphs, load_graphs, load_labels
from .tensor_serialize import save_tensors, load_tensors
from .. import backend as F
__all__ = ['loadtxt','download', 'check_sha1', 'extract_archive',
'get_download_dir', 'Subset', 'split_dataset',
'save_graphs', "load_graphs", "load_labels", "save_tensors", "load_tensors"]
...@@ -237,6 +240,13 @@ def get_download_dir():
os.makedirs(dirname)
return dirname
def makedirs(path):
try:
os.makedirs(os.path.expanduser(os.path.normpath(path)))
except OSError as e:
if e.errno != errno.EEXIST or not os.path.isdir(path):
raise e
def save_info(path, info):
""" Save dataset related information into disk.
...@@ -268,6 +278,39 @@ def load_info(path):
info = pickle.load(pf)
return info
def deprecate_property(old, new):
warnings.warn('Property {} will be deprecated, please use {} instead.'.format(old, new))
def deprecate_function(old, new):
warnings.warn('Function {} will be deprecated, please use {} instead.'.format(old, new))
def deprecate_class(old, new):
warnings.warn('Class {} will be deprecated, please use {} instead.'.format(old, new))
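The three helpers are thin wrappers over warnings.warn, so callers can surface or silence them with the standard warnings machinery. A quick check, assuming the helper is importable from dgl.data.utils:

```
import warnings
from dgl.data.utils import deprecate_property

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    deprecate_property('dataset.graph', 'hg = dataset[0]')

print(str(caught[0].message))
# Property dataset.graph will be deprecated, please use hg = dataset[0] instead.
```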
def idx2mask(idx, len):
"""Create mask."""
mask = np.zeros(len)
mask[idx] = 1
return mask
def generate_mask_tensor(mask):
"""Generate mask tensor according to different backend
For torch and tensorflow, it will create a bool tensor
For mxnet, it will create a float tensor
Parameters
----------
mask: numpy.ndarray
Input mask array
"""
assert isinstance(mask, np.ndarray), "input for generate_mask_tensor " \
"should be a numpy ndarray"
if F.backend_name == 'mxnet':
return F.tensor(mask, dtype=F.data_type_dict['float32'])
else:
return F.tensor(mask, dtype=F.data_type_dict['bool'])
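idx2mask and generate_mask_tensor are meant to be used together when attaching split information to a graph: indices become a 0/1 array, which then becomes a backend tensor (bool for PyTorch/TensorFlow, float32 for MXNet). A minimal sketch of the same flow, using NumPy and PyTorch directly rather than the backend abstraction:

```
import numpy as np
import torch

def idx2mask(idx, length):
    mask = np.zeros(length)
    mask[idx] = 1
    return mask

train_idx = np.array([0, 2, 5])
train_mask = torch.tensor(idx2mask(train_idx, 8), dtype=torch.bool)
print(train_mask)             # tensor([ True, False,  True, False, False,  True, False, False])
print(int(train_mask.sum()))  # 3
```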
class Subset(object):
"""Subset of a dataset at specified indices
...