Unverified Commit dbb028ac authored by xiang song(charlie.song)'s avatar xiang song(charlie.song) Committed by GitHub
Browse files

[Dataset] Builtin knowledge graph dataset (#1881)



* buildin knowledge graph dataset

* upd

* docstring

* Fix
Co-authored-by: default avatarUbuntu <ubuntu@ip-172-31-51-214.ec2.internal>
Co-authored-by: default avatarTong He <hetong007@gmail.com>
parent f4608c22
...@@ -62,9 +62,9 @@ python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 ...@@ -62,9 +62,9 @@ python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0
### Link Prediction ### Link Prediction
FB15k-237: MRR 0.151 (DGL), 0.158 (paper) FB15k-237: MRR 0.151 (DGL), 0.158 (paper)
``` ```
python3 link_predict.py -d FB15k-237 --gpu 0 --raw python3 link_predict.py -d FB15k-237 --gpu 0 --eval-protocol raw
``` ```
FB15k-237: Filtered-MRR 0.2044 FB15k-237: Filtered-MRR 0.2044
``` ```
python3 link_predict.py -d FB15k-237 --gpu 0 --filtered python3 link_predict.py -d FB15k-237 --gpu 0 --eval-protocol filtered
``` ```
...@@ -19,7 +19,7 @@ import torch ...@@ -19,7 +19,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import random import random
from dgl.contrib.data import load_data from dgl.data.knowledge_graph import load_data
from dgl.nn.pytorch import RelGraphConv from dgl.nn.pytorch import RelGraphConv
from model import BaseRGCN from model import BaseRGCN
......
...@@ -135,7 +135,7 @@ def build_graph_from_triplets(num_nodes, num_rels, triplets): ...@@ -135,7 +135,7 @@ def build_graph_from_triplets(num_nodes, num_rels, triplets):
This function also generates edge type and normalization factor This function also generates edge type and normalization factor
(reciprocal of node incoming degree) (reciprocal of node incoming degree)
""" """
g = dgl.DGLGraph() g = dgl.graph([])
g.add_nodes(num_nodes) g.add_nodes(num_nodes)
src, rel, dst = triplets src, rel, dst = triplets
src, dst = np.concatenate((src, dst)), np.concatenate((dst, src)) src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
......
from __future__ import absolute_import
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import os, sys
from .dgl_dataset import DGLBuiltinDataset
from .utils import download, extract_archive, get_download_dir
from .utils import save_graphs, load_graphs, save_info, load_info, makedirs, _get_dgl_url
from .utils import generate_mask_tensor
from .utils import deprecate_property, deprecate_function
from ..utils import retry_method_with_fix
from .. import backend as F
from ..graph import DGLGraph
from ..graph import batch as graph_batch
from ..convert import graph as dgl_graph
class KnowledgeGraphDataset(DGLBuiltinDataset):
    """KnowledgeGraph link prediction dataset

    The dataset contains a graph depicting the connectivity of a knowledge
    base. Currently, the knowledge bases from the
    `RGCN paper <https://arxiv.org/pdf/1703.06103.pdf>`_ supported are
    FB15k-237, FB15k, wn18

    Parameters
    -----------
    name: str
        Name can be 'FB15k-237', 'FB15k' or 'wn18'.
    reverse: bool
        Whether to add reverse edges. Default: True.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose: bool
        Whether to print out progress information. Default: True.
    """
    def __init__(self, name, reverse=True, raw_dir=None, force_reload=False, verbose=True):
        self._name = name
        self.reverse = reverse
        url = _get_dgl_url('dataset/') + '{}.tgz'.format(name)
        super(KnowledgeGraphDataset, self).__init__(name,
                                                    url=url,
                                                    raw_dir=raw_dir,
                                                    force_reload=force_reload,
                                                    verbose=verbose)

    def download(self):
        r"""Automatically download data and extract it."""
        tgz_path = os.path.join(self.raw_dir, self.name + '.tgz')
        download(self.url, path=tgz_path)
        extract_archive(tgz_path, self.raw_path)

    def process(self):
        """Parse the raw triplet files and build the DGLGraph.

        The original knowledge base is stored as (head, relation, tail)
        triplets; this function maps entities/relations to integer IDs and
        builds one homogeneous DGLGraph holding train/valid/test edges,
        with split membership recorded as edge masks.
        """
        root_path = self.raw_path
        entity_path = os.path.join(root_path, 'entities.dict')
        relation_path = os.path.join(root_path, 'relations.dict')
        train_path = os.path.join(root_path, 'train.txt')
        valid_path = os.path.join(root_path, 'valid.txt')
        test_path = os.path.join(root_path, 'test.txt')
        entity_dict = _read_dictionary(entity_path)
        relation_dict = _read_dictionary(relation_path)
        train = np.asarray(_read_triplets_as_list(train_path, entity_dict, relation_dict))
        valid = np.asarray(_read_triplets_as_list(valid_path, entity_dict, relation_dict))
        test = np.asarray(_read_triplets_as_list(test_path, entity_dict, relation_dict))
        num_nodes = len(entity_dict)
        num_rels = len(relation_dict)
        if self.verbose:
            print("# entities: {}".format(num_nodes))
            print("# relations: {}".format(num_rels))
            print("# training edges: {}".format(len(train)))
            print("# validation edges: {}".format(len(valid)))
            print("# testing edges: {}".format(len(test)))
        # for compatibility with the deprecated train/valid/test properties
        self._train = train
        self._valid = valid
        self._test = test
        self._num_nodes = num_nodes
        self._num_rels = num_rels
        # build graph
        g, data = build_knowledge_graph(num_nodes, num_rels, train, valid, test, reverse=self.reverse)
        etype, ntype, train_edge_mask, valid_edge_mask, test_edge_mask, train_mask, val_mask, test_mask = data
        g.edata['train_edge_mask'] = train_edge_mask
        g.edata['valid_edge_mask'] = valid_edge_mask
        g.edata['test_edge_mask'] = test_edge_mask
        g.edata['train_mask'] = train_mask
        g.edata['val_mask'] = val_mask
        g.edata['test_mask'] = test_mask
        g.edata['etype'] = etype
        g.ndata['ntype'] = ntype
        self._g = g

    def has_cache(self):
        """Return True when both the cached graph and info files exist."""
        graph_path = os.path.join(self.save_path,
                                  self.save_name + '.bin')
        info_path = os.path.join(self.save_path,
                                 self.save_name + '.pkl')
        return os.path.exists(graph_path) and os.path.exists(info_path)

    def __getitem__(self, idx):
        assert idx == 0, "This dataset has only one graph"
        return self._g

    def __len__(self):
        return 1

    def save(self):
        """Save the graph and dataset info to the cache directory."""
        graph_path = os.path.join(self.save_path,
                                  self.save_name + '.bin')
        info_path = os.path.join(self.save_path,
                                 self.save_name + '.pkl')
        save_graphs(str(graph_path), self._g)
        save_info(str(info_path), {'num_nodes': self.num_nodes,
                                   'num_rels': self.num_rels})

    def load(self):
        """Load the cached graph and dataset info from disk."""
        graph_path = os.path.join(self.save_path,
                                  self.save_name + '.bin')
        info_path = os.path.join(self.save_path,
                                 self.save_name + '.pkl')
        graphs, _ = load_graphs(str(graph_path))
        info = load_info(str(info_path))
        self._num_nodes = info['num_nodes']
        self._num_rels = info['num_rels']
        self._g = graphs[0]
        train_mask = self._g.edata['train_mask'].numpy()
        val_mask = self._g.edata['val_mask'].numpy()
        test_mask = self._g.edata['test_mask'].numpy()
        # convert mask tensors into bool tensors if possible
        # NOTE: these are *edge* masks and were stored in edata by process();
        # reading them through ndata (as the original code did) raises KeyError.
        self._g.edata['train_edge_mask'] = generate_mask_tensor(self._g.edata['train_edge_mask'].numpy())
        self._g.edata['valid_edge_mask'] = generate_mask_tensor(self._g.edata['valid_edge_mask'].numpy())
        self._g.edata['test_edge_mask'] = generate_mask_tensor(self._g.edata['test_edge_mask'].numpy())
        self._g.edata['train_mask'] = generate_mask_tensor(train_mask)
        self._g.edata['val_mask'] = generate_mask_tensor(val_mask)
        self._g.edata['test_mask'] = generate_mask_tensor(test_mask)
        # for compatibility (with 0.4.x) regenerate train, valid and test triplets
        etype = self._g.edata['etype'].numpy()
        self._etype = etype
        u, v = self._g.all_edges(form='uv')
        u = u.numpy()
        v = v.numpy()
        # np.nonzero returns a tuple of index arrays; take the single axis so
        # len() below reports the number of edges, not the tuple length
        train_idx = np.nonzero(train_mask == 1)[0]
        self._train = np.column_stack((u[train_idx], etype[train_idx], v[train_idx]))
        valid_idx = np.nonzero(val_mask == 1)[0]
        self._valid = np.column_stack((u[valid_idx], etype[valid_idx], v[valid_idx]))
        test_idx = np.nonzero(test_mask == 1)[0]
        self._test = np.column_stack((u[test_idx], etype[test_idx], v[test_idx]))
        if self.verbose:
            print("# entities: {}".format(self._num_nodes))
            print("# relations: {}".format(self._num_rels))
            print("# training edges: {}".format(len(train_idx)))
            print("# validation edges: {}".format(len(valid_idx)))
            print("# testing edges: {}".format(len(test_idx)))

    @property
    def num_nodes(self):
        # number of entities in the knowledge base
        return self._num_nodes

    @property
    def num_rels(self):
        # number of (forward) relation types
        return self._num_rels

    @property
    def save_name(self):
        return self.name + '_dgl_graph'

    @property
    def train(self):
        deprecate_property('dataset.train', 'g.edata[\'train_mask\']')
        return self._train

    @property
    def valid(self):
        deprecate_property('dataset.valid', 'g.edata[\'val_mask\']')
        return self._valid

    @property
    def test(self):
        deprecate_property('dataset.test', 'g.edata[\'test_mask\']')
        return self._test
def _read_dictionary(filename):
d = {}
with open(filename, 'r+') as f:
for line in f:
line = line.strip().split('\t')
d[line[1]] = int(line[0])
return d
def _read_triplets(filename):
with open(filename, 'r+') as f:
for line in f:
processed_line = line.strip().split('\t')
yield processed_line
def _read_triplets_as_list(filename, entity_dict, relation_dict):
    """Read a triplet file and map every (head, rel, tail) to integer IDs.

    Parameters
    ----------
    filename : str
        Path to a tab-separated triplet file.
    entity_dict : dict
        Mapping from entity name to integer id.
    relation_dict : dict
        Mapping from relation name to integer id.

    Returns
    -------
    list of [int, int, int]
        One ``[head_id, rel_id, tail_id]`` entry per line.
    """
    return [[entity_dict[fields[0]],
             relation_dict[fields[1]],
             entity_dict[fields[2]]]
            for fields in _read_triplets(filename)]
def build_knowledge_graph(num_nodes, num_rels, train, valid, test, reverse=True):
    """Create a DGL homogeneous graph with heterograph info stored as node or edge features.

    Edges are grouped by relation type (so that same-typed edges are
    contiguous in the final graph), with optional reverse edges whose
    relation ids are shifted by ``num_rels``.

    Parameters
    ----------
    num_nodes : int
        Number of entities.
    num_rels : int
        Number of (forward) relation types.
    train, valid, test : iterable of (src, rel, dst)
        Integer triplets for each split.
    reverse : bool
        Whether to also add a reverse edge per triplet. Default: True.

    Returns
    -------
    (g, (etype, ntype, train_edge_mask, valid_edge_mask, test_edge_mask,
         train_mask, val_mask, test_mask))
        The graph plus per-edge relation types, per-node types, and split
        masks. The ``*_edge_mask`` tensors cover only the original (forward)
        edges; the ``*_mask`` tensors also cover reverse edges when
        ``reverse`` is True.
    """
    raw_subg = {}
    raw_subg_eset = {}
    raw_subg_etype = {}
    raw_reverse_subg = {}
    raw_reverse_subg_eset = {}
    raw_reverse_subg_etype = {}
    # here there is only one node type
    s_type = "node"
    d_type = "node"

    def add_edge(s, r, d, reverse, edge_set):
        # record the forward edge under its canonical (src, rel, dst) type
        r_type = str(r)
        e_type = (s_type, r_type, d_type)
        if raw_subg.get(e_type, None) is None:
            raw_subg[e_type] = ([], [])
            raw_subg_eset[e_type] = []
            raw_subg_etype[e_type] = []
        raw_subg[e_type][0].append(s)
        raw_subg[e_type][1].append(d)
        raw_subg_eset[e_type].append(edge_set)
        raw_subg_etype[e_type].append(r)

        if reverse is True:
            # reverse edges get relation ids shifted by num_rels
            r_type = str(r + num_rels)
            re_type = (d_type, r_type, s_type)
            if raw_reverse_subg.get(re_type, None) is None:
                raw_reverse_subg[re_type] = ([], [])
                raw_reverse_subg_etype[re_type] = []
                raw_reverse_subg_eset[re_type] = []
            raw_reverse_subg[re_type][0].append(d)
            raw_reverse_subg[re_type][1].append(s)
            raw_reverse_subg_eset[re_type].append(edge_set)
            raw_reverse_subg_etype[re_type].append(r + num_rels)

    # edge_set tags: 1 = train, 2 = valid, 3 = test
    for edge in train:
        s, r, d = edge
        assert r < num_rels
        add_edge(s, r, d, reverse, 1)  # train set
    for edge in valid:
        s, r, d = edge
        assert r < num_rels
        add_edge(s, r, d, reverse, 2)  # valid set
    for edge in test:
        s, r, d = edge
        assert r < num_rels
        add_edge(s, r, d, reverse, 3)  # test set

    fg_s = []
    fg_d = []
    fg_etype = []
    fg_settype = []
    for e_type, val in raw_subg.items():
        s, d = val
        fg_s.append(np.asarray(s))
        fg_d.append(np.asarray(d))
        fg_etype.append(np.asarray(raw_subg_etype[e_type]))
        fg_settype.append(np.asarray(raw_subg_eset[e_type]))

    # *_edge_mask marks only the forward edges; when reverse edges exist they
    # are padded with 0 (i.e. excluded from these masks)
    settype = np.concatenate(fg_settype)
    if reverse is True:
        settype = np.concatenate([settype, np.full((settype.shape[0]), 0)])
    train_edge_mask = generate_mask_tensor(settype == 1)
    valid_edge_mask = generate_mask_tensor(settype == 2)
    test_edge_mask = generate_mask_tensor(settype == 3)

    for e_type, val in raw_reverse_subg.items():
        s, d = val
        fg_s.append(np.asarray(s))
        fg_d.append(np.asarray(d))
        fg_etype.append(np.asarray(raw_reverse_subg_etype[e_type]))
        fg_settype.append(np.asarray(raw_reverse_subg_eset[e_type]))

    s = np.concatenate(fg_s)
    d = np.concatenate(fg_d)
    g = dgl_graph((s, d), num_nodes=num_nodes)
    etype = np.concatenate(fg_etype)
    settype = np.concatenate(fg_settype)
    etype = F.tensor(etype, dtype=F.data_type_dict['int64'])
    # *_mask additionally covers the reverse edges when reverse=True
    train_mask = generate_mask_tensor(settype == 1) if reverse is True else train_edge_mask
    valid_mask = generate_mask_tensor(settype == 2) if reverse is True else valid_edge_mask
    test_mask = generate_mask_tensor(settype == 3) if reverse is True else test_edge_mask
    ntype = F.full_1d(num_nodes, 0, dtype=F.data_type_dict['int64'], ctx=F.cpu())
    return g, (etype, ntype, train_edge_mask, valid_edge_mask, test_edge_mask, train_mask, valid_mask, test_mask)
class FB15k237Dataset(KnowledgeGraphDataset):
    r"""FB15k237 link prediction dataset.

    .. deprecated:: 0.5.0

        `train` is deprecated, it is replaced by:

        >>> dataset = FB15k237Dataset()
        >>> graph = dataset[0]
        >>> train_mask = graph.edata['train_mask']
        >>> train_idx = th.nonzero(train_mask).squeeze()
        >>> src, dst = graph.edges(train_idx)
        >>> rel = graph.edata['etype'][train_idx]

        `valid` is deprecated, it is replaced by:

        >>> dataset = FB15k237Dataset()
        >>> graph = dataset[0]
        >>> val_mask = graph.edata['val_mask']
        >>> val_idx = th.nonzero(val_mask).squeeze()
        >>> src, dst = graph.edges(val_idx)
        >>> rel = graph.edata['etype'][val_idx]

        `test` is deprecated, it is replaced by:

        >>> dataset = FB15k237Dataset()
        >>> graph = dataset[0]
        >>> test_mask = graph.edata['test_mask']
        >>> test_idx = th.nonzero(test_mask).squeeze()
        >>> src, dst = graph.edges(test_idx)
        >>> rel = graph.edata['etype'][test_idx]

    FB15k-237 is a subset of FB15k where inverse
    relations are removed. When creating the dataset,
    a reverse edge with reversed relation types are
    created for each edge by default.

    FB15k237 dataset statistics:
    Nodes: 14541
    Number of relation types: 237
    Number of reversed relation types: 237
    Label Split: Train: 272115 ,Valid: 17535, Test: 20466

    Parameters
    ----------
    reverse : bool
        Whether to add reverse edge. Default True.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose: bool
        Whether to print out progress information. Default: True.

    Attributes
    ----------
    num_nodes: int
        Number of nodes
    num_rels: int
        Number of relation types
    train: numpy array
        A numpy array of triplets (src, rel, dst) for the training graph
    valid: numpy array
        A numpy array of triplets (src, rel, dst) for the validation graph
    test: numpy array
        A numpy array of triplets (src, rel, dst) for the test graph

    Examples
    ----------
    >>> dataset = FB15k237Dataset()
    >>> g = dataset.graph
    >>> e_type = g.edata['e_type']
    >>>
    >>> # get data split
    >>> train_mask = g.edata['train_mask']
    >>> val_mask = g.edata['val_mask']
    >>> test_mask = g.edata['test_mask']
    >>>
    >>> train_set = th.arange(g.number_of_edges())[train_mask]
    >>> val_set = th.arange(g.number_of_edges())[val_mask]
    >>>
    >>> # build train_g
    >>> train_edges = train_set
    >>> train_g = g.edge_subgraph(train_edges,
                                  preserve_nodes=True)
    >>> train_g.edata['e_type'] = e_type[train_edges];
    >>>
    >>> # build val_g
    >>> val_edges = th.cat([train_edges, val_edges])
    >>> val_g = g.edge_subgraph(val_edges,
                                preserve_nodes=True)
    >>> val_g.edata['e_type'] = e_type[val_edges];
    >>>
    >>> # Train, Validation and Test
    >>>
    """
    def __init__(self, reverse=True, raw_dir=None, force_reload=False, verbose=True):
        name = 'FB15k-237'
        super(FB15k237Dataset, self).__init__(name, reverse, raw_dir, force_reload, verbose)

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        -----------
        idx: int
            Item index, FB15k237Dataset has only one graph object

        Return
        -------
        dgl.DGLGraph
            The graph contains

            - edata['e_type']: edge relation type
            - edata['train_edge_mask']: positive training edge mask
            - edata['val_edge_mask']: positive validation edge mask
            - edata['test_edge_mask']: positive testing edge mask
            - edata['train_mask']: training edge set mask (include reversed training edges)
            - edata['val_mask']: validation edge set mask (include reversed validation edges)
            - edata['test_mask']: testing edge set mask (include reversed testing edges)
            - ndata['ntype']: node type. All 0 in this dataset
        """
        return super(FB15k237Dataset, self).__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset."""
        # BUG FIX: the original passed an undefined `idx` to super().__len__()
        return super(FB15k237Dataset, self).__len__()
class FB15kDataset(KnowledgeGraphDataset):
    r"""FB15k link prediction dataset.

    .. deprecated:: 0.5.0

        `train` is deprecated, it is replaced by:

        >>> dataset = FB15kDataset()
        >>> graph = dataset[0]
        >>> train_mask = graph.edata['train_mask']
        >>> train_idx = th.nonzero(train_mask).squeeze()
        >>> src, dst = graph.edges(train_idx)
        >>> rel = graph.edata['etype'][train_idx]

        `valid` is deprecated, it is replaced by:

        >>> dataset = FB15kDataset()
        >>> graph = dataset[0]
        >>> val_mask = graph.edata['val_mask']
        >>> val_idx = th.nonzero(val_mask).squeeze()
        >>> src, dst = graph.edges(val_idx)
        >>> rel = graph.edata['etype'][val_idx]

        `test` is deprecated, it is replaced by:

        >>> dataset = FB15kDataset()
        >>> graph = dataset[0]
        >>> test_mask = graph.edata['test_mask']
        >>> test_idx = th.nonzero(test_mask).squeeze()
        >>> src, dst = graph.edges(test_idx)
        >>> rel = graph.edata['etype'][test_idx]

    The FB15K dataset was introduced in http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf,
    It is a subset of Freebase which contains about
    14,951 entities with 1,345 different relations.
    When creating the dataset, a reverse edge with
    reversed relation types are created for each edge
    by default.

    FB15k dataset statistics:
    Nodes: 14,951
    Number of relation types: 1,345
    Number of reversed relation types: 1,345
    Label Split: Train: 483142 ,Valid: 50000, Test: 59071

    Parameters
    ----------
    reverse : bool
        Whether to add reverse edge. Default True.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose: bool
        Whether to print out progress information. Default: True.

    Attributes
    ----------
    num_nodes: int
        Number of nodes
    num_rels: int
        Number of relation types
    train: numpy array
        A numpy array of triplets (src, rel, dst) for the training graph
    valid: numpy array
        A numpy array of triplets (src, rel, dst) for the validation graph
    test: numpy array
        A numpy array of triplets (src, rel, dst) for the test graph

    Examples
    ----------
    >>> dataset = FB15kDataset()
    >>> g = dataset.graph
    >>> e_type = g.edata['e_type']
    >>>
    >>> # get data split
    >>> train_mask = g.edata['train_mask']
    >>> val_mask = g.edata['val_mask']
    >>>
    >>> train_set = th.arange(g.number_of_edges())[train_mask]
    >>> val_set = th.arange(g.number_of_edges())[val_mask]
    >>>
    >>> # build train_g
    >>> train_edges = train_set
    >>> train_g = g.edge_subgraph(train_edges,
                                  preserve_nodes=True)
    >>> train_g.edata['e_type'] = e_type[train_edges];
    >>>
    >>> # build val_g
    >>> val_edges = th.cat([train_edges, val_edges])
    >>> val_g = g.edge_subgraph(val_edges,
                                preserve_nodes=True)
    >>> val_g.edata['e_type'] = e_type[val_edges];
    >>>
    >>> # Train, Validation and Test
    >>>
    """
    def __init__(self, reverse=True, raw_dir=None, force_reload=False, verbose=True):
        name = 'FB15k'
        super(FB15kDataset, self).__init__(name, reverse, raw_dir, force_reload, verbose)

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        -----------
        idx: int
            Item index, FB15kDataset has only one graph object

        Return
        -------
        dgl.DGLGraph
            The graph contains

            - edata['e_type']: edge relation type
            - edata['train_edge_mask']: positive training edge mask
            - edata['val_edge_mask']: positive validation edge mask
            - edata['test_edge_mask']: positive testing edge mask
            - edata['train_mask']: training edge set mask (include reversed training edges)
            - edata['val_mask']: validation edge set mask (include reversed validation edges)
            - edata['test_mask']: testing edge set mask (include reversed testing edges)
            - ndata['ntype']: node type. All 0 in this dataset
        """
        return super(FB15kDataset, self).__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset."""
        # BUG FIX: the original passed an undefined `idx` to super().__len__()
        return super(FB15kDataset, self).__len__()
class WN18Dataset(KnowledgeGraphDataset):
    r""" WN18 link prediction dataset.

    .. deprecated:: 0.5.0

        `train` is deprecated, it is replaced by:

        >>> dataset = WN18Dataset()
        >>> graph = dataset[0]
        >>> train_mask = graph.edata['train_mask']
        >>> train_idx = th.nonzero(train_mask).squeeze()
        >>> src, dst = graph.edges(train_idx)
        >>> rel = graph.edata['etype'][train_idx]

        `valid` is deprecated, it is replaced by:

        >>> dataset = WN18Dataset()
        >>> graph = dataset[0]
        >>> val_mask = graph.edata['val_mask']
        >>> val_idx = th.nonzero(val_mask).squeeze()
        >>> src, dst = graph.edges(val_idx)
        >>> rel = graph.edata['etype'][val_idx]

        `test` is deprecated, it is replaced by:

        >>> dataset = WN18Dataset()
        >>> graph = dataset[0]
        >>> test_mask = graph.edata['test_mask']
        >>> test_idx = th.nonzero(test_mask).squeeze()
        >>> src, dst = graph.edges(test_idx)
        >>> rel = graph.edata['etype'][test_idx]

    The WN18 dataset was introduced in http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf,
    It included the full 18 relations scraped from
    WordNet for roughly 41,000 synsets. When creating
    the dataset, a reverse edge with reversed relation
    types are created for each edge by default.

    WN18 dataset statistics:
    Nodes: 40943
    Number of relation types: 18
    Number of reversed relation types: 18
    Label Split: Train: 141442 ,Valid: 5000, Test: 5000

    Parameters
    ----------
    reverse : bool
        Whether to add reverse edge. Default True.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose: bool
        Whether to print out progress information. Default: True.

    Attributes
    ----------
    num_nodes: int
        Number of nodes
    num_rels: int
        Number of relation types
    train: numpy array
        A numpy array of triplets (src, rel, dst) for the training graph
    valid: numpy array
        A numpy array of triplets (src, rel, dst) for the validation graph
    test: numpy array
        A numpy array of triplets (src, rel, dst) for the test graph

    Examples
    ----------
    >>> dataset = WN18Dataset()
    >>> g = dataset.graph
    >>> e_type = g.edata['e_type']
    >>>
    >>> # get data split
    >>> train_mask = g.edata['train_mask']
    >>> val_mask = g.edata['val_mask']
    >>>
    >>> train_set = th.arange(g.number_of_edges())[train_mask]
    >>> val_set = th.arange(g.number_of_edges())[val_mask]
    >>>
    >>> # build train_g
    >>> train_edges = train_set
    >>> train_g = g.edge_subgraph(train_edges,
                                  preserve_nodes=True)
    >>> train_g.edata['e_type'] = e_type[train_edges];
    >>>
    >>> # build val_g
    >>> val_edges = th.cat([train_edges, val_edges])
    >>> val_g = g.edge_subgraph(val_edges,
                                preserve_nodes=True)
    >>> val_g.edata['e_type'] = e_type[val_edges];
    >>>
    >>> # Train, Validation and Test
    >>>
    """
    def __init__(self, reverse=True, raw_dir=None, force_reload=False, verbose=True):
        name = 'wn18'
        super(WN18Dataset, self).__init__(name, reverse, raw_dir, force_reload, verbose)

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        -----------
        idx: int
            Item index, WN18Dataset has only one graph object

        Return
        -------
        dgl.DGLGraph
            The graph contains

            - edata['e_type']: edge relation type
            - edata['train_edge_mask']: positive training edge mask
            - edata['val_edge_mask']: positive validation edge mask
            - edata['test_edge_mask']: positive testing edge mask
            - edata['train_mask']: training edge set mask (include reversed training edges)
            - edata['val_mask']: validation edge set mask (include reversed validation edges)
            - edata['test_mask']: testing edge set mask (include reversed testing edges)
            - ndata['ntype']: node type. All 0 in this dataset
        """
        return super(WN18Dataset, self).__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset."""
        # BUG FIX: the original passed an undefined `idx` to super().__len__()
        return super(WN18Dataset, self).__len__()
def load_data(dataset):
    r"""Load knowledge graph dataset for RGCN link prediction tasks

    It supports three datasets: wn18, FB15k and FB15k-237

    Parameters
    ----------
    dataset: str
        The name of the dataset to load.

    Return
    ------
    The dataset object.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of 'wn18', 'FB15k' or 'FB15k-237'.
    """
    if dataset == 'wn18':
        return WN18Dataset()
    elif dataset == 'FB15k':
        return FB15kDataset()
    elif dataset == 'FB15k-237':
        return FB15k237Dataset()
    # fail loudly instead of silently returning None for unknown names
    raise ValueError("Unknown dataset: {}. Expected one of "
                     "'wn18', 'FB15k', 'FB15k-237'.".format(dataset))
...@@ -73,7 +73,7 @@ class RDFGraphDataset(DGLBuiltinDataset): ...@@ -73,7 +73,7 @@ class RDFGraphDataset(DGLBuiltinDataset):
Attributes Attributes
---------- ----------
graph : dgl.DGLHeteroGraph graph : dgl.DGLGraph
Graph structure Graph structure
num_classes : int num_classes : int
Number of classes to predict Number of classes to predict
...@@ -426,7 +426,7 @@ class RDFGraphDataset(DGLBuiltinDataset): ...@@ -426,7 +426,7 @@ class RDFGraphDataset(DGLBuiltinDataset):
return g return g
def __len__(self): def __len__(self):
r"""The number of examples in the dataset.""" r"""The number of graphs in the dataset."""
return 1 return 1
@property @property
...@@ -538,17 +538,34 @@ def _get_id(dict, key): ...@@ -538,17 +538,34 @@ def _get_id(dict, key):
return id return id
class AIFBDataset(RDFGraphDataset): class AIFBDataset(RDFGraphDataset):
r"""AIFB dataset. r"""AIFB dataset for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in
data mining. It records the organizational structure of AIFB at the data mining. It records the organizational structure of AIFB at the
University of Karlsruhe. University of Karlsruhe.
Statistics
=== AIFB dataset statistics:
Nodes: 7262 Nodes: 7262
Edges: 48810 (including reverse edges) Edges: 48810 (including reverse edges)
Target Category: Personen Target Category: Personen
Number of Classes: 4 Number of Classes: 4
Label Split: Train: 140, Test: 36 Label Split: Train: 140, Test: 36
Parameters Parameters
----------- -----------
print_every: int print_every: int
...@@ -562,18 +579,21 @@ class AIFBDataset(RDFGraphDataset): ...@@ -562,18 +579,21 @@ class AIFBDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False Whether to reload the dataset. Default: False
verbose: bool verbose: bool
Whether to print out progress information. Default: True. Whether to print out progress information. Default: True.
Returns
=== Attributes
AIFBDataset object with three properties: ----------
graph: A Heterogenous graph containing the num_classes : int
graph structure, node features and labels. Number of classes to predict
- ndata['train_mask']: mask for training node set predict_category : str
- ndata['test_mask']: mask for testing node set The entity category (node type) that has labels for prediction
- ndata['labels']: mask for labels labels : Tensor
predict_category: The category name to run the node classification All the labels of the entities in ``predict_category``
prediction. graph : dgl.DGLGraph
num_of_class: number of publication categories Graph structure
for the classification task. train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. to ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. to ``predict_category``.
Examples Examples
-------- --------
...@@ -608,6 +628,28 @@ class AIFBDataset(RDFGraphDataset): ...@@ -608,6 +628,28 @@ class AIFBDataset(RDFGraphDataset):
force_reload=force_reload, force_reload=force_reload,
verbose=verbose) verbose=verbose)
def __getitem__(self, idx):
r"""Gets the graph object
Parameters
-----------
idx: int
Item index, AIFBDataset has only one graph object
Return
-------
dgl.DGLGraph
graph structure, node features and labels.
- ndata['train_mask']: mask for training node set
- ndata['test_mask']: mask for testing node set
- ndata['labels']: mask for labels
"""
return super(AIFBDataset, self).__getitem__(idx)
def __len__(self):
r"""The number of graphs in the dataset."""
return super(AIFBDataset, self).__len__(idx)
def parse_entity(self, term): def parse_entity(self, term):
if isinstance(term, rdf.Literal): if isinstance(term, rdf.Literal):
return Entity(e_id=str(term), cls="_Literal") return Entity(e_id=str(term), cls="_Literal")
...@@ -657,14 +699,30 @@ class AIFB(AIFBDataset): ...@@ -657,14 +699,30 @@ class AIFB(AIFBDataset):
class MUTAGDataset(RDFGraphDataset): class MUTAGDataset(RDFGraphDataset):
r"""MUTAG dataset. r"""MUTAG dataset for node classification task
Statistics
=== .. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
Mutag dataset statistics:
Nodes: 27163 Nodes: 27163
Edges: 148100 (including reverse edges) Edges: 148100 (including reverse edges)
Target Category: d Target Category: d
Number of Classes: 2 Number of Classes: 2
Label Split: Train: 272, Test: 68 Label Split: Train: 272, Test: 68
Parameters Parameters
----------- -----------
print_every: int print_every: int
...@@ -678,18 +736,21 @@ class MUTAGDataset(RDFGraphDataset): ...@@ -678,18 +736,21 @@ class MUTAGDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False Whether to reload the dataset. Default: False
verbose: bool verbose: bool
Whether to print out progress information. Default: True. Whether to print out progress information. Default: True.
Returns
=== Attributes
MUTAGDataset object with three properties: ----------
graph: A Heterogenous graph containing the num_classes : int
graph structure, node features and labels. Number of classes to predict
- ndata['train_mask']: mask for training node set predict_category : str
- ndata['test_mask']: mask for testing node set The entity category (node type) that has labels for prediction
- ndata['labels']: mask for labels labels : Tensor
predict_category: The category name to run the node classification All the labels of the entities in ``predict_category``
prediction. graph : dgl.DGLGraph
num_of_class: number of publication categories Graph structure
for the classification task. train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. to ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. to ``predict_category``.
Examples Examples
-------- --------
...@@ -730,6 +791,28 @@ class MUTAGDataset(RDFGraphDataset): ...@@ -730,6 +791,28 @@ class MUTAGDataset(RDFGraphDataset):
force_reload=force_reload, force_reload=force_reload,
verbose=verbose) verbose=verbose)
def __getitem__(self, idx):
r"""Gets the graph object
Parameters
-----------
idx: int
Item index, MUTAGDataset has only one graph object
Return
-------
dgl.DGLGraph
graph structure, node features and labels.
- ndata['train_mask']: mask for training node set
- ndata['test_mask']: mask for testing node set
- ndata['labels']: mask for labels
"""
return super(MUTAGDataset, self).__getitem__(idx)
def __len__(self):
r"""The number of graphs in the dataset."""
return super(MUTAGDataset, self).__len__(idx)
def parse_entity(self, term): def parse_entity(self, term):
if isinstance(term, rdf.Literal): if isinstance(term, rdf.Literal):
return Entity(e_id=str(term), cls="_Literal") return Entity(e_id=str(term), cls="_Literal")
...@@ -795,19 +878,36 @@ class MUTAG(MUTAGDataset): ...@@ -795,19 +878,36 @@ class MUTAG(MUTAGDataset):
verbose) verbose)
class BGSDataset(RDFGraphDataset): class BGSDataset(RDFGraphDataset):
"""BGS dataset. r"""BGS dataset for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
BGS namespace convention: BGS namespace convention:
http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE
We ignored all literal nodes and the relations connecting them in the We ignored all literal nodes and the relations connecting them in the
output graph. We also ignored the relation used to mark whether a output graph. We also ignored the relation used to mark whether a
term is CURRENT or DEPRECATED. term is CURRENT or DEPRECATED.
Statistics
=== BGS dataset statistics:
Nodes: 94806 Nodes: 94806
Edges: 672884 (including reverse edges) Edges: 672884 (including reverse edges)
Target Category: Lexicon/NamedRockUnit Target Category: Lexicon/NamedRockUnit
Number of Classes: 2 Number of Classes: 2
Label Split: Train: 117, Test: 29 Label Split: Train: 117, Test: 29
Parameters Parameters
----------- -----------
print_every: int print_every: int
...@@ -821,18 +921,22 @@ class BGSDataset(RDFGraphDataset): ...@@ -821,18 +921,22 @@ class BGSDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False Whether to reload the dataset. Default: False
verbose: bool verbose: bool
Whether to print out progress information. Default: True. Whether to print out progress information. Default: True.
Returns
=== Attributes
BGSDataset object with three properties: ----------
graph: A Heterogenous graph containing the num_classes : int
graph structure, node features and labels. Number of classes to predict
- ndata['train_mask']: mask for training node set predict_category : str
- ndata['test_mask']: mask for testing node set The entity category (node type) that has labels for prediction
- ndata['labels']: mask for labels labels : Tensor
predict_category: The category name to run the node classification All the labels of the entities in ``predict_category``
prediction. graph : dgl.DGLGraph
num_of_class: number of publication categories Graph structure
for the classification task. train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. to ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. to ``predict_category``.
Examples Examples
-------- --------
>>> dataset = dgl.data.rdf.BGSDataset() >>> dataset = dgl.data.rdf.BGSDataset()
...@@ -866,6 +970,28 @@ class BGSDataset(RDFGraphDataset): ...@@ -866,6 +970,28 @@ class BGSDataset(RDFGraphDataset):
force_reload=force_reload, force_reload=force_reload,
verbose=verbose) verbose=verbose)
def __getitem__(self, idx):
    r"""Return the graph stored in the dataset.

    Parameters
    ----------
    idx : int
        Item index; BGSDataset holds only one graph object.

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels:

        - ``ndata['train_mask']``: mask for the training node set
        - ``ndata['test_mask']``: mask for the testing node set
        - ``ndata['labels']``: node labels
    """
    # Delegate to the shared RDFGraphDataset indexing logic.
    graph = super(BGSDataset, self).__getitem__(idx)
    return graph
def __len__(self):
    r"""Return the number of graphs in the dataset.

    Returns
    -------
    int
        Number of graphs (BGSDataset holds a single graph object).
    """
    # Bug fix: the original forwarded an undefined name ``idx`` to
    # ``__len__``, which takes no arguments — that raised NameError
    # the moment ``len(dataset)`` was called.
    return super(BGSDataset, self).__len__()
def parse_entity(self, term): def parse_entity(self, term):
if isinstance(term, rdf.Literal): if isinstance(term, rdf.Literal):
return None return None
...@@ -927,15 +1053,30 @@ class BGS(BGSDataset): ...@@ -927,15 +1053,30 @@ class BGS(BGSDataset):
class AMDataset(RDFGraphDataset): class AMDataset(RDFGraphDataset):
"""AM dataset. """AM dataset. for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
Namespace convention: Namespace convention:
Instance: http://purl.org/collections/nl/am/<type>-<id> Instance: http://purl.org/collections/nl/am/<type>-<id>
Relation: http://purl.org/collections/nl/am/<name> Relation: http://purl.org/collections/nl/am/<name>
We ignored all literal nodes and the relations connecting them in the We ignored all literal nodes and the relations connecting them in the
output graph. output graph.
Statistics AM dataset statistics:
===
Nodes: 881680 Nodes: 881680
Edges: 5668682 (including reverse edges) Edges: 5668682 (including reverse edges)
Target Category: proxy Target Category: proxy
...@@ -956,18 +1097,20 @@ class AMDataset(RDFGraphDataset): ...@@ -956,18 +1097,20 @@ class AMDataset(RDFGraphDataset):
verbose: bool verbose: bool
Whether to print out progress information. Default: True. Whether to print out progress information. Default: True.
Returns Attributes
=== ----------
AMDataset object with three properties: num_classes : int
graph: A Heterogenous graph containing the Number of classes to predict
graph structure, node features and labels. predict_category : str
- ndata['train_mask']: mask for training node set The entity category (node type) that has labels for prediction
- ndata['test_mask']: mask for testing node set labels : Tensor
- ndata['labels']: mask for labels All the labels of the entities in ``predict_category``
predict_category: The category name to run the node classification graph : dgl.DGLGraph
prediction. Graph structure
num_of_class: number of publication categories train_idx : Tensor
for the classification task. Entity IDs for training. All IDs are local IDs w.r.t. to ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. to ``predict_category``.
Examples Examples
-------- --------
...@@ -1002,6 +1145,28 @@ class AMDataset(RDFGraphDataset): ...@@ -1002,6 +1145,28 @@ class AMDataset(RDFGraphDataset):
force_reload=force_reload, force_reload=force_reload,
verbose=verbose) verbose=verbose)
def __getitem__(self, idx):
    r"""Return the graph stored in the dataset.

    Parameters
    ----------
    idx : int
        Item index; AMDataset holds only one graph object.

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels:

        - ``ndata['train_mask']``: mask for the training node set
        - ``ndata['test_mask']``: mask for the testing node set
        - ``ndata['labels']``: node labels
    """
    # Delegate to the shared RDFGraphDataset indexing logic.
    graph = super(AMDataset, self).__getitem__(idx)
    return graph
def __len__(self):
    r"""Return the number of graphs in the dataset.

    Returns
    -------
    int
        Number of graphs (AMDataset holds a single graph object).
    """
    # Bug fix: the original forwarded an undefined name ``idx`` to
    # ``__len__``, which takes no arguments — that raised NameError
    # the moment ``len(dataset)`` was called.
    return super(AMDataset, self).__len__()
def parse_entity(self, term): def parse_entity(self, term):
if isinstance(term, rdf.Literal): if isinstance(term, rdf.Literal):
return None return None
......
...@@ -10,6 +10,9 @@ import pickle ...@@ -10,6 +10,9 @@ import pickle
import errno import errno
import numpy as np import numpy as np
import pickle
import errno
from .graph_serialize import save_graphs, load_graphs, load_labels from .graph_serialize import save_graphs, load_graphs, load_labels
from .tensor_serialize import save_tensors, load_tensors from .tensor_serialize import save_tensors, load_tensors
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment