Unverified Commit dbb028ac authored by xiang song(charlie.song)'s avatar xiang song(charlie.song) Committed by GitHub
Browse files

[Dataset] Builtin knowledge graph dataset (#1881)



* builtin knowledge graph dataset

* upd

* docstring

* Fix
Co-authored-by: default avatarUbuntu <ubuntu@ip-172-31-51-214.ec2.internal>
Co-authored-by: default avatarTong He <hetong007@gmail.com>
parent f4608c22
...@@ -62,9 +62,9 @@ python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 ...@@ -62,9 +62,9 @@ python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0
### Link Prediction ### Link Prediction
FB15k-237: MRR 0.151 (DGL), 0.158 (paper) FB15k-237: MRR 0.151 (DGL), 0.158 (paper)
``` ```
python3 link_predict.py -d FB15k-237 --gpu 0 --raw python3 link_predict.py -d FB15k-237 --gpu 0 --eval-protocol raw
``` ```
FB15k-237: Filtered-MRR 0.2044 FB15k-237: Filtered-MRR 0.2044
``` ```
python3 link_predict.py -d FB15k-237 --gpu 0 --filtered python3 link_predict.py -d FB15k-237 --gpu 0 --eval-protocol filtered
``` ```
...@@ -19,7 +19,7 @@ import torch ...@@ -19,7 +19,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import random import random
from dgl.contrib.data import load_data from dgl.data.knowledge_graph import load_data
from dgl.nn.pytorch import RelGraphConv from dgl.nn.pytorch import RelGraphConv
from model import BaseRGCN from model import BaseRGCN
......
...@@ -135,7 +135,7 @@ def build_graph_from_triplets(num_nodes, num_rels, triplets): ...@@ -135,7 +135,7 @@ def build_graph_from_triplets(num_nodes, num_rels, triplets):
This function also generates edge type and normalization factor This function also generates edge type and normalization factor
(reciprocal of node incoming degree) (reciprocal of node incoming degree)
""" """
g = dgl.DGLGraph() g = dgl.graph([])
g.add_nodes(num_nodes) g.add_nodes(num_nodes)
src, rel, dst = triplets src, rel, dst = triplets
src, dst = np.concatenate((src, dst)), np.concatenate((dst, src)) src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
......
This diff is collapsed.
...@@ -73,7 +73,7 @@ class RDFGraphDataset(DGLBuiltinDataset): ...@@ -73,7 +73,7 @@ class RDFGraphDataset(DGLBuiltinDataset):
Attributes Attributes
---------- ----------
graph : dgl.DGLHeteroGraph graph : dgl.DGLGraph
Graph structure Graph structure
num_classes : int num_classes : int
Number of classes to predict Number of classes to predict
...@@ -426,7 +426,7 @@ class RDFGraphDataset(DGLBuiltinDataset): ...@@ -426,7 +426,7 @@ class RDFGraphDataset(DGLBuiltinDataset):
return g return g
def __len__(self): def __len__(self):
r"""The number of examples in the dataset.""" r"""The number of graphs in the dataset."""
return 1 return 1
@property @property
...@@ -538,17 +538,34 @@ def _get_id(dict, key): ...@@ -538,17 +538,34 @@ def _get_id(dict, key):
return id return id
class AIFBDataset(RDFGraphDataset): class AIFBDataset(RDFGraphDataset):
r"""AIFB dataset. r"""AIFB dataset for node classification task
AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in
data mining. It records the organizational structure of AIFB at the .. deprecated:: 0.5.0
University of Karlsruhe. `graph` is deprecated, it is replaced by:
Statistics >>> dataset = AIFBDataset()
=== >>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in
data mining. It records the organizational structure of AIFB at the
University of Karlsruhe.
AIFB dataset statistics:
Nodes: 7262 Nodes: 7262
Edges: 48810 (including reverse edges) Edges: 48810 (including reverse edges)
Target Category: Personen Target Category: Personen
Number of Classes: 4 Number of Classes: 4
Label Split: Train: 140, Test: 36 Label Split: Train: 140, Test: 36
Parameters Parameters
----------- -----------
print_every: int print_every: int
...@@ -562,18 +579,21 @@ class AIFBDataset(RDFGraphDataset): ...@@ -562,18 +579,21 @@ class AIFBDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False Whether to reload the dataset. Default: False
verbose: bool verbose: bool
Whether to print out progress information. Default: True. Whether to print out progress information. Default: True.
Returns
=== Attributes
AIFBDataset object with three properties: ----------
graph: A Heterogenous graph containing the num_classes : int
graph structure, node features and labels. Number of classes to predict
- ndata['train_mask']: mask for training node set predict_category : str
- ndata['test_mask']: mask for testing node set The entity category (node type) that has labels for prediction
- ndata['labels']: mask for labels labels : Tensor
predict_category: The category name to run the node classification All the labels of the entities in ``predict_category``
prediction. graph : dgl.DGLGraph
num_of_class: number of publication categories Graph structure
for the classification task. train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples Examples
-------- --------
...@@ -608,6 +628,28 @@ class AIFBDataset(RDFGraphDataset): ...@@ -608,6 +628,28 @@ class AIFBDataset(RDFGraphDataset):
force_reload=force_reload, force_reload=force_reload,
verbose=verbose) verbose=verbose)
def __getitem__(self, idx):
    r"""Fetch the graph object held by this dataset.

    Parameters
    ----------
    idx : int
        Item index, AIFBDataset has only one graph object

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels.

        - ``ndata['train_mask']``: mask for training node set
        - ``ndata['test_mask']``: mask for testing node set
        - ``ndata['labels']``: labels
    """
    # Pure delegation: the parent class owns the actual lookup.
    graph = super(AIFBDataset, self).__getitem__(idx)
    return graph
def __len__(self):
    r"""The number of graphs in the dataset.

    Returns
    -------
    int
        The length reported by the parent class.
    """
    # ``__len__`` takes no argument besides ``self``. The earlier call passed
    # ``idx``, which is not defined in this scope, so ``len(dataset)`` raised
    # NameError before ever reaching the parent implementation.
    return super(AIFBDataset, self).__len__()
def parse_entity(self, term): def parse_entity(self, term):
if isinstance(term, rdf.Literal): if isinstance(term, rdf.Literal):
return Entity(e_id=str(term), cls="_Literal") return Entity(e_id=str(term), cls="_Literal")
...@@ -657,14 +699,30 @@ class AIFB(AIFBDataset): ...@@ -657,14 +699,30 @@ class AIFB(AIFBDataset):
class MUTAGDataset(RDFGraphDataset): class MUTAGDataset(RDFGraphDataset):
r"""MUTAG dataset. r"""MUTAG dataset for node classification task
Statistics
=== .. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
Mutag dataset statistics:
Nodes: 27163 Nodes: 27163
Edges: 148100 (including reverse edges) Edges: 148100 (including reverse edges)
Target Category: d Target Category: d
Number of Classes: 2 Number of Classes: 2
Label Split: Train: 272, Test: 68 Label Split: Train: 272, Test: 68
Parameters Parameters
----------- -----------
print_every: int print_every: int
...@@ -678,18 +736,21 @@ class MUTAGDataset(RDFGraphDataset): ...@@ -678,18 +736,21 @@ class MUTAGDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False Whether to reload the dataset. Default: False
verbose: bool verbose: bool
Whether to print out progress information. Default: True. Whether to print out progress information. Default: True.
Returns
=== Attributes
MUTAGDataset object with three properties: ----------
graph: A Heterogenous graph containing the num_classes : int
graph structure, node features and labels. Number of classes to predict
- ndata['train_mask']: mask for training node set predict_category : str
- ndata['test_mask']: mask for testing node set The entity category (node type) that has labels for prediction
- ndata['labels']: mask for labels labels : Tensor
predict_category: The category name to run the node classification All the labels of the entities in ``predict_category``
prediction. graph : dgl.DGLGraph
num_of_class: number of publication categories Graph structure
for the classification task. train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples Examples
-------- --------
...@@ -730,6 +791,28 @@ class MUTAGDataset(RDFGraphDataset): ...@@ -730,6 +791,28 @@ class MUTAGDataset(RDFGraphDataset):
force_reload=force_reload, force_reload=force_reload,
verbose=verbose) verbose=verbose)
def __getitem__(self, idx):
    r"""Fetch the graph object held by this dataset.

    Parameters
    ----------
    idx : int
        Item index, MUTAGDataset has only one graph object

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels.

        - ``ndata['train_mask']``: mask for training node set
        - ``ndata['test_mask']``: mask for testing node set
        - ``ndata['labels']``: labels
    """
    # Pure delegation: the parent class owns the actual lookup.
    graph = super(MUTAGDataset, self).__getitem__(idx)
    return graph
def __len__(self):
    r"""The number of graphs in the dataset.

    Returns
    -------
    int
        The length reported by the parent class.
    """
    # ``__len__`` takes no argument besides ``self``. The earlier call passed
    # ``idx``, which is not defined in this scope, so ``len(dataset)`` raised
    # NameError before ever reaching the parent implementation.
    return super(MUTAGDataset, self).__len__()
def parse_entity(self, term): def parse_entity(self, term):
if isinstance(term, rdf.Literal): if isinstance(term, rdf.Literal):
return Entity(e_id=str(term), cls="_Literal") return Entity(e_id=str(term), cls="_Literal")
...@@ -795,19 +878,36 @@ class MUTAG(MUTAGDataset): ...@@ -795,19 +878,36 @@ class MUTAG(MUTAGDataset):
verbose) verbose)
class BGSDataset(RDFGraphDataset): class BGSDataset(RDFGraphDataset):
"""BGS dataset. r"""BGS dataset for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
BGS namespace convention: BGS namespace convention:
http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE
We ignored all literal nodes and the relations connecting them in the We ignored all literal nodes and the relations connecting them in the
output graph. We also ignored the relation used to mark whether a output graph. We also ignored the relation used to mark whether a
term is CURRENT or DEPRECATED. term is CURRENT or DEPRECATED.
Statistics
=== BGS dataset statistics:
Nodes: 94806 Nodes: 94806
Edges: 672884 (including reverse edges) Edges: 672884 (including reverse edges)
Target Category: Lexicon/NamedRockUnit Target Category: Lexicon/NamedRockUnit
Number of Classes: 2 Number of Classes: 2
Label Split: Train: 117, Test: 29 Label Split: Train: 117, Test: 29
Parameters Parameters
----------- -----------
print_every: int print_every: int
...@@ -821,18 +921,22 @@ class BGSDataset(RDFGraphDataset): ...@@ -821,18 +921,22 @@ class BGSDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False Whether to reload the dataset. Default: False
verbose: bool verbose: bool
Whether to print out progress information. Default: True. Whether to print out progress information. Default: True.
Returns
=== Attributes
BGSDataset object with three properties: ----------
graph: A Heterogenous graph containing the num_classes : int
graph structure, node features and labels. Number of classes to predict
- ndata['train_mask']: mask for training node set predict_category : str
- ndata['test_mask']: mask for testing node set The entity category (node type) that has labels for prediction
- ndata['labels']: mask for labels labels : Tensor
predict_category: The category name to run the node classification All the labels of the entities in ``predict_category``
prediction. graph : dgl.DGLGraph
num_of_class: number of publication categories Graph structure
for the classification task. train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples Examples
-------- --------
>>> dataset = dgl.data.rdf.BGSDataset() >>> dataset = dgl.data.rdf.BGSDataset()
...@@ -866,6 +970,28 @@ class BGSDataset(RDFGraphDataset): ...@@ -866,6 +970,28 @@ class BGSDataset(RDFGraphDataset):
force_reload=force_reload, force_reload=force_reload,
verbose=verbose) verbose=verbose)
def __getitem__(self, idx):
    r"""Fetch the graph object held by this dataset.

    Parameters
    ----------
    idx : int
        Item index, BGSDataset has only one graph object

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels.

        - ``ndata['train_mask']``: mask for training node set
        - ``ndata['test_mask']``: mask for testing node set
        - ``ndata['labels']``: labels
    """
    # Pure delegation: the parent class owns the actual lookup.
    graph = super(BGSDataset, self).__getitem__(idx)
    return graph
def __len__(self):
    r"""The number of graphs in the dataset.

    Returns
    -------
    int
        The length reported by the parent class.
    """
    # ``__len__`` takes no argument besides ``self``. The earlier call passed
    # ``idx``, which is not defined in this scope, so ``len(dataset)`` raised
    # NameError before ever reaching the parent implementation.
    return super(BGSDataset, self).__len__()
def parse_entity(self, term): def parse_entity(self, term):
if isinstance(term, rdf.Literal): if isinstance(term, rdf.Literal):
return None return None
...@@ -927,15 +1053,30 @@ class BGS(BGSDataset): ...@@ -927,15 +1053,30 @@ class BGS(BGSDataset):
class AMDataset(RDFGraphDataset): class AMDataset(RDFGraphDataset):
"""AM dataset. """AM dataset for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
Namespace convention: Namespace convention:
Instance: http://purl.org/collections/nl/am/<type>-<id> Instance: http://purl.org/collections/nl/am/<type>-<id>
Relation: http://purl.org/collections/nl/am/<name> Relation: http://purl.org/collections/nl/am/<name>
We ignored all literal nodes and the relations connecting them in the We ignored all literal nodes and the relations connecting them in the
output graph. output graph.
Statistics AM dataset statistics:
===
Nodes: 881680 Nodes: 881680
Edges: 5668682 (including reverse edges) Edges: 5668682 (including reverse edges)
Target Category: proxy Target Category: proxy
...@@ -956,18 +1097,20 @@ class AMDataset(RDFGraphDataset): ...@@ -956,18 +1097,20 @@ class AMDataset(RDFGraphDataset):
verbose: bool verbose: bool
Whether to print out progress information. Default: True. Whether to print out progress information. Default: True.
Returns Attributes
=== ----------
AMDataset object with three properties: num_classes : int
graph: A Heterogenous graph containing the Number of classes to predict
graph structure, node features and labels. predict_category : str
- ndata['train_mask']: mask for training node set The entity category (node type) that has labels for prediction
- ndata['test_mask']: mask for testing node set labels : Tensor
- ndata['labels']: mask for labels All the labels of the entities in ``predict_category``
predict_category: The category name to run the node classification graph : dgl.DGLGraph
prediction. Graph structure
num_of_class: number of publication categories train_idx : Tensor
for the classification task. Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples Examples
-------- --------
...@@ -1002,6 +1145,28 @@ class AMDataset(RDFGraphDataset): ...@@ -1002,6 +1145,28 @@ class AMDataset(RDFGraphDataset):
force_reload=force_reload, force_reload=force_reload,
verbose=verbose) verbose=verbose)
def __getitem__(self, idx):
    r"""Fetch the graph object held by this dataset.

    Parameters
    ----------
    idx : int
        Item index, AMDataset has only one graph object

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels.

        - ``ndata['train_mask']``: mask for training node set
        - ``ndata['test_mask']``: mask for testing node set
        - ``ndata['labels']``: labels
    """
    # Pure delegation: the parent class owns the actual lookup.
    graph = super(AMDataset, self).__getitem__(idx)
    return graph
def __len__(self):
    r"""The number of graphs in the dataset.

    Returns
    -------
    int
        The length reported by the parent class.
    """
    # ``__len__`` takes no argument besides ``self``. The earlier call passed
    # ``idx``, which is not defined in this scope, so ``len(dataset)`` raised
    # NameError before ever reaching the parent implementation.
    return super(AMDataset, self).__len__()
def parse_entity(self, term): def parse_entity(self, term):
if isinstance(term, rdf.Literal): if isinstance(term, rdf.Literal):
return None return None
......
...@@ -10,6 +10,9 @@ import pickle ...@@ -10,6 +10,9 @@ import pickle
import errno import errno
import numpy as np import numpy as np
import pickle
import errno
from .graph_serialize import save_graphs, load_graphs, load_labels from .graph_serialize import save_graphs, load_graphs, load_labels
from .tensor_serialize import save_tensors, load_tensors from .tensor_serialize import save_tensors, load_tensors
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment