Unverified Commit dbb028ac authored by xiang song(charlie.song)'s avatar xiang song(charlie.song) Committed by GitHub
Browse files

[Dataset] Builtin knowledge graph dataset (#1881)



* builtin knowledge graph dataset

* upd

* docstring

* Fix
Co-authored-by: Ubuntu <ubuntu@ip-172-31-51-214.ec2.internal>
Co-authored-by: Tong He <hetong007@gmail.com>
parent f4608c22
......@@ -62,9 +62,9 @@ python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0
### Link Prediction
FB15k-237: MRR 0.151 (DGL), 0.158 (paper)
```
python3 link_predict.py -d FB15k-237 --gpu 0 --raw
python3 link_predict.py -d FB15k-237 --gpu 0 --eval-protocol raw
```
FB15k-237: Filtered-MRR 0.2044
```
python3 link_predict.py -d FB15k-237 --gpu 0 --filtered
python3 link_predict.py -d FB15k-237 --gpu 0 --eval-protocol filtered
```
......@@ -19,7 +19,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from dgl.contrib.data import load_data
from dgl.data.knowledge_graph import load_data
from dgl.nn.pytorch import RelGraphConv
from model import BaseRGCN
......
......@@ -135,7 +135,7 @@ def build_graph_from_triplets(num_nodes, num_rels, triplets):
This function also generates edge type and normalization factor
(reciprocal of node incoming degree)
"""
g = dgl.DGLGraph()
g = dgl.graph([])
g.add_nodes(num_nodes)
src, rel, dst = triplets
src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
......
This diff is collapsed.
......@@ -73,7 +73,7 @@ class RDFGraphDataset(DGLBuiltinDataset):
Attributes
----------
graph : dgl.DGLHeteroGraph
graph : dgl.DGLGraph
Graph structure
num_classes : int
Number of classes to predict
......@@ -426,7 +426,7 @@ class RDFGraphDataset(DGLBuiltinDataset):
return g
def __len__(self):
r"""The number of examples in the dataset."""
r"""The number of graphs in the dataset."""
return 1
@property
......@@ -538,17 +538,34 @@ def _get_id(dict, key):
return id
class AIFBDataset(RDFGraphDataset):
r"""AIFB dataset.
r"""AIFB dataset for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in
data mining. It records the organizational structure of AIFB at the
University of Karlsruhe.
Statistics
===
AIFB dataset statistics:
Nodes: 7262
Edges: 48810 (including reverse edges)
Target Category: Personen
Number of Classes: 4
Label Split: Train: 140, Test: 36
Parameters
-----------
print_every: int
......@@ -562,18 +579,21 @@ class AIFBDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Returns
===
AIFBDataset object with three properties:
graph: A Heterogenous graph containing the
graph structure, node features and labels.
- ndata['train_mask']: mask for training node set
- ndata['test_mask']: mask for testing node set
- ndata['labels']: mask for labels
predict_category: The category name to run the node classification
prediction.
num_of_class: number of publication categories
for the classification task.
Attributes
----------
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
labels : Tensor
All the labels of the entities in ``predict_category``
graph : dgl.DGLGraph
Graph structure
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples
--------
......@@ -608,6 +628,28 @@ class AIFBDataset(RDFGraphDataset):
force_reload=force_reload,
verbose=verbose)
def __getitem__(self, idx):
    r"""Fetch the graph stored in this dataset.

    Parameters
    ----------
    idx : int
        Index of the requested item. AIFBDataset has only one graph object.

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels.

        - ``ndata['train_mask']``: mask for training node set
        - ``ndata['test_mask']``: mask for testing node set
        - ``ndata['labels']``: labels of the nodes
    """
    # The lookup itself is implemented by the parent class; this override
    # only delegates to it.
    parent = super(AIFBDataset, self)
    return parent.__getitem__(idx)
def __len__(self):
    r"""The number of graphs in the dataset.

    Returns
    -------
    int
        The length reported by the parent class.
    """
    # Delegate without arguments: ``__len__`` accepts only ``self`` and no
    # ``idx`` name exists in this scope, so passing one raises NameError as
    # soon as ``len(dataset)`` is evaluated.
    return super(AIFBDataset, self).__len__()
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
return Entity(e_id=str(term), cls="_Literal")
......@@ -657,14 +699,30 @@ class AIFB(AIFBDataset):
class MUTAGDataset(RDFGraphDataset):
r"""MUTAG dataset.
Statistics
===
r"""MUTAG dataset for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
Mutag dataset statistics:
Nodes: 27163
Edges: 148100 (including reverse edges)
Target Category: d
Number of Classes: 2
Label Split: Train: 272, Test: 68
Parameters
-----------
print_every: int
......@@ -678,18 +736,21 @@ class MUTAGDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Returns
===
MUTAGDataset object with three properties:
graph: A Heterogenous graph containing the
graph structure, node features and labels.
- ndata['train_mask']: mask for training node set
- ndata['test_mask']: mask for testing node set
- ndata['labels']: mask for labels
predict_category: The category name to run the node classification
prediction.
num_of_class: number of publication categories
for the classification task.
Attributes
----------
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
labels : Tensor
All the labels of the entities in ``predict_category``
graph : dgl.DGLGraph
Graph structure
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples
--------
......@@ -730,6 +791,28 @@ class MUTAGDataset(RDFGraphDataset):
force_reload=force_reload,
verbose=verbose)
def __getitem__(self, idx):
    r"""Fetch the graph stored in this dataset.

    Parameters
    ----------
    idx : int
        Index of the requested item. MUTAGDataset has only one graph object.

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels.

        - ``ndata['train_mask']``: mask for training node set
        - ``ndata['test_mask']``: mask for testing node set
        - ``ndata['labels']``: labels of the nodes
    """
    # The lookup itself is implemented by the parent class; this override
    # only delegates to it.
    parent = super(MUTAGDataset, self)
    return parent.__getitem__(idx)
def __len__(self):
    r"""The number of graphs in the dataset.

    Returns
    -------
    int
        The length reported by the parent class.
    """
    # Delegate without arguments: ``__len__`` accepts only ``self`` and no
    # ``idx`` name exists in this scope, so passing one raises NameError as
    # soon as ``len(dataset)`` is evaluated.
    return super(MUTAGDataset, self).__len__()
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
return Entity(e_id=str(term), cls="_Literal")
......@@ -795,19 +878,36 @@ class MUTAG(MUTAGDataset):
verbose)
class BGSDataset(RDFGraphDataset):
"""BGS dataset.
r"""BGS dataset for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
BGS namespace convention:
http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE
We ignored all literal nodes and the relations connecting them in the
output graph. We also ignored the relation used to mark whether a
term is CURRENT or DEPRECATED.
Statistics
===
BGS dataset statistics:
Nodes: 94806
Edges: 672884 (including reverse edges)
Target Category: Lexicon/NamedRockUnit
Number of Classes: 2
Label Split: Train: 117, Test: 29
Parameters
-----------
print_every: int
......@@ -821,18 +921,22 @@ class BGSDataset(RDFGraphDataset):
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Returns
===
BGSDataset object with three properties:
graph: A Heterogenous graph containing the
graph structure, node features and labels.
- ndata['train_mask']: mask for training node set
- ndata['test_mask']: mask for testing node set
- ndata['labels']: mask for labels
predict_category: The category name to run the node classification
prediction.
num_of_class: number of publication categories
for the classification task.
Attributes
----------
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
labels : Tensor
All the labels of the entities in ``predict_category``
graph : dgl.DGLGraph
Graph structure
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples
--------
>>> dataset = dgl.data.rdf.BGSDataset()
......@@ -866,6 +970,28 @@ class BGSDataset(RDFGraphDataset):
force_reload=force_reload,
verbose=verbose)
def __getitem__(self, idx):
    r"""Fetch the graph stored in this dataset.

    Parameters
    ----------
    idx : int
        Index of the requested item. BGSDataset has only one graph object.

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels.

        - ``ndata['train_mask']``: mask for training node set
        - ``ndata['test_mask']``: mask for testing node set
        - ``ndata['labels']``: labels of the nodes
    """
    # The lookup itself is implemented by the parent class; this override
    # only delegates to it.
    parent = super(BGSDataset, self)
    return parent.__getitem__(idx)
def __len__(self):
    r"""The number of graphs in the dataset.

    Returns
    -------
    int
        The length reported by the parent class.
    """
    # Delegate without arguments: ``__len__`` accepts only ``self`` and no
    # ``idx`` name exists in this scope, so passing one raises NameError as
    # soon as ``len(dataset)`` is evaluated.
    return super(BGSDataset, self).__len__()
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
return None
......@@ -927,15 +1053,30 @@ class BGS(BGSDataset):
class AMDataset(RDFGraphDataset):
"""AM dataset.
"""AM dataset for node classification task
.. deprecated:: 0.5.0
`graph` is deprecated, it is replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
`train_idx` is deprecated, it can be replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
`test_idx` is deprecated, it can be replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
Namespace convention:
Instance: http://purl.org/collections/nl/am/<type>-<id>
Relation: http://purl.org/collections/nl/am/<name>
We ignored all literal nodes and the relations connecting them in the
output graph.
Statistics
===
AM dataset statistics:
Nodes: 881680
Edges: 5668682 (including reverse edges)
Target Category: proxy
......@@ -956,18 +1097,20 @@ class AMDataset(RDFGraphDataset):
verbose: bool
Whether to print out progress information. Default: True.
Returns
===
AMDataset object with three properties:
graph: A Heterogenous graph containing the
graph structure, node features and labels.
- ndata['train_mask']: mask for training node set
- ndata['test_mask']: mask for testing node set
- ndata['labels']: mask for labels
predict_category: The category name to run the node classification
prediction.
num_of_class: number of publication categories
for the classification task.
Attributes
----------
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
labels : Tensor
All the labels of the entities in ``predict_category``
graph : dgl.DGLGraph
Graph structure
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples
--------
......@@ -1002,6 +1145,28 @@ class AMDataset(RDFGraphDataset):
force_reload=force_reload,
verbose=verbose)
def __getitem__(self, idx):
    r"""Fetch the graph stored in this dataset.

    Parameters
    ----------
    idx : int
        Index of the requested item. AMDataset has only one graph object.

    Returns
    -------
    dgl.DGLGraph
        Graph structure, node features and labels.

        - ``ndata['train_mask']``: mask for training node set
        - ``ndata['test_mask']``: mask for testing node set
        - ``ndata['labels']``: labels of the nodes
    """
    # The lookup itself is implemented by the parent class; this override
    # only delegates to it.
    parent = super(AMDataset, self)
    return parent.__getitem__(idx)
def __len__(self):
    r"""The number of graphs in the dataset.

    Returns
    -------
    int
        The length reported by the parent class.
    """
    # Delegate without arguments: ``__len__`` accepts only ``self`` and no
    # ``idx`` name exists in this scope, so passing one raises NameError as
    # soon as ``len(dataset)`` is evaluated.
    return super(AMDataset, self).__len__()
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
return None
......
......@@ -10,6 +10,9 @@ import pickle
import errno
import numpy as np
import pickle
import errno
from .graph_serialize import save_graphs, load_graphs, load_labels
from .tensor_serialize import save_tensors, load_tensors
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment