[Dataset] Builtin knowledge graph dataset (#1881)

* builtin knowledge graph dataset * upd * docstring * Fix Co-authored-by: Ubuntu <ubuntu@ip-172-31-51-214.ec2.internal> Co-authored-by: Tong He <hetong007@gmail.com>

[Dataset] Builtin knowledge graph dataset (#1881)
* builtin knowledge graph dataset * upd * docstring * Fix Co-authored-by: Ubuntu <ubuntu@ip-172-31-51-214.ec2.internal> Co-authored-by: Tong He <hetong007@gmail.com>
dbb028ac · xiang song(charlie.song) · GitHub · f4608c22 · dbb028ac · dbb028ac
Unverified Commit dbb028ac authored Jul 30, 2020 by xiang song(charlie.song) Committed by GitHub Jul 30, 2020
6 changed files
--- a/examples/pytorch/rgcn/README.md
+++ b/examples/pytorch/rgcn/README.md
@@ -62,9 +62,9 @@ python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0
 ### Link Prediction
 FB15k-237: MRR 0.151 (DGL), 0.158 (paper)
 ```
-python3 link_predict.py -d FB15k-237 --gpu 0 --raw
+python3 link_predict.py -d FB15k-237 --gpu 0 --eval-protocol raw
 ```
 FB15k-237: Filtered-MRR 0.2044
 ```
-python3 link_predict.py -d FB15k-237 --gpu 0 --filtered
+python3 link_predict.py -d FB15k-237 --gpu 0 --eval-protocol filtered
 ```
--- a/examples/pytorch/rgcn/link_predict.py
+++ b/examples/pytorch/rgcn/link_predict.py
@@ -19,7 +19,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import random
-from dgl.contrib.data import load_data
+from dgl.data.knowledge_graph import load_data
 from dgl.nn.pytorch import RelGraphConv
 from model import BaseRGCN

--- a/examples/pytorch/rgcn/utils.py
+++ b/examples/pytorch/rgcn/utils.py
@@ -135,7 +135,7 @@ def build_graph_from_triplets(num_nodes, num_rels, triplets):
        This function also generates edge type and normalization factor
        (reciprocal of node incoming degree)
    """
-    g = dgl.DGLGraph()
+    g = dgl.graph([])
    g.add_nodes(num_nodes)
    src, rel, dst = triplets
    src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))

--- a/python/dgl/data/knowledge_graph.py
+++ b/python/dgl/data/knowledge_graph.py
--- a/python/dgl/data/rdf.py
+++ b/python/dgl/data/rdf.py
@@ -73,7 +73,7 @@ class RDFGraphDataset(DGLBuiltinDataset):
    Attributes
    ----------
-    graph : dgl.DGLHeteroGraph
+    graph : dgl.DGLGraph
        Graph structure
    num_classes : int
        Number of classes to predict
@@ -426,7 +426,7 @@ class RDFGraphDataset(DGLBuiltinDataset):
        return g
    def __len__(self):
-        r"""The number of examples in the dataset."""
+        r"""The number of graphs in the dataset."""
        return 1
    @property
@@ -538,17 +538,34 @@ def _get_id(dict, key):
    return id
 class AIFBDataset(RDFGraphDataset):
-    r"""AIFB dataset.
+    r"""AIFB dataset for node classification task
-     AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in
-     data mining.  It records the organizational structure of AIFB at the
+    .. deprecated:: 0.5.0
-     University of Karlsruhe.
+        `graph` is deprecated, it is replaced by:
-    Statistics
+            >>> dataset = AIFBDataset()
-    ===
+            >>> graph = dataset[0]
+        `train_idx` is deprecated, it can be replaced by:
+            >>> dataset = AIFBDataset()
+            >>> graph = dataset[0]
+            >>> train_mask = graph.nodes[dataset.category].data['train_mask']
+            >>> train_idx = th.nonzero(train_mask).squeeze()
+        `test_idx` is deprecated, it can be replaced by:
+            >>> dataset = AIFBDataset()
+            >>> graph = dataset[0]
+            >>> test_mask = graph.nodes[dataset.category].data['test_mask']
+            >>> test_idx = th.nonzero(test_mask).squeeze()
+    AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in
+    data mining.  It records the organizational structure of AIFB at the
+    University of Karlsruhe.
+    AIFB dataset statistics:
    Nodes: 7262
    Edges: 48810 (including reverse edges)
    Target Category: Personen
    Number of Classes: 4
    Label Split: Train: 140, Test: 36
    Parameters
    -----------
    print_every: int
@@ -562,18 +579,21 @@ class AIFBDataset(RDFGraphDataset):
        Whether to reload the dataset. Default: False
    verbose: bool
      Whether to print out progress information. Default: True.
-    Returns
-    ===
+    Attributes
-    AIFBDataset object with three properties:
+    ----------
-        graph: A Heterogenous graph containing the
+    num_classes : int
-               graph structure, node features and labels.
+        Number of classes to predict
-            - ndata['train_mask']: mask for training node set
+    predict_category : str
-            - ndata['test_mask']: mask for testing node set
+        The entity category (node type) that has labels for prediction
-            - ndata['labels']: mask for labels
+    labels : Tensor
-        predict_category: The category name to run the node classification
+        All the labels of the entities in ``predict_category``
-            prediction.
+    graph : dgl.DGLGraph
-        num_of_class: number of publication categories
+        Graph structure
-            for the classification task.
+    train_idx : Tensor
+        Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
+    test_idx : Tensor
+        Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
    Examples
    --------
@@ -608,6 +628,28 @@ class AIFBDataset(RDFGraphDataset):
                                          force_reload=force_reload,
                                          verbose=verbose)
+    def __getitem__(self, idx):
+        r"""Get the graph object.
+        Parameters
+        ----------
+        idx : int
+            Item index. AIFBDataset has only one graph object.
+        Returns
+        -------
+        dgl.DGLGraph
+            Graph structure, node features and labels.
+            - ndata['train_mask']: mask for training node set
+            - ndata['test_mask']: mask for testing node set
+            - ndata['labels']: node labels
+        """
+        return super(AIFBDataset, self).__getitem__(idx)
+    def __len__(self):
+        r"""Return the number of graphs in the dataset, as reported by the parent class."""
+        return super(AIFBDataset, self).__len__()
    def parse_entity(self, term):
        if isinstance(term, rdf.Literal):
            return Entity(e_id=str(term), cls="_Literal")
@@ -657,14 +699,30 @@ class AIFB(AIFBDataset):
 class MUTAGDataset(RDFGraphDataset):
-    r"""MUTAG dataset.
+    r"""MUTAG dataset for node classification task
-    Statistics
-    ===
+    .. deprecated:: 0.5.0
+        `graph` is deprecated, it is replaced by:
+            >>> dataset = MUTAGDataset()
+            >>> graph = dataset[0]
+        `train_idx` is deprecated, it can be replaced by:
+            >>> dataset = MUTAGDataset()
+            >>> graph = dataset[0]
+            >>> train_mask = graph.nodes[dataset.category].data['train_mask']
+            >>> train_idx = th.nonzero(train_mask).squeeze()
+        `test_idx` is deprecated, it can be replaced by:
+            >>> dataset = MUTAGDataset()
+            >>> graph = dataset[0]
+            >>> test_mask = graph.nodes[dataset.category].data['test_mask']
+            >>> test_idx = th.nonzero(test_mask).squeeze()
+    Mutag dataset statistics:
    Nodes: 27163
    Edges: 148100 (including reverse edges)
    Target Category: d
    Number of Classes: 2
    Label Split: Train: 272, Test: 68
    Parameters
    -----------
    print_every: int
@@ -678,18 +736,21 @@ class MUTAGDataset(RDFGraphDataset):
        Whether to reload the dataset. Default: False
    verbose: bool
      Whether to print out progress information. Default: True.
-    Returns
-    ===
+    Attributes
-    MUTAGDataset object with three properties:
+    ----------
-        graph: A Heterogenous graph containing the
+    num_classes : int
-               graph structure, node features and labels.
+        Number of classes to predict
-            - ndata['train_mask']: mask for training node set
+    predict_category : str
-            - ndata['test_mask']: mask for testing node set
+        The entity category (node type) that has labels for prediction
-            - ndata['labels']: mask for labels
+    labels : Tensor
-        predict_category: The category name to run the node classification
+        All the labels of the entities in ``predict_category``
-            prediction.
+    graph : dgl.DGLGraph
-        num_of_class: number of publication categories
+        Graph structure
-            for the classification task.
+    train_idx : Tensor
+        Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
+    test_idx : Tensor
+        Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
    Examples
    --------
@@ -730,6 +791,28 @@ class MUTAGDataset(RDFGraphDataset):
                                           force_reload=force_reload,
                                           verbose=verbose)
+    def __getitem__(self, idx):
+        r"""Get the graph object.
+        Parameters
+        ----------
+        idx : int
+            Item index. MUTAGDataset has only one graph object.
+        Returns
+        -------
+        dgl.DGLGraph
+            Graph structure, node features and labels.
+            - ndata['train_mask']: mask for training node set
+            - ndata['test_mask']: mask for testing node set
+            - ndata['labels']: node labels
+        """
+        return super(MUTAGDataset, self).__getitem__(idx)
+    def __len__(self):
+        r"""Return the number of graphs in the dataset, as reported by the parent class."""
+        return super(MUTAGDataset, self).__len__()
    def parse_entity(self, term):
        if isinstance(term, rdf.Literal):
            return Entity(e_id=str(term), cls="_Literal")
@@ -795,19 +878,36 @@ class MUTAG(MUTAGDataset):
                                    verbose)
 class BGSDataset(RDFGraphDataset):
-    """BGS dataset.
+    r"""BGS dataset for node classification task
+    .. deprecated:: 0.5.0
+        `graph` is deprecated, it is replaced by:
+            >>> dataset = BGSDataset()
+            >>> graph = dataset[0]
+        `train_idx` is deprecated, it can be replaced by:
+            >>> dataset = BGSDataset()
+            >>> graph = dataset[0]
+            >>> train_mask = graph.nodes[dataset.category].data['train_mask']
+            >>> train_idx = th.nonzero(train_mask).squeeze()
+        `test_idx` is deprecated, it can be replaced by:
+            >>> dataset = BGSDataset()
+            >>> graph = dataset[0]
+            >>> test_mask = graph.nodes[dataset.category].data['test_mask']
+            >>> test_idx = th.nonzero(test_mask).squeeze()
    BGS namespace convention:
    http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE
    We ignored all literal nodes and the relations connecting them in the
    output graph. We also ignored the relation used to mark whether a
    term is CURRENT or DEPRECATED.
-    Statistics
-    ===
+    BGS dataset statistics:
    Nodes: 94806
    Edges: 672884 (including reverse edges)
    Target Category: Lexicon/NamedRockUnit
    Number of Classes: 2
    Label Split: Train: 117, Test: 29
    Parameters
    -----------
    print_every: int
@@ -821,18 +921,22 @@ class BGSDataset(RDFGraphDataset):
        Whether to reload the dataset. Default: False
    verbose: bool
      Whether to print out progress information. Default: True.
-    Returns
-    ===
+    Attributes
-    BGSDataset object with three properties:
+    ----------
-        graph: A Heterogenous graph containing the
+    num_classes : int
-               graph structure, node features and labels.
+        Number of classes to predict
-            - ndata['train_mask']: mask for training node set
+    predict_category : str
-            - ndata['test_mask']: mask for testing node set
+        The entity category (node type) that has labels for prediction
-            - ndata['labels']: mask for labels
+    labels : Tensor
-        predict_category: The category name to run the node classification
+        All the labels of the entities in ``predict_category``
-            prediction.
+    graph : dgl.DGLGraph
-        num_of_class: number of publication categories
+        Graph structure
-            for the classification task.
+    train_idx : Tensor
+        Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
+    test_idx : Tensor
+        Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
    Examples
    --------
    >>> dataset = dgl.data.rdf.BGSDataset()
@@ -866,6 +970,28 @@ class BGSDataset(RDFGraphDataset):
                                         force_reload=force_reload,
                                         verbose=verbose)
+    def __getitem__(self, idx):
+        r"""Get the graph object.
+        Parameters
+        ----------
+        idx : int
+            Item index. BGSDataset has only one graph object.
+        Returns
+        -------
+        dgl.DGLGraph
+            Graph structure, node features and labels.
+            - ndata['train_mask']: mask for training node set
+            - ndata['test_mask']: mask for testing node set
+            - ndata['labels']: node labels
+        """
+        return super(BGSDataset, self).__getitem__(idx)
+    def __len__(self):
+        r"""Return the number of graphs in the dataset, as reported by the parent class."""
+        return super(BGSDataset, self).__len__()
    def parse_entity(self, term):
        if isinstance(term, rdf.Literal):
            return None
@@ -927,15 +1053,30 @@ class BGS(BGSDataset):
 class AMDataset(RDFGraphDataset):
-    """AM dataset.
+    """AM dataset for node classification task
+    .. deprecated:: 0.5.0
+        `graph` is deprecated, it is replaced by:
+            >>> dataset = AMDataset()
+            >>> graph = dataset[0]
+        `train_idx` is deprecated, it can be replaced by:
+            >>> dataset = AMDataset()
+            >>> graph = dataset[0]
+            >>> train_mask = graph.nodes[dataset.category].data['train_mask']
+            >>> train_idx = th.nonzero(train_mask).squeeze()
+        `test_idx` is deprecated, it can be replaced by:
+            >>> dataset = AMDataset()
+            >>> graph = dataset[0]
+            >>> test_mask = graph.nodes[dataset.category].data['test_mask']
+            >>> test_idx = th.nonzero(test_mask).squeeze()
    Namespace convention:
    Instance: http://purl.org/collections/nl/am/<type>-<id>
    Relation: http://purl.org/collections/nl/am/<name>
    We ignored all literal nodes and the relations connecting them in the
    output graph.
-    Statistics
+    AM dataset statistics:
-    ===
    Nodes: 881680
    Edges: 5668682 (including reverse edges)
    Target Category: proxy
@@ -956,18 +1097,20 @@ class AMDataset(RDFGraphDataset):
    verbose: bool
      Whether to print out progress information. Default: True.
-    Returns
+    Attributes
-    ===
+    ----------
-    AMDataset object with three properties:
+    num_classes : int
-        graph: A Heterogenous graph containing the
+        Number of classes to predict
-               graph structure, node features and labels.
+    predict_category : str
-            - ndata['train_mask']: mask for training node set
+        The entity category (node type) that has labels for prediction
-            - ndata['test_mask']: mask for testing node set
+    labels : Tensor
-            - ndata['labels']: mask for labels
+        All the labels of the entities in ``predict_category``
-        predict_category: The category name to run the node classification
+    graph : dgl.DGLGraph
-            prediction.
+        Graph structure
-        num_of_class: number of publication categories
+    train_idx : Tensor
-            for the classification task.
+        Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
+    test_idx : Tensor
+        Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
    Examples
    --------
@@ -1002,6 +1145,28 @@ class AMDataset(RDFGraphDataset):
                                        force_reload=force_reload,
                                        verbose=verbose)
+    def __getitem__(self, idx):
+        r"""Get the graph object.
+        Parameters
+        ----------
+        idx : int
+            Item index. AMDataset has only one graph object.
+        Returns
+        -------
+        dgl.DGLGraph
+            Graph structure, node features and labels.
+            - ndata['train_mask']: mask for training node set
+            - ndata['test_mask']: mask for testing node set
+            - ndata['labels']: node labels
+        """
+        return super(AMDataset, self).__getitem__(idx)
+    def __len__(self):
+        r"""Return the number of graphs in the dataset, as reported by the parent class."""
+        return super(AMDataset, self).__len__()
    def parse_entity(self, term):
        if isinstance(term, rdf.Literal):
            return None

--- a/python/dgl/data/utils.py
+++ b/python/dgl/data/utils.py
@@ -10,6 +10,9 @@ import pickle
 import errno
 import numpy as np
+import pickle
+import errno
 from .graph_serialize import save_graphs, load_graphs, load_labels
 from .tensor_serialize import save_tensors, load_tensors