Unverified Commit d9c25521 authored by Mufei Li, committed by GitHub

[Data] AsGraphPredDataset (#4073)

* Update

* CI

* Update

* Update

* Fix

* Fix
parent 9922f41f
......@@ -99,6 +99,7 @@ Dataset adapters
AsNodePredDataset
AsLinkPredDataset
AsGraphPredDataset
Utilities
-----------------
......
......@@ -389,6 +389,8 @@ After loading, the dataset has multiple homographs with features and labels:
>>> print(data1)
{'feat': tensor([0.5348, 0.2864, 0.1155], dtype=torch.float64), 'label': tensor(0)}
If there is a single feature column in ``graphs.csv``, ``data0`` will directly be the feature tensor rather than a dictionary.
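A minimal sketch of this behavior, assuming a hypothetical ``./single_feat_dataset`` directory whose ``graphs.csv`` contains only a ``graph_id`` column and one ``feat`` column:

.. code::

    from dgl.data import CSVDataset

    dataset = CSVDataset('./single_feat_dataset')  # hypothetical path
    graph0, data0 = dataset[0]
    # With a single feature column, data0 is the feature tensor itself,
    # not a dict such as {'feat': ...}
    print(data0)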
Custom Data Parser
~~~~~~~~~~~~~~~~~~
......
......@@ -29,7 +29,7 @@ the data.
Take :class:`~dgl.data.QM7bDataset` as an example:
.. code::
from dgl.data import DGLDataset
......@@ -37,33 +37,33 @@ Take :class:`~dgl.data.QM7bDataset` as example:
_url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \
'datasets/qm7b.mat'
_sha1_str = '4102c744bb9d6fd7b40ac67a300e49cd87e28392'
def __init__(self, raw_dir=None, force_reload=False, verbose=False):
super(QM7bDataset, self).__init__(name='qm7b',
url=self._url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
mat_path = self.raw_path + '.mat'
# process data to a list of graphs and a list of labels
self.graphs, self.label = self._load_graph(mat_path)
def __getitem__(self, idx):
""" Get graph and label by index
Parameters
----------
idx : int
Item index
Returns
-------
(dgl.DGLGraph, Tensor)
"""
return self.graphs[idx], self.label[idx]
def __len__(self):
"""Number of graphs in the dataset"""
return len(self.graphs)
......@@ -78,33 +78,33 @@ for details of ``self._load_graph()`` and ``__getitem__``.
One can also add properties to the class to provide some useful
information about the dataset. In :class:`~dgl.data.QM7bDataset`, one can add a property
``num_labels`` to indicate the total number of prediction tasks in this
``num_tasks`` to indicate the total number of prediction tasks in this
multi-task dataset:
.. code::
@property
def num_labels(self):
def num_tasks(self):
"""Number of labels for each graph, i.e. number of prediction tasks."""
return 14
After all these coding, one can finally use :class:`~dgl.data.QM7bDataset` as
follows:
.. code::
import dgl
import torch
from dgl.dataloading import GraphDataLoader
# load data
dataset = QM7bDataset()
num_labels = dataset.num_labels
num_tasks = dataset.num_tasks
# create dataloaders
dataloader = GraphDataLoader(dataset, batch_size=1, shuffle=True)
# training
for epoch in range(100):
for g, labels in dataloader:
......@@ -115,7 +115,7 @@ A complete guide for training graph classification models can be found
in :ref:`guide-training-graph-classification`.
For more examples of graph classification datasets, please refer to DGL's builtin graph classification
datasets:
* :ref:`gindataset`
......@@ -140,18 +140,18 @@ computation and analysis conducted on the graph. DGL provides an API called
:func:`dgl.reorder_graph` for this purpose. Please refer to ``process()``
part in below example for more details.
.. code::
from dgl.data import DGLBuiltinDataset
from dgl.data.utils import _get_dgl_url
class CitationGraphDataset(DGLBuiltinDataset):
_urls = {
'cora_v2' : 'dataset/cora_v2.zip',
'citeseer' : 'dataset/citeseer.zip',
'pubmed' : 'dataset/pubmed.zip',
}
def __init__(self, name, raw_dir=None, force_reload=False, verbose=True):
assert name.lower() in ['cora', 'citeseer', 'pubmed']
if name.lower() == 'cora':
......@@ -162,11 +162,11 @@ part in below example for more details.
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
# Skip some processing code
# === data processing skipped ===
# build graph
g = dgl.graph(graph)
# splitting masks
......@@ -178,15 +178,15 @@ part in below example for more details.
# node features
g.ndata['feat'] = torch.tensor(_preprocess_features(features),
dtype=F.data_type_dict['float32'])
self._num_labels = onehot_labels.shape[1]
self._num_tasks = onehot_labels.shape[1]
self._labels = labels
# reorder graph to obtain better locality.
self._g = dgl.reorder_graph(g)
def __getitem__(self, idx):
assert idx == 0, "This dataset has only one graph"
return self._g
def __len__(self):
return 1
......@@ -204,20 +204,20 @@ and TensorFlow, and ``float tensors`` in MXNet.
The section uses a subclass of ``CitationGraphDataset``, :class:`dgl.data.CiteseerGraphDataset`,
to show how to use it:
.. code::
# load data
dataset = CiteseerGraphDataset(raw_dir='')
graph = dataset[0]
# get split masks
train_mask = graph.ndata['train_mask']
val_mask = graph.ndata['val_mask']
test_mask = graph.ndata['test_mask']
# get node features
feats = graph.ndata['feat']
# get labels
labels = graph.ndata['label']
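As a hedged sketch of how these masks are typically consumed in training (``logits`` is assumed to come from a GNN forward pass on the graph and its features; it is not part of this example):

.. code::

    import torch.nn.functional as F

    # boolean masks index directly into node-level tensors
    loss = F.cross_entropy(logits[train_mask], labels[train_mask])
    val_acc = (logits[val_mask].argmax(1) == labels[val_mask]).float().mean()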
......@@ -258,7 +258,7 @@ The section uses builtin dataset
as an example, and still skips the detailed data processing code to
highlight the key parts for processing link prediction datasets:
.. code::
# Example for creating Link Prediction datasets
class KnowledgeGraphDataset(DGLBuiltinDataset):
......@@ -271,11 +271,11 @@ highlight the key part for processing link prediction datasets:
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
# Skip some processing code
# === data processing skipped ===
# splitting mask
g.edata['train_mask'] = train_mask
g.edata['val_mask'] = val_mask
......@@ -285,11 +285,11 @@ highlight the key part for processing link prediction datasets:
# node type
g.ndata['ntype'] = ntype
self._g = g
def __getitem__(self, idx):
assert idx == 0, "This dataset has only one graph"
return self._g
def __len__(self):
return 1
......@@ -299,14 +299,14 @@ code <https://docs.dgl.ai/en/0.5.x/_modules/dgl/data/knowledge_graph.html#Knowle
to see the complete code. The following code uses a subclass of ``KnowledgeGraphDataset``,
:class:`dgl.data.FB15k237Dataset`, to show how to use it:
.. code::
from dgl.data import FB15k237Dataset
# load data
dataset = FB15k237Dataset()
graph = dataset[0]
# get training mask
train_mask = graph.edata['train_mask']
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
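As a hedged continuation of the example, one can recover the endpoints of the training edges with ``graph.find_edges``; the ``graph.edata['etype']`` key is an assumption based on DGL's knowledge-graph datasets:

.. code::

    # endpoints and relation types of the training edges
    src, dst = graph.find_edges(train_idx)
    rel = graph.edata['etype'][train_idx]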
......@@ -319,7 +319,7 @@ A complete guide for training link prediction models can be found in
:ref:`guide-training-link-prediction`.
For more examples of link prediction datasets, please refer to DGL's
builtin datasets:
* :ref:`kgdata`
......
......@@ -29,33 +29,33 @@
_url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \
'datasets/qm7b.mat'
_sha1_str = '4102c744bb9d6fd7b40ac67a300e49cd87e28392'
def __init__(self, raw_dir=None, force_reload=False, verbose=False):
super(QM7bDataset, self).__init__(name='qm7b',
url=self._url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
mat_path = self.raw_path + '.mat'
# process data to a list of graphs and a list of labels
self.graphs, self.label = self._load_graph(mat_path)
def __getitem__(self, idx):
""" 通过idx获取对应的图和标签
Parameters
----------
idx : int
Item index
Returns
-------
(dgl.DGLGraph, Tensor)
"""
return self.graphs[idx], self.label[idx]
def __len__(self):
"""数据集中图的数量"""
return len(self.graphs)
......@@ -66,31 +66,31 @@ DGL recommends that ``__getitem__(idx)`` return a tuple ``(graph,
to obtain the details of ``self._load_graph()`` and ``__getitem__``.
One can also add properties to the class to provide some useful dataset information. In :class:`~dgl.data.QM7bDataset`,
one can add a property ``num_labels`` to indicate the total number of prediction tasks in this multi-task dataset:
one can add a property ``num_tasks`` to indicate the total number of prediction tasks in this multi-task dataset:
.. code::
@property
def num_labels(self):
def num_tasks(self):
"""每个图的标签数,即预测任务数。"""
return 14
After all this coding, one can finally use :class:`~dgl.data.QM7bDataset` as follows:
.. code::
import dgl
import torch
from dgl.dataloading import GraphDataLoader
# load data
dataset = QM7bDataset()
num_labels = dataset.num_labels
num_tasks = dataset.num_tasks
# create dataloaders
dataloader = GraphDataLoader(dataset, batch_size=1, shuffle=True)
# training
for epoch in range(100):
for g, labels in dataloader:
......@@ -125,14 +125,14 @@ DGL provides an API named :func:`dgl.reorder_graph` for this optimization. For more details,
from dgl.data import DGLBuiltinDataset
from dgl.data.utils import _get_dgl_url
class CitationGraphDataset(DGLBuiltinDataset):
_urls = {
'cora_v2' : 'dataset/cora_v2.zip',
'citeseer' : 'dataset/citeseer.zip',
'pubmed' : 'dataset/pubmed.zip',
}
def __init__(self, name, raw_dir=None, force_reload=False, verbose=True):
assert name.lower() in ['cora', 'citeseer', 'pubmed']
if name.lower() == 'cora':
......@@ -143,11 +143,11 @@ DGL provides an API named :func:`dgl.reorder_graph` for this optimization. For more details,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
# Skip some processing code
# === data processing skipped ===
# build graph
g = dgl.graph(graph)
......@@ -162,15 +162,15 @@ DGL provides an API named :func:`dgl.reorder_graph` for this optimization. For more details,
# node features
g.ndata['feat'] = torch.tensor(_preprocess_features(features),
dtype=F.data_type_dict['float32'])
self._num_labels = onehot_labels.shape[1]
self._num_tasks = onehot_labels.shape[1]
self._labels = labels
# reorder graph to obtain better locality
self._g = dgl.reorder_graph(g)
def __getitem__(self, idx):
assert idx == 0, "这个数据集里只有一个图"
return self._g
def __len__(self):
return 1
......@@ -184,20 +184,20 @@ DGL provides an API named :func:`dgl.reorder_graph` for this optimization. For more details,
The following uses :class:`dgl.data.CiteseerGraphDataset`, a subclass of :class:`dgl.data.CitationGraphDataset`,
to demonstrate the usage of a node classification dataset:
.. code::
# load data
dataset = CiteseerGraphDataset(raw_dir='')
graph = dataset[0]
# get split masks
train_mask = graph.ndata['train_mask']
val_mask = graph.ndata['val_mask']
test_mask = graph.ndata['test_mask']
# get node features
feats = graph.ndata['feat']
# get labels
labels = graph.ndata['label']
......@@ -246,7 +246,7 @@ DGL provides an API named :func:`dgl.reorder_graph` for this optimization. For more details,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
# Skip some processing code
# === data processing skipped ===
......@@ -262,11 +262,11 @@ DGL provides an API named :func:`dgl.reorder_graph` for this optimization. For more details,
# node type
g.ndata['ntype'] = ntype
self._g = g
def __getitem__(self, idx):
assert idx == 0, "这个数据集只有一个图"
return self._g
def __len__(self):
return 1
......@@ -275,14 +275,14 @@ DGL provides an API named :func:`dgl.reorder_graph` for this optimization. For more details,
`KnowledgeGraphDataset source code <https://docs.dgl.ai/en/0.5.x/_modules/dgl/data/knowledge_graph.html#KnowledgeGraphDataset>`__
contains the complete code. The following uses :class:`dgl.data.FB15k237Dataset`, a subclass of ``KnowledgeGraphDataset``, to demonstrate the usage of a link prediction dataset:
.. code::
from dgl.data import FB15k237Dataset
# load data
dataset = FB15k237Dataset()
graph = dataset[0]
# get training mask
train_mask = graph.edata['train_mask']
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
......
......@@ -7,7 +7,7 @@
One can implement the data processing code in the ``process()`` function, and the raw data is expected to be in the ``self.raw_dir`` directory. Graph machine learning typically involves three kinds of tasks: graph classification, node classification, and link prediction. This section explains how to process datasets for these tasks.
This section focuses on the standard way to process graphs, features, and masks. It uses builtin datasets as examples and skips how to build graphs from files, but provides links to the relevant implementations. For a complete guide on building graphs from external sources, refer to :ref:`guide_ko-graph-external`.
Processing Graph Classification Datasets
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......@@ -16,7 +16,7 @@
Take :class:`~dgl.data.QM7bDataset` as an example:
.. code::
from dgl.data import DGLDataset
......@@ -24,33 +24,33 @@
_url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \
'datasets/qm7b.mat'
_sha1_str = '4102c744bb9d6fd7b40ac67a300e49cd87e28392'
def __init__(self, raw_dir=None, force_reload=False, verbose=False):
super(QM7bDataset, self).__init__(name='qm7b',
url=self._url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
mat_path = self.raw_path + '.mat'
# process data to a list of graphs and a list of labels
self.graphs, self.label = self._load_graph(mat_path)
def __getitem__(self, idx):
""" Get graph and label by index
Parameters
----------
idx : int
Item index
Returns
-------
(dgl.DGLGraph, Tensor)
"""
return self.graphs[idx], self.label[idx]
def __len__(self):
"""Number of graphs in the dataset"""
return len(self.graphs)
......@@ -58,31 +58,31 @@
In ``process()``, the raw data is converted into a list of graphs and a list of labels. ``__getitem__(idx)`` and ``__len__()`` must be implemented for iteration. As in the example above, DGL recommends that ``__getitem__(idx)`` return a ``(graph, label)`` tuple. For the concrete implementation of ``self._load_graph()`` and ``__getitem__``, check the `QM7bDataset source
code <https://docs.dgl.ai/en/0.5.x/_modules/dgl/data/qm7b.html#QM7bDataset>`__.
It is also possible to add properties to the class to indicate useful information about the dataset. One can add a property ``num_labels`` to :class:`~dgl.data.QM7bDataset` to indicate the total number of prediction tasks in this multi-task dataset.
It is also possible to add properties to the class to indicate useful information about the dataset. One can add a property ``num_tasks`` to :class:`~dgl.data.QM7bDataset` to indicate the total number of prediction tasks in this multi-task dataset.
.. code::
@property
def num_labels(self):
def num_tasks(self):
"""Number of labels for each graph, i.e. number of prediction tasks."""
return 14
After finishing the implementation, one can use :class:`~dgl.data.QM7bDataset` as follows.
.. code::
import dgl
import torch
from dgl.dataloading import GraphDataLoader
# load data
dataset = QM7bDataset()
num_labels = dataset.num_labels
num_tasks = dataset.num_tasks
# create dataloaders
dataloader = GraphDataLoader(dataset, batch_size=1, shuffle=True)
# training
for epoch in range(100):
for g, labels in dataloader:
......@@ -105,18 +105,18 @@ Refer to DGL's builtin graph classification datasets for more examples of graph classification
In addition, DGL recommends relabeling nodes and edges so that nodes close to each other get IDs within a close range. This improves the locality of accessing a node's neighbors, which speeds up later computation and analysis on the graph. DGL provides the :func:`dgl.reorder_graph` API for this purpose. See the ``process()`` part of the example below for more details.
.. code::
from dgl.data import DGLBuiltinDataset
from dgl.data.utils import _get_dgl_url
class CitationGraphDataset(DGLBuiltinDataset):
_urls = {
'cora_v2' : 'dataset/cora_v2.zip',
'citeseer' : 'dataset/citeseer.zip',
'pubmed' : 'dataset/pubmed.zip',
}
def __init__(self, name, raw_dir=None, force_reload=False, verbose=True):
assert name.lower() in ['cora', 'citeseer', 'pubmed']
if name.lower() == 'cora':
......@@ -127,11 +127,11 @@ Refer to DGL's builtin graph classification datasets for more examples of graph classification
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
# Skip some processing code
# === data processing skipped ===
# build graph
g = dgl.graph(graph)
# splitting masks
......@@ -143,15 +143,15 @@ Refer to DGL's builtin graph classification datasets for more examples of graph classification
# node features
g.ndata['feat'] = torch.tensor(_preprocess_features(features),
dtype=F.data_type_dict['float32'])
self._num_labels = onehot_labels.shape[1]
self._num_tasks = onehot_labels.shape[1]
self._labels = labels
# reorder graph to obtain better locality.
self._g = dgl.reorder_graph(g)
def __getitem__(self, idx):
assert idx == 0, "This dataset has only one graph"
return self._g
def __len__(self):
return 1
......@@ -161,20 +161,20 @@ Refer to DGL's builtin graph classification datasets for more examples of graph classification
The following example shows how to use :class:`dgl.data.CiteseerGraphDataset`, a subclass of ``CitationGraphDataset``.
.. code::
# load data
dataset = CiteseerGraphDataset(raw_dir='')
graph = dataset[0]
# get split masks
train_mask = graph.ndata['train_mask']
val_mask = graph.ndata['val_mask']
test_mask = graph.ndata['test_mask']
# get node features
feats = graph.ndata['feat']
# get labels
labels = graph.ndata['label']
......@@ -205,11 +205,11 @@ DGL's builtin datasets include multiple examples of node classification datasets
Processing Link Prediction Datasets
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Processing link prediction datasets is similar to the node classification case, mainly because such a dataset contains only one graph.
The example uses the builtin `KnowledgeGraphDataset <https://docs.dgl.ai/en/0.5.x/_modules/dgl/data/knowledge_graph.html#KnowledgeGraphDataset>`__ dataset; the detailed data processing code is omitted to highlight the key parts of processing link prediction datasets:
.. code::
# Example for creating Link Prediction datasets
class KnowledgeGraphDataset(DGLBuiltinDataset):
......@@ -222,11 +222,11 @@ DGL's builtin datasets include multiple examples of node classification datasets
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
# Skip some processing code
# === data processing skipped ===
# splitting mask
g.edata['train_mask'] = train_mask
g.edata['val_mask'] = val_mask
......@@ -236,25 +236,25 @@ DGL's builtin datasets include multiple examples of node classification datasets
# node type
g.ndata['ntype'] = ntype
self._g = g
def __getitem__(self, idx):
assert idx == 0, "This dataset has only one graph"
return self._g
def __len__(self):
return 1
As shown in the code above, the split masks are added to the ``edata`` field of the graph. See the `KnowledgeGraphDataset source code <https://docs.dgl.ai/en/0.5.x/_modules/dgl/data/knowledge_graph.html#KnowledgeGraphDataset>`__ for the complete implementation.
.. code::
from dgl.data import FB15k237Dataset
# load data
dataset = FB15k237Dataset()
graph = dataset[0]
# get training mask
train_mask = graph.edata['train_mask']
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
......
......@@ -30,7 +30,7 @@ from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
from .fakenews import FakeNewsDataset
from .csv_dataset import CSVDataset
from .adapter import AsNodePredDataset, AsLinkPredDataset
from .adapter import *
from .synthetic import BAShapeDataset, BACommunityDataset, TreeCycleDataset, TreeGridDataset, BA2MotifDataset
def register_data_args(parser):
......
......@@ -9,16 +9,17 @@ from ..convert import graph as create_dgl_graph
from ..sampling.negative import _calc_redundancy
from .dgl_dataset import DGLDataset
from . import utils
from ..base import DGLError
from .. import backend as F
__all__ = ['AsNodePredDataset', 'AsLinkPredDataset']
__all__ = ['AsNodePredDataset', 'AsLinkPredDataset', 'AsGraphPredDataset']
class AsNodePredDataset(DGLDataset):
"""Repurpose a dataset for a standard semi-supervised transductive
node prediction task.
The class converts a given dataset into a new dataset object that:
The class converts a given dataset into a new dataset object such that:
- Contains only one graph, accessible from ``dataset[0]``.
- The graph stores:
......@@ -40,7 +41,7 @@ class AsNodePredDataset(DGLDataset):
So do validation and test masks.
The class will keep only the first graph in the provided dataset and
generate train/val/test masks according to the given spplit ratio. The generated
generate train/val/test masks according to the given split ratio. The generated
masks will be cached to disk for fast re-loading. If the provided split ratio
differs from the cached one, it will re-process the dataset properly.
......@@ -49,7 +50,7 @@ class AsNodePredDataset(DGLDataset):
dataset : DGLDataset
The dataset to be converted.
split_ratio : (float, float, float), optional
Split ratios for training, validation and test sets. Must sum to one.
Split ratios for training, validation and test sets. They must sum to one.
target_ntype : str, optional
The node type to add split mask for.
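A minimal usage sketch of the adapter described above, assuming the builtin Cora dataset (not part of this diff):

.. code::

    from dgl.data import AsNodePredDataset, CoraGraphDataset

    dataset = AsNodePredDataset(CoraGraphDataset(), split_ratio=(0.8, 0.1, 0.1))
    g = dataset[0]  # the single graph, with train/val/test masks in g.ndata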
......@@ -193,7 +194,7 @@ class AsLinkPredDataset(DGLDataset):
"""Repurpose a dataset for link prediction task.
The created dataset will include data needed for link prediction.
Currently only support homogeneous graph.
Currently it only supports homogeneous graphs.
It will keep only the first graph in the provided dataset and
generate train/val/test edges according to the given split ratio,
and the corresponding negative edges based on the neg_ratio. The generated
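A hedged usage sketch mirroring the description above; the ``val_edges`` attribute and its return shape are assumptions based on the class docstring, which this hunk truncates:

.. code::

    from dgl.data import AsLinkPredDataset, CoraGraphDataset

    dataset = AsLinkPredDataset(CoraGraphDataset(), split_ratio=(0.8, 0.1, 0.1), neg_ratio=3)
    g = dataset[0]                         # training graph
    val_pos, val_neg = dataset.val_edges   # positive/negative validation edges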
......@@ -368,3 +369,158 @@ class AsLinkPredDataset(DGLDataset):
def __len__(self):
return 1
class AsGraphPredDataset(DGLDataset):
"""Repurpose a dataset for standard graph property prediction task.
The created dataset will include data needed for graph property prediction.
Currently it only supports homogeneous graphs.
The class converts a given dataset into a new dataset object such that:
- It stores ``len(dataset)`` graphs.
- The i-th graph and its label are accessible from ``dataset[i]``.
The class will generate a train/val/test split if :attr:`split_ratio` is provided.
The generated split will be cached to disk for fast re-loading. If the provided split
ratio differs from the cached one, it will re-process the dataset properly.
Parameters
----------
dataset : DGLDataset
The dataset to be converted.
split_ratio : (float, float, float), optional
Split ratios for training, validation and test sets. They must sum to one.
Attributes
----------
num_tasks : int
Number of tasks to predict.
num_classes : int
Number of classes to predict per task, None for regression datasets.
train_idx : Tensor
A 1-D integer tensor of training graph indices.
val_idx : Tensor
A 1-D integer tensor of validation graph indices.
test_idx : Tensor
A 1-D integer tensor of test graph indices.
node_feat_size : int
Input node feature size, None if not applicable.
edge_feat_size : int
Input edge feature size, None if not applicable.
Examples
--------
>>> from dgl.data import AsGraphPredDataset
>>> from ogb.graphproppred import DglGraphPropPredDataset
>>> dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
>>> new_dataset = AsGraphPredDataset(dataset)
>>> print(new_dataset)
Dataset("ogbg-molhiv-as-graphpred", num_graphs=41127, save_path=...)
>>> print(len(new_dataset))
41127
>>> print(new_dataset[0])
(Graph(num_nodes=19, num_edges=40,
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)}), tensor([0]))
"""
def __init__(self,
dataset,
split_ratio=None,
**kwargs):
self.dataset = dataset
self.split_ratio = split_ratio
super().__init__(dataset.name + '-as-graphpred',
hash_key=(split_ratio, dataset.name, 'graphpred'), **kwargs)
def process(self):
is_ogb = hasattr(self.dataset, 'get_idx_split')
if self.split_ratio is None:
if is_ogb:
split = self.dataset.get_idx_split()
self.train_idx = split['train']
self.val_idx = split['valid']
self.test_idx = split['test']
else:
# Handle FakeNewsDataset
try:
self.train_idx = F.nonzero_1d(self.dataset.train_mask)
self.val_idx = F.nonzero_1d(self.dataset.val_mask)
self.test_idx = F.nonzero_1d(self.dataset.test_mask)
except AttributeError:
raise DGLError('The input dataset does not have a default train/val/test '
'split. Please specify split_ratio to generate the split.')
else:
if self.verbose:
print('Generating train/val/test split...')
train_ratio, val_ratio, _ = self.split_ratio
num_graphs = len(self.dataset)
num_train = int(num_graphs * train_ratio)
num_val = int(num_graphs * val_ratio)
idx = np.random.permutation(num_graphs)
self.train_idx = F.tensor(idx[:num_train])
self.val_idx = F.tensor(idx[num_train: num_train + num_val])
self.test_idx = F.tensor(idx[num_train + num_val:])
if hasattr(self.dataset, 'num_classes'):
# GINDataset, MiniGCDataset, FakeNewsDataset, TUDataset,
# LegacyTUDataset, BA2MotifDataset
self.num_classes = self.dataset.num_classes
else:
# None for multi-label classification and regression
self.num_classes = None
if hasattr(self.dataset, 'num_tasks'):
# OGB datasets
self.num_tasks = self.dataset.num_tasks
else:
self.num_tasks = 1
def has_cache(self):
return os.path.isfile(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)))
def load(self):
with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
info = json.load(f)
if info['split_ratio'] != self.split_ratio:
raise ValueError('Provided split ratio is different from the cached file. '
'Re-process the dataset.')
self.split_ratio = info['split_ratio']
self.num_tasks = info['num_tasks']
self.num_classes = info['num_classes']
split = np.load(os.path.join(self.save_path, 'split_{}.npz'.format(self.hash)))
self.train_idx = F.zerocopy_from_numpy(split['train_idx'])
self.val_idx = F.zerocopy_from_numpy(split['val_idx'])
self.test_idx = F.zerocopy_from_numpy(split['test_idx'])
def save(self):
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
json.dump({
'split_ratio': self.split_ratio,
'num_tasks': self.num_tasks,
'num_classes': self.num_classes}, f)
np.savez(os.path.join(self.save_path, 'split_{}.npz'.format(self.hash)),
train_idx=F.zerocopy_to_numpy(self.train_idx),
val_idx=F.zerocopy_to_numpy(self.val_idx),
test_idx=F.zerocopy_to_numpy(self.test_idx))
def __getitem__(self, idx):
return self.dataset[idx]
def __len__(self):
return len(self.dataset)
@property
def node_feat_size(self):
g = self[0][0]
return g.ndata['feat'].shape[-1] if 'feat' in g.ndata else None
@property
def edge_feat_size(self):
g = self[0][0]
return g.edata['feat'].shape[-1] if 'feat' in g.edata else None
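A hedged end-to-end sketch of the adapter above, assuming the PyTorch backend; ``QM7bDataset``, ``Subset``, and ``GraphDataLoader`` are existing DGL components, and the batch size is arbitrary:

.. code::

    from dgl.data import AsGraphPredDataset, QM7bDataset
    from dgl.data.utils import Subset
    from dgl.dataloading import GraphDataLoader

    dataset = AsGraphPredDataset(QM7bDataset(), split_ratio=[0.8, 0.1, 0.1])
    train_set = Subset(dataset, dataset.train_idx)
    train_loader = GraphDataLoader(train_set, batch_size=32, shuffle=True)
    for batched_graph, labels in train_loader:
        pass  # model forward/backward goes here
......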
import os
import numpy as np
from .dgl_dataset import DGLDataset
from .utils import save_graphs, load_graphs
from .utils import save_graphs, load_graphs, Subset
from .. import backend as F
from ..base import DGLError
......@@ -120,6 +121,8 @@ class CSVDataset(DGLDataset):
# construct graphs
self.graphs, self.data = DGLGraphConstructor.construct_graphs(
node_data, edge_data, graph_data)
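# With a single graph-level data column, also expose it directly as self.labels (mirrored in load())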
if len(self.data) == 1:
self.labels = list(self.data.values())[0]
def has_cache(self):
graph_path = os.path.join(self.save_path,
......@@ -141,14 +144,21 @@ class CSVDataset(DGLDataset):
graph_path = os.path.join(self.save_path,
self.name + '.bin')
self.graphs, self.data = load_graphs(graph_path)
if len(self.data) == 1:
self.labels = list(self.data.values())[0]
def __getitem__(self, i):
if F.is_tensor(i) and F.ndim(i) == 1:
return Subset(self, F.copy_to(i, F.cpu()))
if self._transform is None:
g = self.graphs[i]
else:
g = self._transform(self.graphs[i])
if len(self.data) > 0:
if len(self.data) == 1:
return g, self.labels[i]
elif len(self.data) > 0:
data = {k: v[i] for (k, v) in self.data.items()}
return g, data
else:
......
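A hedged sketch of the new tensor-index path in ``__getitem__`` (the dataset path is hypothetical):

.. code::

    import torch
    from dgl.data import CSVDataset

    dataset = CSVDataset('./my_csv_dataset')   # hypothetical path
    subset = dataset[torch.tensor([0, 2, 4])]  # a 1-D tensor index returns a Subset
    g, label = subset[0]                       # items still yield (graph, data) pairs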
......@@ -239,7 +239,7 @@ class GraphData(BaseData):
{('_V', '_E', '_V'): ([], [])})
for graph_id in graph_ids:
graphs.append(graphs_dict[graph_id])
data = {k: _tensor(v) for k, v in graph_data.data.items()}
data = {k: F.reshape(_tensor(v), (len(graphs), -1)) for k, v in graph_data.data.items()}
return graphs, data
......
......@@ -49,6 +49,11 @@ class GINDataset(DGLBuiltinDataset):
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
num_classes : int
Number of classes for multiclass classification
Examples
--------
>>> data = GINDataset(name='MUTAG', self_loop=False)
......@@ -362,3 +367,7 @@ class GINDataset(DGLBuiltinDataset):
if os.path.exists(graph_path) and os.path.exists(info_path):
return True
return False
@property
def num_classes(self):
return self.gclasses
......@@ -43,8 +43,10 @@ class QM7bDataset(DGLDataset):
Attributes
----------
num_tasks : int
Number of prediction tasks
num_labels : int
Number of labels for each graph, i.e. number of prediction tasks
(DEPRECATED, use num_tasks instead) Number of prediction tasks
Raises
------
......@@ -54,7 +56,7 @@ class QM7bDataset(DGLDataset):
Examples
--------
>>> data = QM7bDataset()
>>> data.num_labels
>>> data.num_tasks
14
>>>
>>> # iterate over the dataset
......@@ -117,9 +119,14 @@ class QM7bDataset(DGLDataset):
'The repo may be outdated or download may be incomplete. '
'Otherwise you can create an issue for it.'.format(self.name))
@property
def num_tasks(self):
"""Number of prediction tasks."""
return self.num_labels
@property
def num_labels(self):
"""Number of labels for each graph, i.e. number of prediction tasks."""
"""Number of prediction tasks."""
return 14
def __getitem__(self, idx):
......
......@@ -79,8 +79,10 @@ class QM9Dataset(DGLDataset):
Attributes
----------
num_tasks : int
Number of prediction tasks
num_labels : int
Number of labels for each graph, i.e. number of prediction tasks
(DEPRECATED, use num_tasks instead) Number of prediction tasks
Raises
------
......@@ -90,7 +92,7 @@ class QM9Dataset(DGLDataset):
Examples
--------
>>> data = QM9Dataset(label_keys=['mu', 'gap'], cutoff=5.0)
>>> data.num_labels
>>> data.num_tasks
2
>>>
>>> # iterate over the dataset
......@@ -143,7 +145,17 @@ class QM9Dataset(DGLDataset):
Returns
--------
int
Number of labels for each graph, i.e. number of prediction tasks.
Number of prediction tasks.
"""
return self.label.shape[1]
@property
def num_tasks(self):
r"""
Returns
--------
int
Number of prediction tasks.
"""
return self.label.shape[1]
......
......@@ -105,8 +105,10 @@ class QM9EdgeDataset(DGLDataset):
Attributes
----------
num_tasks : int
Number of prediction tasks
num_labels : int
Number of labels for each graph, i.e. number of prediction tasks
(DEPRECATED, use num_tasks instead) Number of prediction tasks
Raises
------
......@@ -116,7 +118,7 @@ class QM9EdgeDataset(DGLDataset):
Examples
--------
>>> data = QM9EdgeDataset(label_keys=['mu', 'alpha'])
>>> data.num_labels
>>> data.num_tasks
2
>>> # iterate over the dataset
......@@ -245,5 +247,15 @@ class QM9EdgeDataset(DGLDataset):
"""
return self.n_node.shape[0]
@property
def num_tasks(self):
r"""
Returns
-------
int
Number of prediction tasks
"""
return self.num_labels
QM9Edge = QM9EdgeDataset
\ No newline at end of file
......@@ -692,7 +692,7 @@ class BA2MotifDataset(DGLBuiltinDataset):
Attributes
----------
num_classes : int
Number of node classes
Number of graph classes
Examples
--------
......
......@@ -35,8 +35,10 @@ class LegacyTUDataset(DGLBuiltinDataset):
----------
max_num_node : int
Maximum number of nodes
num_labels : int
num_classes : int
Number of classes
num_labels : numpy.int64
(DEPRECATED, use num_classes instead) Number of classes
Notes
-----
......@@ -244,6 +246,10 @@ class LegacyTUDataset(DGLBuiltinDataset):
self.num_labels,\
self.max_num_node
@property
def num_classes(self):
return int(self.num_labels)
class TUDataset(DGLBuiltinDataset):
r"""
TUDataset contains lots of graph kernel datasets for graph classification.
......@@ -262,8 +268,10 @@ class TUDataset(DGLBuiltinDataset):
----------
max_num_node : int
Maximum number of nodes
num_labels : int
num_classes : int
Number of classes
num_labels : int
(DEPRECATED, use num_classes instead) Number of classes
Notes
-----
......@@ -329,7 +337,7 @@ class TUDataset(DGLBuiltinDataset):
if os.path.exists(self._file_path("graph_labels")):
DS_graph_labels = self._idx_reset(
loadtxt(self._file_path("graph_labels"), delimiter=",").astype(int))
self.num_labels = max(DS_graph_labels) + 1
self.num_labels = int(max(DS_graph_labels) + 1)
self.graph_labels = F.tensor(DS_graph_labels)
elif os.path.exists(self._file_path("graph_attributes")):
DS_graph_labels = loadtxt(self._file_path("graph_attributes"), delimiter=",").astype(float)
......@@ -442,3 +450,7 @@ class TUDataset(DGLBuiltinDataset):
return self.graph_lists[0].ndata['feat'].shape[1], \
self.num_labels, \
self.max_num_node
@property
def num_classes(self):
return self.num_labels
......@@ -40,6 +40,7 @@ def test_gin():
ds = data.GINDataset(name, self_loop=False, degree_as_nlabel=False, transform=transform)
g2 = ds[0][0]
assert g2.num_edges() - g1.num_edges() == g1.num_nodes()
assert ds.num_classes == ds.gclasses
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_fraud():
......@@ -83,6 +84,7 @@ def test_fakenews():
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_tudataset_regression():
ds = data.TUDataset('ZINC_test', force_reload=True)
assert ds.num_classes == ds.num_labels
assert len(ds) == 5000
g = ds[0][0]
......@@ -494,7 +496,7 @@ def _test_construct_graphs_multiple():
assert len(data_dict) == len(gdata)
for k, v in data_dict.items():
assert F.dtype(v) != F.float64
assert F.array_equal(F.tensor(gdata[k], dtype=F.dtype(v)), v)
assert F.array_equal(F.reshape(F.tensor(gdata[k], dtype=F.dtype(v)), (len(graphs), -1)), v)
for i, g in enumerate(graphs):
assert g.is_homogeneous
assert g.num_nodes() == num_nodes
......@@ -1102,7 +1104,7 @@ def _test_CSVDataset_customized_data_parser():
assert 'label' in csv_dataset.data
for i, (g, g_data) in enumerate(csv_dataset):
assert not g.is_homogeneous
assert F.asnumpy(g_data['label']) == label_gdata[i] + 2
assert F.asnumpy(g_data) == label_gdata[i] + 2
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
offset = 2 if ntype == 'user' else 0
......@@ -1122,7 +1124,7 @@ def _test_CSVDataset_customized_data_parser():
assert 'label' in csv_dataset.data
for i, (g, g_data) in enumerate(csv_dataset):
assert not g.is_homogeneous
assert F.asnumpy(g_data['label']) == label_gdata[i] + 2
assert F.asnumpy(g_data) == label_gdata[i] + 2
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
offset = 2
......@@ -1384,6 +1386,141 @@ def test_as_nodepred_csvdataset():
assert 'label' in new_ds[0].ndata
assert 'train_mask' in new_ds[0].ndata
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_as_graphpred():
ds = data.GINDataset(name='MUTAG', self_loop=True)
new_ds = data.AsGraphPredDataset(ds, [0.8, 0.1, 0.1], verbose=True)
assert len(new_ds) == 188
assert new_ds.num_tasks == 1
assert new_ds.num_classes == 2
ds = data.FakeNewsDataset('politifact', 'profile')
new_ds = data.AsGraphPredDataset(ds, verbose=True)
assert len(new_ds) == 314
assert new_ds.num_tasks == 1
assert new_ds.num_classes == 2
ds = data.QM7bDataset()
new_ds = data.AsGraphPredDataset(ds, [0.8, 0.1, 0.1], verbose=True)
assert len(new_ds) == 7211
assert new_ds.num_tasks == 14
assert new_ds.num_classes is None
ds = data.QM9Dataset(label_keys=['mu', 'gap'])
new_ds = data.AsGraphPredDataset(ds, [0.8, 0.1, 0.1], verbose=True)
assert len(new_ds) == 130831
assert new_ds.num_tasks == 2
assert new_ds.num_classes is None
ds = data.QM9EdgeDataset(label_keys=['mu', 'alpha'])
new_ds = data.AsGraphPredDataset(ds, [0.8, 0.1, 0.1], verbose=True)
assert len(new_ds) == 130831
assert new_ds.num_tasks == 2
assert new_ds.num_classes is None
ds = data.TUDataset('DD')
new_ds = data.AsGraphPredDataset(ds, [0.8, 0.1, 0.1], verbose=True)
assert len(new_ds) == 1178
assert new_ds.num_tasks == 1
assert new_ds.num_classes == 2
ds = data.LegacyTUDataset('DD')
new_ds = data.AsGraphPredDataset(ds, [0.8, 0.1, 0.1], verbose=True)
assert len(new_ds) == 1178
assert new_ds.num_tasks == 1
assert new_ds.num_classes == 2
ds = data.BA2MotifDataset()
new_ds = data.AsGraphPredDataset(ds, [0.8, 0.1, 0.1], verbose=True)
assert len(new_ds) == 1000
assert new_ds.num_tasks == 1
assert new_ds.num_classes == 2
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_as_graphpred_reprocess():
ds = data.AsGraphPredDataset(data.GINDataset(name='MUTAG', self_loop=True), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# read from cache
ds = data.AsGraphPredDataset(data.GINDataset(name='MUTAG', self_loop=True), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# invalid cache, re-read
ds = data.AsGraphPredDataset(data.GINDataset(name='MUTAG', self_loop=True), [0.1, 0.1, 0.8])
assert len(ds.train_idx) == int(len(ds) * 0.1)
ds = data.AsGraphPredDataset(data.FakeNewsDataset('politifact', 'profile'), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# read from cache
ds = data.AsGraphPredDataset(data.FakeNewsDataset('politifact', 'profile'), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# invalid cache, re-read
ds = data.AsGraphPredDataset(data.FakeNewsDataset('politifact', 'profile'), [0.1, 0.1, 0.8])
assert len(ds.train_idx) == int(len(ds) * 0.1)
ds = data.AsGraphPredDataset(data.QM7bDataset(), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# read from cache
ds = data.AsGraphPredDataset(data.QM7bDataset(), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# invalid cache, re-read
ds = data.AsGraphPredDataset(data.QM7bDataset(), [0.1, 0.1, 0.8])
assert len(ds.train_idx) == int(len(ds) * 0.1)
ds = data.AsGraphPredDataset(data.QM9Dataset(label_keys=['mu', 'gap']), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# read from cache
ds = data.AsGraphPredDataset(data.QM9Dataset(label_keys=['mu', 'gap']), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# invalid cache, re-read
ds = data.AsGraphPredDataset(data.QM9Dataset(label_keys=['mu', 'gap']), [0.1, 0.1, 0.8])
assert len(ds.train_idx) == int(len(ds) * 0.1)
ds = data.AsGraphPredDataset(data.QM9EdgeDataset(label_keys=['mu', 'alpha']), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# read from cache
ds = data.AsGraphPredDataset(data.QM9EdgeDataset(label_keys=['mu', 'alpha']), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# invalid cache, re-read
ds = data.AsGraphPredDataset(data.QM9EdgeDataset(label_keys=['mu', 'alpha']), [0.1, 0.1, 0.8])
assert len(ds.train_idx) == int(len(ds) * 0.1)
ds = data.AsGraphPredDataset(data.TUDataset('DD'), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# read from cache
ds = data.AsGraphPredDataset(data.TUDataset('DD'), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# invalid cache, re-read
ds = data.AsGraphPredDataset(data.TUDataset('DD'), [0.1, 0.1, 0.8])
assert len(ds.train_idx) == int(len(ds) * 0.1)
ds = data.AsGraphPredDataset(data.LegacyTUDataset('DD'), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# read from cache
ds = data.AsGraphPredDataset(data.LegacyTUDataset('DD'), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# invalid cache, re-read
ds = data.AsGraphPredDataset(data.LegacyTUDataset('DD'), [0.1, 0.1, 0.8])
assert len(ds.train_idx) == int(len(ds) * 0.1)
ds = data.AsGraphPredDataset(data.BA2MotifDataset(), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# read from cache
ds = data.AsGraphPredDataset(data.BA2MotifDataset(), [0.8, 0.1, 0.1])
assert len(ds.train_idx) == int(len(ds) * 0.8)
# invalid cache, re-read
ds = data.AsGraphPredDataset(data.BA2MotifDataset(), [0.1, 0.1, 0.8])
assert len(ds.train_idx) == int(len(ds) * 0.1)
@unittest.skipIf(dgl.backend.backend_name != 'pytorch', reason="ogb only supports pytorch")
def test_as_graphpred_ogb():
from ogb.graphproppred import DglGraphPropPredDataset
ds = data.AsGraphPredDataset(DglGraphPropPredDataset('ogbg-molhiv'),
split_ratio=None, verbose=True)
assert len(ds.train_idx) == 32901
# force generate new split
ds = data.AsGraphPredDataset(DglGraphPropPredDataset('ogbg-molhiv'),
split_ratio=[0.6, 0.2, 0.2], verbose=True)
assert len(ds.train_idx) == 24676
if __name__ == '__main__':
test_minigc()
test_gin()
......