Unverified Commit dc78e11c authored by Jinjing Zhou, committed by GitHub

[Dataset] Adapter to convert a dataset for link prediction task (#3699)



* add ut

* add doc link

* install dep

* add

* Revert "install dep"

This reverts commit e574a8377144749056c6849b655004df2771e179.

* add

* merge fix

* rm files

* fix

* fix

* fix

* fix

* fix typo

* fix tf

* fix

* fix

* fix

* fix

* fix

* fix dependency

* fix test

* fix

* fix

* add doc

* fix

* fix

* fix test

* fix test
Co-authored-by: default avatarMinjie Wang <wmjlyjemaine@gmail.com>
parent bc8f8b0b
......@@ -234,6 +234,10 @@ Dataset adapters
:members: __getitem__, __len__
.. autoclass:: AsLinkPredDataset
:members: __getitem__, __len__
Utilities
-----------------
......
......@@ -30,7 +30,7 @@ from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
from .fakenews import FakeNewsDataset
from .csv_dataset import DGLCSVDataset
from .adapter import AsNodePredDataset
from .adapter import AsNodePredDataset, AsLinkPredDataset
def register_data_args(parser):
parser.add_argument(
......
......@@ -2,12 +2,17 @@
import os
import json
import numpy as np
from .. import backend as F
from ..convert import graph as create_dgl_graph
from ..sampling.negative import _calc_redundancy
from .dgl_dataset import DGLDataset
from . import utils
__all__ = ['AsNodePredDataset']
__all__ = ['AsNodePredDataset', 'AsLinkPredDataset']
class AsNodePredDataset(DGLDataset):
"""Repurpose a dataset for a standard semi-supervised transductive
......@@ -61,32 +66,43 @@ class AsNodePredDataset(DGLDataset):
>>> print('train_mask' in new_ds[0].ndata)
True
"""
def __init__(self,
dataset,
split_ratio=[0.8, 0.1, 0.1],
split_ratio=None,
target_ntype=None,
**kwargs):
self.g = dataset[0].clone()
self.split_ratio = split_ratio
self.target_ntype = target_ntype
self.num_classes = getattr(dataset, 'num_classes', None)
super().__init__(dataset.name + '-as-nodepred', **kwargs)
super().__init__(dataset.name + '-as-nodepred',
hash_key=(split_ratio, target_ntype), **kwargs)
def process(self):
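        # Two modes: with split_ratio=None, reuse the train/val/test masks
        # shipped with the dataset; otherwise generate fresh masks from the ratio.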
if 'label' not in self.g.nodes[self.target_ntype].data:
raise ValueError("Missing node labels. Make sure labels are stored "
"under name 'label'.")
if self.split_ratio is None:
assert "train_mask" in self.g.nodes[self.target_ntype].data, \
"train_mask is not provided, please specify split_ratio to generate the masks"
assert "val_mask" in self.g.nodes[self.target_ntype].data, \
"val_mask is not provided, please specify split_ratio to generate the masks"
assert "test_mask" in self.g.nodes[self.target_ntype].data, \
"test_mask is not provided, please specify split_ratio to generate the masks"
else:
if self.verbose:
print('Generating train/val/test masks...')
utils.add_nodepred_split(self, self.split_ratio, self.target_ntype)
if self.num_classes is None:
self.num_classes = len(F.unique(self.g.nodes[self.target_ntype].data['label']))
def has_cache(self):
return os.path.isfile(os.path.join(self.save_path, 'graph.bin'))
return os.path.isfile(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
def load(self):
with open(os.path.join(self.save_path, 'info.json'), 'r') as f:
with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
info = json.load(f)
if (info['split_ratio'] != self.split_ratio
or info['target_ntype'] != self.target_ntype):
......@@ -95,16 +111,200 @@ class AsNodePredDataset(DGLDataset):
self.split_ratio = info['split_ratio']
self.target_ntype = info['target_ntype']
self.num_classes = info['num_classes']
gs, _ = utils.load_graphs(os.path.join(self.save_path, 'graph.bin'))
gs, _ = utils.load_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
self.g = gs[0]
def save(self):
utils.save_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)), [self.g])
with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
json.dump({
'split_ratio': self.split_ratio,
'target_ntype': self.target_ntype,
'num_classes': self.num_classes}, f)
def __getitem__(self, idx):
return self.g
def __len__(self):
return 1
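A minimal usage sketch of the two split modes handled in process() above, assuming a dataset such as CoraGraphDataset that already ships train/val/test masks:

    import dgl.data as data

    # Mode 1: generate masks from an explicit split ratio.
    ds = data.AsNodePredDataset(data.CoraGraphDataset(), split_ratio=[0.8, 0.1, 0.1])

    # Mode 2: split_ratio=None reuses the masks shipped with the dataset.
    ds = data.AsNodePredDataset(data.CoraGraphDataset())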
def negative_sample(g, num_samples):
    """Randomly sample negative edges from the graph, excluding self-loops
    and existing edges. The number of returned samples may be smaller
    than num_samples.
    """
num_nodes = g.num_nodes()
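    # Oversample by a redundancy factor so that enough candidates survive
    # de-duplication and the removal of self-loops and existing edges.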
redundancy = _calc_redundancy(
num_samples, g.num_edges(), num_nodes ** 2)
sample_size = int(num_samples*(1+redundancy))
edges = np.random.randint(0, num_nodes, size=(2, sample_size))
edges = np.unique(edges, axis=1)
# remove self loop
mask_self_loop = edges[0] == edges[1]
# remove existing edges
has_edges = F.asnumpy(g.has_edges_between(edges[0], edges[1]))
mask = ~(np.logical_or(mask_self_loop, has_edges))
edges = edges[:, mask]
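    # Truncate to the requested count; fewer edges may remain if the
    # filtering above removed too many candidates.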
if edges.shape[1] >= num_samples:
edges = edges[:, :num_samples]
return edges
class AsLinkPredDataset(DGLDataset):
    """Repurpose a dataset for the link prediction task.

    The created dataset contains the data needed for link prediction.
    Currently only homogeneous graphs are supported.

    It keeps only the first graph of the provided dataset and generates
    train/val/test edges according to the given split ratio, together with
    the corresponding negative edges based on neg_ratio. The generated edges
    are cached to disk for fast re-loading. If the provided split ratio
    differs from the cached one, the dataset is re-processed.
Parameters
----------
dataset : DGLDataset
The dataset to be converted.
    split_ratio : (float, float, float), optional
        Split ratios for the training, validation and test sets. Must sum to one.
    neg_ratio : int, optional
        How many negative samples to draw per positive edge. The number of
        negative samples will be neg_ratio * num_positive_edges.
    Attributes
    ----------
    feat_size: int
        The size of the node feature dimension of the graph
train_graph: DGLGraph
The DGLGraph for training
val_edges: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]
The validation set edges, encoded as
((positive_edge_src, positive_edge_dst), (negative_edge_src, negative_edge_dst))
test_edges: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]
The test set edges, encoded as
((positive_edge_src, positive_edge_dst), (negative_edge_src, negative_edge_dst))
Examples
--------
>>> ds = dgl.data.CoraGraphDataset()
>>> print(ds)
Dataset("cora_v2", num_graphs=1, save_path=...)
>>> new_ds = dgl.data.AsLinkPredDataset(ds, [0.8, 0.1, 0.1])
>>> print(new_ds)
Dataset("cora_v2-as-edgepred", num_graphs=1, save_path=/home/ubuntu/.dgl/cora_v2-as-edgepred)
>>> print(hasattr(new_ds, "test_edges"))
True
"""
def __init__(self,
dataset,
split_ratio=None,
neg_ratio=3,
**kwargs):
self.g = dataset[0]
self.num_nodes = self.g.num_nodes()
self.dataset = dataset
self.split_ratio = split_ratio
self.neg_ratio = neg_ratio
super().__init__(dataset.name + '-as-edgepred',
hash_key=(neg_ratio, split_ratio), **kwargs)
def process(self):
if self.split_ratio is None:
            assert hasattr(self.dataset, "get_edge_split"), \
                "The dataset has no get_edge_split method; please specify split_ratio and neg_ratio to generate the split."
# This is likely to be an ogb dataset
self.edge_split = self.dataset.get_edge_split()
self._train_graph = self.g
pos_e_tensor, neg_e_tensor = self.edge_split["valid"][
"edge"], self.edge_split["valid"]["edge_neg"]
pos_e = (pos_e_tensor[:, 0], pos_e_tensor[:, 1])
neg_e = (neg_e_tensor[:, 0], neg_e_tensor[:, 1])
self._val_edges = pos_e, neg_e
pos_e_tensor, neg_e_tensor = self.edge_split["test"][
"edge"], self.edge_split["test"]["edge_neg"]
pos_e = (pos_e_tensor[:, 0], pos_e_tensor[:, 1])
neg_e = (neg_e_tensor[:, 0], neg_e_tensor[:, 1])
self._test_edges = pos_e, neg_e
else:
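            # No pre-defined split: randomly partition the existing edges into
            # train/val/test positives and sample negatives for val/test only.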
ratio = self.split_ratio
graph = self.dataset[0]
n = graph.num_edges()
src, dst = graph.edges()
src, dst = F.asnumpy(src), F.asnumpy(dst)
n_train, n_val, n_test = int(
n * ratio[0]), int(n * ratio[1]), int(n * ratio[2])
idx = np.random.permutation(n)
train_pos_idx = idx[:n_train]
val_pos_idx = idx[n_train:n_train+n_val]
test_pos_idx = idx[n_train+n_val:]
neg_src, neg_dst = negative_sample(
graph, self.neg_ratio*(n_val+n_test))
neg_n_val, neg_n_test = self.neg_ratio * n_val, self.neg_ratio * n_test
neg_val_src, neg_val_dst = neg_src[:neg_n_val], neg_dst[:neg_n_val]
neg_test_src, neg_test_dst = neg_src[neg_n_val:], neg_dst[neg_n_val:]
self._val_edges = (F.tensor(src[val_pos_idx]), F.tensor(dst[val_pos_idx])
), (F.tensor(neg_val_src), F.tensor(neg_val_dst))
self._test_edges = (F.tensor(src[test_pos_idx]),
F.tensor(dst[test_pos_idx])), (F.tensor(neg_test_src), F.tensor(neg_test_dst))
self._train_graph = create_dgl_graph(
(src[train_pos_idx], dst[train_pos_idx]), num_nodes=self.num_nodes)
self._train_graph.ndata["feat"] = graph.ndata["feat"]
def has_cache(self):
return os.path.isfile(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
def load(self):
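        # Restore the cached training graph and the positive/negative
        # edge tensors stored under this hash key.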
gs, tensor_dict = utils.load_graphs(
os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
self.g = gs[0]
self._train_graph = self.g
self._val_edges = (tensor_dict["val_pos_src"], tensor_dict["val_pos_dst"]), (
tensor_dict["val_neg_src"], tensor_dict["val_neg_dst"])
self._test_edges = (tensor_dict["test_pos_src"], tensor_dict["test_pos_dst"]), (
tensor_dict["test_neg_src"], tensor_dict["test_neg_dst"])
with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
info = json.load(f)
self.split_ratio = info["split_ratio"]
self.neg_ratio = info["neg_ratio"]
    def save(self):
        tensor_dict = {
            "val_pos_src": self._val_edges[0][0],
            "val_pos_dst": self._val_edges[0][1],
            "val_neg_src": self._val_edges[1][0],
            "val_neg_dst": self._val_edges[1][1],
            "test_pos_src": self._test_edges[0][0],
            "test_pos_dst": self._test_edges[0][1],
            "test_neg_src": self._test_edges[1][0],
            "test_neg_dst": self._test_edges[1][1],
        }
        utils.save_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)),
                          [self._train_graph], tensor_dict)
        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
            json.dump({
                'split_ratio': self.split_ratio,
                'neg_ratio': self.neg_ratio}, f)
@property
def feat_size(self):
return self._train_graph.ndata["feat"].shape[-1]
@property
def train_graph(self):
return self._train_graph
@property
def val_edges(self):
return self._val_edges
@property
def test_edges(self):
return self._test_edges
def __getitem__(self, idx):
return self.g
......
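For context, a minimal usage sketch of the new adapter (names as defined in adapter.py above; the GNN encoder and training loop are omitted):

    import dgl.data as data

    ds = data.AsLinkPredDataset(data.CoraGraphDataset(),
                                split_ratio=[0.8, 0.1, 0.1], neg_ratio=3)
    train_g = ds.train_graph            # graph with training-positive edges only
    val_pos, val_neg = ds.val_edges     # ((src, dst), (src, dst)) tensor pairs
    test_pos, test_neg = ds.test_edges
    in_feats = ds.feat_size             # node feature dimension for the encoder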
......@@ -9,7 +9,7 @@ import yaml
import pytest
import dgl.data as data
from dgl import DGLError
import dgl
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_minigc():
......@@ -1067,6 +1067,33 @@ def test_as_nodepred2():
ds = data.AsNodePredDataset(data.AIFBDataset(), [0.1, 0.1, 0.8], 'Personen', verbose=True)
assert F.sum(F.astype(ds[0].nodes['Personen'].data['train_mask'], F.int32), 0) == int(ds[0].num_nodes('Personen') * 0.1)
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_as_linkpred():
# create
ds = data.AsLinkPredDataset(data.CoraGraphDataset(), split_ratio=[0.8, 0.1, 0.1], neg_ratio=1, verbose=True)
    # Cora has 10556 edges; the test split takes the remainder: 10556 - 8444 - 1055 = 1057 edges
    assert ds.test_edges[0][0].shape[0] == 1057
    # negative sampling may return fewer edges than requested, so the assert uses a relaxed range
    assert 1000 <= ds.test_edges[1][0].shape[0] <= 1057
    # different split_ratio and neg_ratio hash to a new cache entry, so a new split is generated
    ds = data.AsLinkPredDataset(data.CoraGraphDataset(), split_ratio=[0.7, 0.1, 0.2], neg_ratio=2, verbose=True)
    assert ds.test_edges[0][0].shape[0] == 2112
    # negative sampling is not guaranteed to hit the exact ratio of 2, so the assert uses a relaxed range
    assert 4000 < ds.test_edges[1][0].shape[0] <= 4224
@unittest.skipIf(dgl.backend.backend_name != 'pytorch', reason="ogb only supports pytorch")
def test_as_linkpred_ogb():
from ogb.linkproppred import DglLinkPropPredDataset
ds = data.AsLinkPredDataset(DglLinkPropPredDataset("ogbl-collab"), split_ratio=None, verbose=True)
# original dataset has 46329 test edges
assert ds.test_edges[0][0].shape[0] == 46329
# force generate new split
ds = data.AsLinkPredDataset(DglLinkPropPredDataset("ogbl-collab"), split_ratio=[0.7, 0.2, 0.1], verbose=True)
assert ds.test_edges[0][0].shape[0] == 235812
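For reference, the edge-split layout the adapter consumes in process() (key names exactly as used in the diff; shapes follow the ogb convention):

    from ogb.linkproppred import DglLinkPropPredDataset

    split = DglLinkPropPredDataset("ogbl-collab").get_edge_split()
    # split["valid"]["edge"]      positive validation edges, shape (N, 2)
    # split["valid"]["edge_neg"]  negative validation edges, shape (M, 2)
    # split["test"]["edge"] and split["test"]["edge_neg"] follow the same layout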
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_as_nodepred_csvdataset():
with tempfile.TemporaryDirectory() as test_dir:
......@@ -1103,7 +1130,7 @@ def test_as_nodepred_csvdataset():
assert 'label' in ds[0].ndata
assert 'train_mask' not in ds[0].ndata
assert not hasattr(ds[0], 'num_classes')
new_ds = data.AsNodePredDataset(ds, force_reload=True)
new_ds = data.AsNodePredDataset(ds, split_ratio=[0.8, 0.1, 0.1], force_reload=True)
assert new_ds.num_classes == num_classes
assert 'feat' in new_ds[0].ndata
assert 'label' in new_ds[0].ndata
......
......@@ -32,7 +32,7 @@ fi
conda activate ${DGLBACKEND}-ci
python3 -m pip install pytest pyyaml pandas pydantic rdflib || EXIT /B 1
python3 -m pip install pytest pyyaml pandas pydantic rdflib ogb || fail "pip install"
python3 -m pytest -v --junitxml=pytest_compute.xml tests/compute || fail "compute"
python3 -m pytest -v --junitxml=pytest_backend.xml tests/$DGLBACKEND || fail "backend-specific"
......