"git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "cae5d4d4ea43493670d038ec01c466bec81edf38"
Unverified commit dc78e11c authored by Jinjing Zhou, committed by GitHub

[Dataset] Adapter to convert a dataset for link prediction task (#3699)

* add ut
* add doc link
* install dep
* add
* Revert "install dep"

  This reverts commit e574a8377144749056c6849b655004df2771e179.

* add
* merge fix
* rm files
* fix
* fix
* fix
* fix
* fix typo
* fix tf
* fix
* fix
* fix
* fix
* fix
* fix dependency
* fix test
* fix
* fix
* add doc
* fix
* fix
* fix test
* fix test

Co-authored-by: Minjie Wang <wmjlyjemaine@gmail.com>
parent bc8f8b0b
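For orientation before the diff, here is a hedged usage sketch of the new adapter, pieced together from the tests added in this commit (the split and negative ratios are illustrative arguments, not defaults):

```python
import dgl.data as data

# Wrap a homogeneous-graph dataset for link prediction: AsLinkPredDataset
# keeps the first graph, splits its edges into train/val/test positives, and
# samples neg_ratio negatives per positive for the val/test sets.
ds = data.AsLinkPredDataset(data.CoraGraphDataset(),
                            split_ratio=[0.8, 0.1, 0.1],
                            neg_ratio=1)
train_g = ds.train_graph              # graph containing only training edges
val_pos, val_neg = ds.val_edges       # ((src, dst), (src, dst)) tuples
test_pos, test_neg = ds.test_edges
in_dim = ds.feat_size                 # node feature dimensionality
```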
@@ -234,6 +234,10 @@ Dataset adapters
     :members: __getitem__, __len__
 
+.. autoclass:: AsLinkPredDataset
+    :members: __getitem__, __len__
+
 Utilities
 -----------------
@@ -30,7 +30,7 @@ from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
 from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
 from .fakenews import FakeNewsDataset
 from .csv_dataset import DGLCSVDataset
-from .adapter import AsNodePredDataset
+from .adapter import AsNodePredDataset, AsLinkPredDataset
 
 def register_data_args(parser):
     parser.add_argument(
@@ -2,12 +2,17 @@
 import os
 import json
 
+import numpy as np
+
+from .. import backend as F
+from ..convert import graph as create_dgl_graph
+from ..sampling.negative import _calc_redundancy
 from .dgl_dataset import DGLDataset
 from . import utils
-from .. import backend as F
 
-__all__ = ['AsNodePredDataset']
+__all__ = ['AsNodePredDataset', 'AsLinkPredDataset']
 
 
 class AsNodePredDataset(DGLDataset):
     """Repurpose a dataset for a standard semi-supervised transductive
@@ -61,32 +66,43 @@ class AsNodePredDataset(DGLDataset):
     >>> print('train_mask' in new_ds[0].ndata)
     True
     """
 
     def __init__(self,
                  dataset,
-                 split_ratio=[0.8, 0.1, 0.1],
+                 split_ratio=None,
                  target_ntype=None,
                  **kwargs):
         self.g = dataset[0].clone()
         self.split_ratio = split_ratio
         self.target_ntype = target_ntype
         self.num_classes = getattr(dataset, 'num_classes', None)
-        super().__init__(dataset.name + '-as-nodepred', **kwargs)
+        super().__init__(dataset.name + '-as-nodepred',
+                         hash_key=(split_ratio, target_ntype), **kwargs)
 
     def process(self):
         if 'label' not in self.g.nodes[self.target_ntype].data:
             raise ValueError("Missing node labels. Make sure labels are stored "
                              "under name 'label'.")
-        if self.num_classes is None:
-            self.num_classes = len(F.unique(self.g.nodes[self.target_ntype].data['label']))
-        if self.verbose:
-            print('Generating train/val/test masks...')
-        utils.add_nodepred_split(self, self.split_ratio, self.target_ntype)
+        if self.split_ratio is None:
+            assert "train_mask" in self.g.nodes[self.target_ntype].data, \
+                "train_mask is not provided, please specify split_ratio to generate the masks"
+            assert "val_mask" in self.g.nodes[self.target_ntype].data, \
+                "val_mask is not provided, please specify split_ratio to generate the masks"
+            assert "test_mask" in self.g.nodes[self.target_ntype].data, \
+                "test_mask is not provided, please specify split_ratio to generate the masks"
+        else:
+            if self.verbose:
+                print('Generating train/val/test masks...')
+            utils.add_nodepred_split(self, self.split_ratio, self.target_ntype)
+        if self.num_classes is None:
+            self.num_classes = len(F.unique(self.g.nodes[self.target_ntype].data['label']))
 
     def has_cache(self):
-        return os.path.isfile(os.path.join(self.save_path, 'graph.bin'))
+        return os.path.isfile(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
 
     def load(self):
-        with open(os.path.join(self.save_path, 'info.json'), 'r') as f:
+        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
             info = json.load(f)
             if (info['split_ratio'] != self.split_ratio
                     or info['target_ntype'] != self.target_ntype):
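Functionally, `split_ratio` now defaults to `None`, meaning "reuse the masks the dataset already ships with"; an explicit ratio regenerates the masks and caches them under a filename derived from `hash_key`. A hedged sketch of the two modes (Cora ships with masks, so both calls should be valid):

```python
import dgl.data as data

# Mode 1: split_ratio=None -- process() asserts that train_mask/val_mask/
# test_mask already exist on the target node type.
ds = data.AsNodePredDataset(data.CoraGraphDataset())

# Mode 2: explicit ratios -- masks are regenerated and cached in a file
# named after hash_key=(split_ratio, target_ntype).
ds = data.AsNodePredDataset(data.CoraGraphDataset(), split_ratio=[0.8, 0.1, 0.1])
```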
@@ -95,16 +111,200 @@ class AsNodePredDataset(DGLDataset):
             self.split_ratio = info['split_ratio']
             self.target_ntype = info['target_ntype']
             self.num_classes = info['num_classes']
-        gs, _ = utils.load_graphs(os.path.join(self.save_path, 'graph.bin'))
+        gs, _ = utils.load_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
         self.g = gs[0]
 
     def save(self):
-        utils.save_graphs(os.path.join(self.save_path, 'graph.bin'), [self.g])
-        with open(os.path.join(self.save_path, 'info.json'), 'w') as f:
+        utils.save_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)), [self.g])
+        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
             json.dump({
-                'split_ratio' : self.split_ratio,
-                'target_ntype' : self.target_ntype,
-                'num_classes' : self.num_classes}, f)
+                'split_ratio': self.split_ratio,
+                'target_ntype': self.target_ntype,
+                'num_classes': self.num_classes}, f)
 
     def __getitem__(self, idx):
         return self.g
 
     def __len__(self):
         return 1
+
+
+def negative_sample(g, num_samples):
+    """Randomly sample negative edges from the graph, excluding self-loops.
+    The result may contain fewer than num_samples edges.
+    """
+    num_nodes = g.num_nodes()
+    redundancy = _calc_redundancy(
+        num_samples, g.num_edges(), num_nodes ** 2)
+    sample_size = int(num_samples * (1 + redundancy))
+    edges = np.random.randint(0, num_nodes, size=(2, sample_size))
+    edges = np.unique(edges, axis=1)
+    # remove self-loops
+    mask_self_loop = edges[0] == edges[1]
+    # remove existing edges
+    has_edges = F.asnumpy(g.has_edges_between(edges[0], edges[1]))
+    mask = ~(np.logical_or(mask_self_loop, has_edges))
+    edges = edges[:, mask]
+    if edges.shape[1] >= num_samples:
+        edges = edges[:, :num_samples]
+    return edges
+
+
+class AsLinkPredDataset(DGLDataset):
+    """Repurpose a dataset for the link prediction task.
+
+    The created dataset includes the data needed for link prediction.
+    Currently it only supports homogeneous graphs.
+
+    It keeps only the first graph of the provided dataset and generates
+    train/val/test edges according to the given split ratio, together with
+    the corresponding negative edges based on neg_ratio. The generated edges
+    are cached to disk for fast re-loading. If the provided split ratio
+    differs from the cached one, the dataset is re-processed accordingly.
+
+    Parameters
+    ----------
+    dataset : DGLDataset
+        The dataset to be converted.
+    split_ratio : (float, float, float), optional
+        Split ratios for training, validation and test sets. Must sum to one.
+    neg_ratio : int, optional
+        Indicates how many negative samples to draw. The number of negative
+        samples will be neg_ratio * num_positive_edges.
+
+    Attributes
+    ----------
+    feat_size : int
+        The size of the feature dimension in the graph.
+    train_graph : DGLGraph
+        The DGLGraph for training.
+    val_edges : Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]
+        The validation set edges, encoded as
+        ((positive_edge_src, positive_edge_dst), (negative_edge_src, negative_edge_dst)).
+    test_edges : Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]
+        The test set edges, encoded as
+        ((positive_edge_src, positive_edge_dst), (negative_edge_src, negative_edge_dst)).
+
+    Examples
+    --------
+    >>> ds = dgl.data.CoraGraphDataset()
+    >>> print(ds)
+    Dataset("cora_v2", num_graphs=1, save_path=...)
+    >>> new_ds = dgl.data.AsLinkPredDataset(ds, [0.8, 0.1, 0.1])
+    >>> print(new_ds)
+    Dataset("cora_v2-as-edgepred", num_graphs=1, save_path=/home/ubuntu/.dgl/cora_v2-as-edgepred)
+    >>> print(hasattr(new_ds, "test_edges"))
+    True
+    """
+
+    def __init__(self,
+                 dataset,
+                 split_ratio=None,
+                 neg_ratio=3,
+                 **kwargs):
+        self.g = dataset[0]
+        self.num_nodes = self.g.num_nodes()
+        self.dataset = dataset
+        self.split_ratio = split_ratio
+        self.neg_ratio = neg_ratio
+        super().__init__(dataset.name + '-as-edgepred',
+                         hash_key=(neg_ratio, split_ratio), **kwargs)
+
+    def process(self):
+        if self.split_ratio is None:
+            assert hasattr(self.dataset, "get_edge_split"), \
+                "dataset doesn't have get_edge_split method, please specify split_ratio and neg_ratio to generate the split"
+            # This is likely to be an ogb dataset
+            self.edge_split = self.dataset.get_edge_split()
+            self._train_graph = self.g
+            pos_e_tensor, neg_e_tensor = self.edge_split["valid"][
+                "edge"], self.edge_split["valid"]["edge_neg"]
+            pos_e = (pos_e_tensor[:, 0], pos_e_tensor[:, 1])
+            neg_e = (neg_e_tensor[:, 0], neg_e_tensor[:, 1])
+            self._val_edges = pos_e, neg_e
+            pos_e_tensor, neg_e_tensor = self.edge_split["test"][
+                "edge"], self.edge_split["test"]["edge_neg"]
+            pos_e = (pos_e_tensor[:, 0], pos_e_tensor[:, 1])
+            neg_e = (neg_e_tensor[:, 0], neg_e_tensor[:, 1])
+            self._test_edges = pos_e, neg_e
+        else:
+            ratio = self.split_ratio
+            graph = self.dataset[0]
+            n = graph.num_edges()
+            src, dst = graph.edges()
+            src, dst = F.asnumpy(src), F.asnumpy(dst)
+            n_train, n_val, n_test = int(
+                n * ratio[0]), int(n * ratio[1]), int(n * ratio[2])
+            idx = np.random.permutation(n)
+            train_pos_idx = idx[:n_train]
+            val_pos_idx = idx[n_train:n_train + n_val]
+            test_pos_idx = idx[n_train + n_val:]
+            neg_src, neg_dst = negative_sample(
+                graph, self.neg_ratio * (n_val + n_test))
+            neg_n_val, neg_n_test = self.neg_ratio * n_val, self.neg_ratio * n_test
+            neg_val_src, neg_val_dst = neg_src[:neg_n_val], neg_dst[:neg_n_val]
+            neg_test_src, neg_test_dst = neg_src[neg_n_val:], neg_dst[neg_n_val:]
+            self._val_edges = (F.tensor(src[val_pos_idx]), F.tensor(dst[val_pos_idx])
+                               ), (F.tensor(neg_val_src), F.tensor(neg_val_dst))
+            self._test_edges = (F.tensor(src[test_pos_idx]),
+                                F.tensor(dst[test_pos_idx])), (F.tensor(neg_test_src), F.tensor(neg_test_dst))
+            self._train_graph = create_dgl_graph(
+                (src[train_pos_idx], dst[train_pos_idx]), num_nodes=self.num_nodes)
+            self._train_graph.ndata["feat"] = graph.ndata["feat"]
+
+    def has_cache(self):
+        return os.path.isfile(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
+
+    def load(self):
+        gs, tensor_dict = utils.load_graphs(
+            os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
+        self.g = gs[0]
+        self._train_graph = self.g
+        self._val_edges = (tensor_dict["val_pos_src"], tensor_dict["val_pos_dst"]), (
+            tensor_dict["val_neg_src"], tensor_dict["val_neg_dst"])
+        self._test_edges = (tensor_dict["test_pos_src"], tensor_dict["test_pos_dst"]), (
+            tensor_dict["test_neg_src"], tensor_dict["test_neg_dst"])
+        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
+            info = json.load(f)
+            self.split_ratio = info["split_ratio"]
+            self.neg_ratio = info["neg_ratio"]
+
+    def save(self):
+        tensor_dict = {
+            "val_pos_src": self._val_edges[0][0],
+            "val_pos_dst": self._val_edges[0][1],
+            "val_neg_src": self._val_edges[1][0],
+            "val_neg_dst": self._val_edges[1][1],
+            "test_pos_src": self._test_edges[0][0],
+            "test_pos_dst": self._test_edges[0][1],
+            "test_neg_src": self._test_edges[1][0],
+            "test_neg_dst": self._test_edges[1][1],
+        }
+        utils.save_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)),
+                          [self._train_graph], tensor_dict)
+        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
+            json.dump({
+                'split_ratio': self.split_ratio,
+                'neg_ratio': self.neg_ratio}, f)
+
+    @property
+    def feat_size(self):
+        return self._train_graph.ndata["feat"].shape[-1]
+
+    @property
+    def train_graph(self):
+        return self._train_graph
+
+    @property
+    def val_edges(self):
+        return self._val_edges
+
+    @property
+    def test_edges(self):
+        return self._test_edges
+
+    def __getitem__(self, idx):
+        return self.g
+
+    def __len__(self):
+        return 1
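The `negative_sample` helper above oversamples uniformly at random and then filters, so the result can fall short of the request; that is why the tests below assert relaxed ranges. A self-contained toy version of the same idea (hypothetical code, not DGL API; DGL derives the oversampling factor from graph density via `_calc_redundancy`, while here it is a fixed constant):

```python
import numpy as np

def sample_non_edges(num_nodes, existing, num_samples, oversample=0.5):
    """existing: a set of (src, dst) pairs treated as positive edges."""
    size = int(num_samples * (1 + oversample))
    cand = np.random.randint(0, num_nodes, size=(2, size))
    cand = np.unique(cand, axis=1)             # drop duplicate pairs
    keep = cand[0] != cand[1]                  # drop self-loops
    is_edge = np.fromiter(((s, d) in existing for s, d in cand.T),
                          dtype=bool, count=cand.shape[1])
    cand = cand[:, keep & ~is_edge]            # drop known positive edges
    return cand[:, :num_samples]               # may return fewer than asked

neg = sample_non_edges(4, {(0, 1), (1, 2)}, num_samples=5)
```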
@@ -9,7 +9,7 @@ import yaml
 import pytest
 import dgl.data as data
 from dgl import DGLError
+import dgl
 
 @unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
 def test_minigc():
@@ -1067,6 +1067,33 @@ def test_as_nodepred2():
     ds = data.AsNodePredDataset(data.AIFBDataset(), [0.1, 0.1, 0.8], 'Personen', verbose=True)
     assert F.sum(F.astype(ds[0].nodes['Personen'].data['train_mask'], F.int32), 0) == int(ds[0].num_nodes('Personen') * 0.1)
+
+@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
+def test_as_linkpred():
+    # create
+    ds = data.AsLinkPredDataset(data.CoraGraphDataset(), split_ratio=[0.8, 0.1, 0.1], neg_ratio=1, verbose=True)
+    # Cora has 10556 edges; with integer splits the remainder lands in the
+    # test set, so 10% test edges comes out to 1057
+    assert ds.test_edges[0][0].shape[0] == 1057
+    # the negative sample count is not guaranteed, so the assert uses a relaxed range
+    assert 1000 <= ds.test_edges[1][0].shape[0] <= 1057
+    # different split_ratio and neg_ratio, hence a new hash and a fresh split
+    ds = data.AsLinkPredDataset(data.CoraGraphDataset(), split_ratio=[0.7, 0.1, 0.2], neg_ratio=2, verbose=True)
+    assert ds.test_edges[0][0].shape[0] == 2112
+    # negative samples are not guaranteed to reach ratio 2, so the assert uses a relaxed range
+    assert 4000 < ds.test_edges[1][0].shape[0] <= 4224
+
+@unittest.skipIf(dgl.backend.backend_name != 'pytorch', reason="ogb only supports pytorch")
+def test_as_linkpred_ogb():
+    from ogb.linkproppred import DglLinkPropPredDataset
+    ds = data.AsLinkPredDataset(DglLinkPropPredDataset("ogbl-collab"), split_ratio=None, verbose=True)
+    # the original dataset has 46329 test edges
+    assert ds.test_edges[0][0].shape[0] == 46329
+    # force generating a new split
+    ds = data.AsLinkPredDataset(DglLinkPropPredDataset("ogbl-collab"), split_ratio=[0.7, 0.2, 0.1], verbose=True)
+    assert ds.test_edges[0][0].shape[0] == 235812
 
 @unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
 def test_as_nodepred_csvdataset():
     with tempfile.TemporaryDirectory() as test_dir:
@@ -1103,7 +1130,7 @@ def test_as_nodepred_csvdataset():
         assert 'label' in ds[0].ndata
         assert 'train_mask' not in ds[0].ndata
         assert not hasattr(ds[0], 'num_classes')
-        new_ds = data.AsNodePredDataset(ds, force_reload=True)
+        new_ds = data.AsNodePredDataset(ds, split_ratio=[0.8, 0.1, 0.1], force_reload=True)
         assert new_ds.num_classes == num_classes
         assert 'feat' in new_ds[0].ndata
         assert 'label' in new_ds[0].ndata
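The OGB test path relies only on the dataset exposing `get_edge_split()`; judging from the indexing in `process()`, each split entry is an (N, 2) tensor of (src, dst) pairs. A hedged mock of that contract (tensor values invented for illustration):

```python
import torch

# Hypothetical stand-in for the OGB-style split consumed when split_ratio=None.
edge_split = {
    "valid": {"edge": torch.tensor([[0, 1], [2, 3]]),       # positive pairs
              "edge_neg": torch.tensor([[0, 3], [1, 2]])},  # negative pairs
    "test": {"edge": torch.tensor([[4, 5]]),
             "edge_neg": torch.tensor([[5, 0]])},
}
# process() slices each tensor into (src, dst) column tuples:
pos_e = (edge_split["valid"]["edge"][:, 0], edge_split["valid"]["edge"][:, 1])
```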
@@ -32,7 +32,7 @@ fi
 conda activate ${DGLBACKEND}-ci
 
-python3 -m pip install pytest pyyaml pandas pydantic rdflib || EXIT /B 1
+python3 -m pip install pytest pyyaml pandas pydantic rdflib ogb || fail "pip install"
 
 python3 -m pytest -v --junitxml=pytest_compute.xml tests/compute || fail "compute"
 python3 -m pytest -v --junitxml=pytest_backend.xml tests/$DGLBACKEND || fail "backend-specific"