Unverified Commit 54b4bd0a authored by Gleb Bazhenov, committed by GitHub

[Data] new functionality for creating data splits in graphs (#5418)



* new functionality for creating data splits in graphs

* minor fix in data split implementation

* apply suggestions from code review
Co-authored-by: Mufei Li <mufeili1996@gmail.com>

* refactoring + unit tests

* fix test file name

* move imports to the top

* Revert "fix test file name"

This reverts commit 126323e38c8de6fdc73c826dccc9048aed8f9634.

* remove nccl submodule

* address linter issues

---------
Co-authored-by: Mufei Li <mufeili1996@gmail.com>
Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>
parent a03dec05
@@ -123,4 +123,6 @@ Utilities
   utils.save_info
   utils.load_info
   utils.add_nodepred_split
   utils.mask_nodes_by_property
   utils.add_node_property_split
   utils.Subset
@@ -8,6 +8,8 @@ import pickle
import sys
import warnings
import networkx as nx
import networkx.algorithms as A
import numpy as np
import requests
@@ -29,6 +31,8 @@ __all__ = [
"save_tensors", "save_tensors",
"load_tensors", "load_tensors",
"add_nodepred_split", "add_nodepred_split",
"add_node_property_split",
"mask_nodes_by_property",
] ]
@@ -482,3 +486,191 @@ def add_nodepred_split(dataset, ratio, ntype=None):
    g.nodes[ntype].data["train_mask"] = train_mask
    g.nodes[ntype].data["val_mask"] = val_mask
    g.nodes[ntype].data["test_mask"] = test_mask


def mask_nodes_by_property(property_values, part_ratios, random_seed=None):
    """Provide the split masks for a node split with distributional shift based on a given
    node property, as proposed in `Evaluating Robustness and Uncertainty of Graph Models
    Under Structural Distributional Shifts <https://arxiv.org/abs/2302.13875v1>`__.

    It considers the in-distribution (ID) and out-of-distribution (OOD) subsets of nodes.
    The ID subset includes training, validation and testing parts, while the OOD subset
    includes validation and testing parts. It sorts the nodes in the ascending order of
    their property values, splits them into 5 non-intersecting parts, and creates 5
    associated node mask arrays:

    - 3 for the ID nodes: ``'in_train_mask'``, ``'in_valid_mask'``, ``'in_test_mask'``,
    - and 2 for the OOD nodes: ``'out_valid_mask'``, ``'out_test_mask'``.

    Parameters
    ----------
    property_values : numpy ndarray
        The node property (float) values by which the dataset will be split.
        The length of the array must be equal to the number of nodes in the graph.
    part_ratios : list
        A list of 5 ratios for the training, ID validation, ID testing,
        OOD validation and OOD testing parts. The values in the list must sum to one.
    random_seed : int, optional
        Random seed to fix for the initial permutation of nodes. It is
        used to create a random order for the nodes that have the same
        property values or belong to the ID subset. (default: None)

    Returns
    -------
    split_masks : dict
        A Python dict storing the mask names as keys and the corresponding
        node mask arrays as values.

    Examples
    --------
    >>> num_nodes = 1000
    >>> property_values = np.random.uniform(size=num_nodes)
    >>> part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    >>> split_masks = dgl.data.utils.mask_nodes_by_property(property_values, part_ratios)
    >>> print('in_valid_mask' in split_masks)
    True
    """
    num_nodes = len(property_values)
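    # Convert the ratios into integer part sizes and absorb any rounding
    # drift into the last (OOD test) part so that the sizes sum to num_nodes.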
    part_sizes = np.round(num_nodes * np.array(part_ratios)).astype(int)
    part_sizes[-1] -= np.sum(part_sizes) - num_nodes
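    # Start from a seeded random permutation so that nodes with equal
    # property values end up in a random relative order after sorting.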
    generator = np.random.RandomState(random_seed)
    permutation = generator.permutation(num_nodes)
    node_indices = np.arange(num_nodes)[permutation]
    property_values = property_values[permutation]
    in_distribution_size = np.sum(part_sizes[:3])
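    # Order nodes by property value, then re-shuffle the first three (ID)
    # parts so that the ID train/valid/test assignment is random rather
    # than ordered by the property.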
    node_indices_ordered = node_indices[np.argsort(property_values)]
    node_indices_ordered[:in_distribution_size] = generator.permutation(
        node_indices_ordered[:in_distribution_size]
    )
    sections = np.cumsum(part_sizes)
    node_split = np.split(node_indices_ordered, sections)[:-1]
    mask_names = [
        "in_train_mask",
        "in_valid_mask",
        "in_test_mask",
        "out_valid_mask",
        "out_test_mask",
    ]
    split_masks = {}
    for mask_name, node_indices in zip(mask_names, node_split):
        split_mask = idx2mask(node_indices, num_nodes)
        split_masks[mask_name] = generate_mask_tensor(split_mask)
    return split_masks
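
# A quick sanity-check sketch (illustration only, not part of this patch):
# the five masks returned above are pairwise disjoint and together cover
# every node exactly once. Assuming a NumPy-compatible backend tensor:
#
#     >>> property_values = np.random.uniform(size=100)
#     >>> masks = mask_nodes_by_property(property_values, [0.3, 0.1, 0.1, 0.3, 0.2])
#     >>> total = sum(np.asarray(mask, dtype=int) for mask in masks.values())
#     >>> bool((total == 1).all())
#     True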


def add_node_property_split(
    dataset, part_ratios, property_name, ascending=True, random_seed=None
):
    """Create a node split with distributional shift based on a given node property,
    as proposed in `Evaluating Robustness and Uncertainty of Graph Models Under
    Structural Distributional Shifts <https://arxiv.org/abs/2302.13875v1>`__.

    It splits the nodes of each graph in the given dataset into 5 non-intersecting
    parts based on their structural properties. This can be used for transductive node
    prediction tasks with distributional shifts.

    It considers the in-distribution (ID) and out-of-distribution (OOD) subsets of nodes.
    The ID subset includes training, validation and testing parts, while the OOD subset
    includes validation and testing parts. As a result, it creates 5 associated node mask
    arrays for each graph:

    - 3 for the ID nodes: ``'in_train_mask'``, ``'in_valid_mask'``, ``'in_test_mask'``,
    - and 2 for the OOD nodes: ``'out_valid_mask'``, ``'out_test_mask'``.

    This function implements 3 particular strategies for inducing distributional shifts
    in a graph, based on **popularity**, **locality**, or **density**.

    Parameters
    ----------
    dataset : :class:`~DGLDataset` or list of :class:`~dgl.DGLGraph`
        The dataset to induce structural distributional shift on.
    part_ratios : list
        A list of 5 ratio values for the training, ID validation, ID testing,
        OOD validation and OOD testing parts. The values must sum to 1.0.
    property_name : str
        The name of the node property to be used, which must be
        ``'popularity'``, ``'locality'`` or ``'density'``.
    ascending : bool, optional
        Whether to sort nodes in the ascending order of the node property,
        so that nodes with greater values of the property are considered
        to be OOD. (default: True)
    random_seed : int, optional
        Random seed to fix for the initial permutation of nodes. It is
        used to create a random order for the nodes that have the same
        property values or belong to the ID subset. (default: None)

    Examples
    --------
    >>> dataset = dgl.data.AmazonCoBuyComputerDataset()
    >>> print('in_valid_mask' in dataset[0].ndata)
    False
    >>> part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    >>> property_name = 'popularity'
    >>> dgl.data.utils.add_node_property_split(dataset, part_ratios, property_name)
    >>> print('in_valid_mask' in dataset[0].ndata)
    True
    """
    assert property_name in [
        "popularity",
        "locality",
        "density",
    ], "The name of the property must be 'popularity', 'locality', or 'density'"
    assert len(part_ratios) == 5, "part_ratios must contain 5 values"

    for idx in range(len(dataset)):
        graph_dgl = dataset[idx]
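        # Convert to an undirected NetworkX graph, on which the structural
        # node properties are computed.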
        graph_nx = nx.Graph(graph_dgl.to_networkx())
        compute_property_fn = _property_name_to_compute_fn[property_name]
        property_values = compute_property_fn(graph_nx, ascending)
        node_masks = mask_nodes_by_property(
            property_values, part_ratios, random_seed
        )
        for mask_name, node_mask in node_masks.items():
            graph_dgl.ndata[mask_name] = node_mask


def _compute_popularity_property(graph_nx, ascending=True):
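    # Popularity is measured by the PageRank score; the sign set below
    # controls which end of the node ranking becomes OOD downstream.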
    direction = -1 if ascending else 1
    property_values = direction * np.array(list(A.pagerank(graph_nx).values()))
    return property_values


def _compute_locality_property(graph_nx, ascending=True):
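    # Locality is measured by personalized PageRank, with all restart
    # probability placed on the node with the highest standard PageRank score.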
    num_nodes = graph_nx.number_of_nodes()
    pagerank_values = np.array(list(A.pagerank(graph_nx).values()))
    personalization = dict(zip(range(num_nodes), [0.0] * num_nodes))
    personalization[np.argmax(pagerank_values)] = 1.0
    direction = -1 if ascending else 1
    property_values = direction * np.array(
        list(A.pagerank(graph_nx, personalization=personalization).values())
    )
    return property_values


def _compute_density_property(graph_nx, ascending=True):
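    # Density is measured by the local clustering coefficient of each node.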
    direction = -1 if ascending else 1
    property_values = direction * np.array(
        list(A.clustering(graph_nx).values())
    )
    return property_values


_property_name_to_compute_fn = {
    "popularity": _compute_popularity_property,
    "locality": _compute_locality_property,
    "density": _compute_density_property,
}
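
# A hypothetical extension sketch (not part of this patch): any function with
# the same ``(graph_nx, ascending)`` signature could back a custom property,
# e.g. a degree-based one. Note that the property-name assertion in
# ``add_node_property_split`` would also need to allow the new name.
#
#     def _compute_degree_property(graph_nx, ascending=True):
#         direction = -1 if ascending else 1
#         return direction * np.array([deg for _, deg in graph_nx.degree()])
#
#     _property_name_to_compute_fn["degree"] = _compute_degree_property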
@@ -408,38 +408,6 @@ def test_cluster():
    assert ds.num_classes == 6
@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_extract_archive():
    # gzip
    with tempfile.TemporaryDirectory() as src_dir:
        gz_file = "gz_archive"
        gz_path = os.path.join(src_dir, gz_file + ".gz")
        content = b"test extract archive gzip"
        with gzip.open(gz_path, "wb") as f:
            f.write(content)
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(gz_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, gz_file))
    # tar
    with tempfile.TemporaryDirectory() as src_dir:
        tar_file = "tar_archive"
        tar_path = os.path.join(src_dir, tar_file + ".tar")
        # str.encode() defaults to utf-8
        content = "test extract archive tar\n".encode()
        info = tarfile.TarInfo(name="tar_archive")
        info.size = len(content)
        with tarfile.open(tar_path, "w") as f:
            f.addfile(info, io.BytesIO(content))
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(tar_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, tar_file))
def _test_construct_graphs_node_ids():
    from dgl.data.csv_dataset_base import (
        DGLGraphConstructor,
@@ -1659,25 +1627,6 @@ def test_csvdataset():
    _test_CSVDataset_customized_data_parser()
@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_nodepred_split():
    dataset = data.AmazonCoBuyComputerDataset()
    print("train_mask" in dataset[0].ndata)
    data.utils.add_nodepred_split(dataset, [0.8, 0.1, 0.1])
    assert "train_mask" in dataset[0].ndata
    dataset = data.AIFBDataset()
    print("train_mask" in dataset[0].nodes["Publikationen"].data)
    data.utils.add_nodepred_split(
        dataset, [0.8, 0.1, 0.1], ntype="Publikationen"
    )
    assert "train_mask" in dataset[0].nodes["Publikationen"].data
@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
@@ -2094,9 +2043,7 @@ if __name__ == "__main__":
    test_tudataset_regression()
    test_fraud()
    test_fakenews()
    test_extract_archive()
    test_csvdataset()
    test_add_nodepred_split()
    test_as_nodepred1()
    test_as_nodepred2()
    test_as_nodepred_csvdataset()

import gzip
import io
import os
import tarfile
import tempfile
import unittest

import backend as F

import dgl
import dgl.data as data
import numpy as np
import pandas as pd
import pytest
import yaml
from dgl import DGLError

@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_nodepred_split():
    dataset = data.AmazonCoBuyComputerDataset()
    print("train_mask" in dataset[0].ndata)
    data.utils.add_nodepred_split(dataset, [0.8, 0.1, 0.1])
    assert "train_mask" in dataset[0].ndata
    dataset = data.AIFBDataset()
    print("train_mask" in dataset[0].nodes["Publikationen"].data)
    data.utils.add_nodepred_split(
        dataset, [0.8, 0.1, 0.1], ntype="Publikationen"
    )
    assert "train_mask" in dataset[0].nodes["Publikationen"].data

@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_extract_archive():
    # gzip
    with tempfile.TemporaryDirectory() as src_dir:
        gz_file = "gz_archive"
        gz_path = os.path.join(src_dir, gz_file + ".gz")
        content = b"test extract archive gzip"
        with gzip.open(gz_path, "wb") as f:
            f.write(content)
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(gz_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, gz_file))
    # tar
    with tempfile.TemporaryDirectory() as src_dir:
        tar_file = "tar_archive"
        tar_path = os.path.join(src_dir, tar_file + ".tar")
        # str.encode() defaults to utf-8
        content = "test extract archive tar\n".encode()
        info = tarfile.TarInfo(name="tar_archive")
        info.size = len(content)
        with tarfile.open(tar_path, "w") as f:
            f.addfile(info, io.BytesIO(content))
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(tar_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, tar_file))

@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_mask_nodes_by_property():
    num_nodes = 1000
    property_values = np.random.uniform(size=num_nodes)
    part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    split_masks = data.utils.mask_nodes_by_property(
        property_values, part_ratios
    )
    assert "in_valid_mask" in split_masks

@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_node_property_split():
    dataset = data.AmazonCoBuyComputerDataset()
    part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    for property_name in ["popularity", "locality", "density"]:
        data.utils.add_node_property_split(dataset, part_ratios, property_name)
        assert "in_valid_mask" in dataset[0].ndata

if __name__ == "__main__":
    test_extract_archive()
    test_add_nodepred_split()
    test_mask_nodes_by_property()
    test_add_node_property_split()