Unverified Commit 90a308f3 authored by Rhett Ying, committed by GitHub

[GraphBolt] split OnDiskDataset into separate file (#5963)

parent dc90ea16
@@ -12,6 +12,7 @@ from .feature_store import *
from .feature_fetcher import *
from .copy_to import *
from .dataset import *
from .impl import *
from .dataloader import *
from .subgraph_sampler import *
"""GraphBolt Dataset."""
from typing import List, Optional, Union
import pydantic
import pydantic_yaml
from typing import List
from .feature_store import FeatureStore
from .itemset import ItemSet, ItemSetDict
from .utils import read_data, tensor_to_tuple
__all__ = ["Dataset", "OnDiskDataset"]
__all__ = ["Dataset"]
class Dataset:
@@ -54,123 +50,3 @@ class Dataset:
def feature(self) -> FeatureStore:
"""Return the feature."""
raise NotImplementedError
class OnDiskDataFormatEnum(pydantic_yaml.YamlStrEnum):
"""Enum of data format."""
TORCH = "torch"
NUMPY = "numpy"
class OnDiskTVTSet(pydantic.BaseModel):
"""Train-Validation-Test set."""
type_name: Optional[str]
format: OnDiskDataFormatEnum
in_memory: Optional[bool] = True
path: str
class OnDiskMetaData(pydantic_yaml.YamlModel):
"""Metadata specification in YAML.
    As multiple node/edge types and multiple splits are supported, each TVT
    set is a list of lists of ``OnDiskTVTSet``.
"""
train_sets: Optional[List[List[OnDiskTVTSet]]]
validation_sets: Optional[List[List[OnDiskTVTSet]]]
test_sets: Optional[List[List[OnDiskTVTSet]]]
class OnDiskDataset(Dataset):
"""An on-disk dataset.
    An on-disk dataset reads graph topology, feature data and TVT
    (train-validation-test) sets from disk. Due to limited resources, data
    that is too large to fit into RAM stays on disk, while the rest is
    loaded into RAM once ``OnDiskDataset`` is initialized. This behavior
    can be controlled by the user via the ``in_memory`` field in the YAML
    file.

    A full example of the YAML file is as follows:
.. code-block:: yaml
train_sets:
- - type_name: paper # could be null for homogeneous graph.
format: numpy
in_memory: true # If not specified, default to true.
path: set/paper-train.npy
validation_sets:
- - type_name: paper
format: numpy
in_memory: true
path: set/paper-validation.npy
test_sets:
- - type_name: paper
format: numpy
in_memory: true
path: set/paper-test.npy
Parameters
----------
path: str
The YAML file path.
"""
def __init__(self, path: str) -> None:
with open(path, "r") as f:
self._meta = OnDiskMetaData.parse_raw(f.read(), proto="yaml")
self._train_sets = self._init_tvt_sets(self._meta.train_sets)
self._validation_sets = self._init_tvt_sets(self._meta.validation_sets)
self._test_sets = self._init_tvt_sets(self._meta.test_sets)
    def train_sets(self) -> Union[List[ItemSet], List[ItemSetDict]]:
        """Return the training sets."""
return self._train_sets
    def validation_sets(self) -> Union[List[ItemSet], List[ItemSetDict]]:
        """Return the validation sets."""
return self._validation_sets
    def test_sets(self) -> Union[List[ItemSet], List[ItemSetDict]]:
        """Return the test sets."""
return self._test_sets
def graph(self) -> object:
"""Return the graph."""
raise NotImplementedError
def feature(self) -> FeatureStore:
"""Return the feature."""
raise NotImplementedError
def _init_tvt_sets(
self, tvt_sets: List[List[OnDiskTVTSet]]
    ) -> Union[List[ItemSet], List[ItemSetDict]]:
"""Initialize the TVT sets."""
if (tvt_sets is None) or (len(tvt_sets) == 0):
return None
ret = []
for tvt_set in tvt_sets:
if (tvt_set is None) or (len(tvt_set) == 0):
                ret.append(None)
                continue
if tvt_set[0].type_name is None:
assert (
len(tvt_set) == 1
), "Only one TVT set is allowed if type_name is not specified."
data = read_data(
tvt_set[0].path, tvt_set[0].format, tvt_set[0].in_memory
)
ret.append(ItemSet(tensor_to_tuple(data)))
else:
data = {}
for tvt in tvt_set:
data[tvt.type_name] = ItemSet(
tensor_to_tuple(
read_data(tvt.path, tvt.format, tvt.in_memory)
)
)
ret.append(ItemSetDict(data))
return ret
"""Implementation of GraphBolt."""
from .ondisk_dataset import *
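The wildcard re-export above, together with the new ``from .impl import *`` line added to the package ``__init__``, keeps ``OnDiskDataset`` reachable from the top-level ``graphbolt`` namespace after the move. A minimal sketch of the invariant this preserves (assuming a DGL build that includes this commit):

# Both import paths resolve to the same class object after the split.
from dgl import graphbolt as gb
from dgl.graphbolt.impl.ondisk_dataset import OnDiskDataset

assert gb.OnDiskDataset is OnDiskDataset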
"""GraphBolt OnDiskDataset."""
from typing import List, Optional, Union
import pydantic
import pydantic_yaml
from ..dataset import Dataset
from ..feature_store import FeatureStore
from ..itemset import ItemSet, ItemSetDict
from ..utils import read_data, tensor_to_tuple
__all__ = ["OnDiskDataset"]
class OnDiskDataFormatEnum(pydantic_yaml.YamlStrEnum):
"""Enum of data format."""
TORCH = "torch"
NUMPY = "numpy"
class OnDiskTVTSet(pydantic.BaseModel):
"""Train-Validation-Test set."""
type_name: Optional[str]
format: OnDiskDataFormatEnum
in_memory: Optional[bool] = True
path: str
class OnDiskMetaData(pydantic_yaml.YamlModel):
"""Metadata specification in YAML.
    As multiple node/edge types and multiple splits are supported, each TVT
    set is a list of lists of ``OnDiskTVTSet``.
"""
train_sets: Optional[List[List[OnDiskTVTSet]]]
validation_sets: Optional[List[List[OnDiskTVTSet]]]
test_sets: Optional[List[List[OnDiskTVTSet]]]
class OnDiskDataset(Dataset):
"""An on-disk dataset.
    An on-disk dataset reads graph topology, feature data and TVT
    (train-validation-test) sets from disk. Due to limited resources, data
    that is too large to fit into RAM stays on disk, while the rest is
    loaded into RAM once ``OnDiskDataset`` is initialized. This behavior
    can be controlled by the user via the ``in_memory`` field in the YAML
    file.

    A full example of the YAML file is as follows:
.. code-block:: yaml
train_sets:
- - type_name: paper # could be null for homogeneous graph.
format: numpy
in_memory: true # If not specified, default to true.
path: set/paper-train.npy
validation_sets:
- - type_name: paper
format: numpy
in_memory: true
path: set/paper-validation.npy
test_sets:
- - type_name: paper
format: numpy
in_memory: true
path: set/paper-test.npy
Parameters
----------
path: str
The YAML file path.
"""
def __init__(self, path: str) -> None:
with open(path, "r") as f:
self._meta = OnDiskMetaData.parse_raw(f.read(), proto="yaml")
self._train_sets = self._init_tvt_sets(self._meta.train_sets)
self._validation_sets = self._init_tvt_sets(self._meta.validation_sets)
self._test_sets = self._init_tvt_sets(self._meta.test_sets)
    def train_sets(self) -> Union[List[ItemSet], List[ItemSetDict]]:
        """Return the training sets."""
return self._train_sets
    def validation_sets(self) -> Union[List[ItemSet], List[ItemSetDict]]:
        """Return the validation sets."""
return self._validation_sets
    def test_sets(self) -> Union[List[ItemSet], List[ItemSetDict]]:
        """Return the test sets."""
return self._test_sets
def graph(self) -> object:
"""Return the graph."""
raise NotImplementedError
def feature(self) -> FeatureStore:
"""Return the feature."""
raise NotImplementedError
def _init_tvt_sets(
self, tvt_sets: List[List[OnDiskTVTSet]]
    ) -> Union[List[ItemSet], List[ItemSetDict]]:
"""Initialize the TVT sets."""
if (tvt_sets is None) or (len(tvt_sets) == 0):
return None
ret = []
for tvt_set in tvt_sets:
if (tvt_set is None) or (len(tvt_set) == 0):
                ret.append(None)
                continue
if tvt_set[0].type_name is None:
assert (
len(tvt_set) == 1
), "Only one TVT set is allowed if type_name is not specified."
data = read_data(
tvt_set[0].path, tvt_set[0].format, tvt_set[0].in_memory
)
ret.append(ItemSet(tensor_to_tuple(data)))
else:
data = {}
for tvt in tvt_set:
data[tvt.type_name] = ItemSet(
tensor_to_tuple(
read_data(tvt.path, tvt.format, tvt.in_memory)
)
)
ret.append(ItemSetDict(data))
return ret
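For reference, a minimal usage sketch of the class above; it is not part of this commit and simply mirrors the unit tests below. It saves one homogeneous ID/label split as a ``.npy`` file, declares it in a YAML file, and loads it back through ``OnDiskDataset``; the file names are illustrative.

import os
import tempfile

import numpy as np

from dgl import graphbolt as gb

with tempfile.TemporaryDirectory() as test_dir:
    # 1000 (id, label) rows stored as a single 2-D numpy array.
    ids = np.arange(1000)
    labels = np.random.randint(0, 10, size=1000)
    train_path = os.path.join(test_dir, "train.npy")
    np.save(train_path, np.vstack([ids, labels]).T)

    # A single train split; ``type_name`` is null for a homogeneous graph.
    yaml_content = f"""
    train_sets:
      - - type_name: null
          format: numpy
          path: {train_path}
    """
    yaml_file = os.path.join(test_dir, "metadata.yaml")
    with open(yaml_file, "w") as f:
        f.write(yaml_content)

    dataset = gb.OnDiskDataset(yaml_file)
    # One ItemSet per declared split; omitted splits come back as None.
    (train_set,) = dataset.train_sets()
    assert len(train_set) == 1000
    assert dataset.validation_sets() is None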
@@ -20,443 +20,3 @@ def test_Dataset():
_ = dataset.graph()
with pytest.raises(NotImplementedError):
_ = dataset.feature()
def test_OnDiskDataset_TVTSet_exceptions():
"""Test excpetions thrown when parsing TVTSet."""
with tempfile.TemporaryDirectory() as test_dir:
yaml_file = os.path.join(test_dir, "test.yaml")
# Case 1: ``format`` is invalid.
yaml_content = """
train_sets:
- - type_name: paper
format: torch_invalid
path: set/paper-train.pt
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
with pytest.raises(pydantic.ValidationError):
_ = gb.OnDiskDataset(yaml_file)
# Case 2: ``type_name`` is not specified while multiple TVT sets are specified.
yaml_content = """
train_sets:
- - type_name: null
format: numpy
path: set/train.npy
- type_name: null
format: numpy
path: set/train.npy
"""
with open(yaml_file, "w") as f:
f.write(yaml_content)
with pytest.raises(
AssertionError,
match=r"Only one TVT set is allowed if type_name is not specified.",
):
_ = gb.OnDiskDataset(yaml_file)
def test_OnDiskDataset_TVTSet_ItemSet_id_label():
"""Test TVTSet which returns ItemSet with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_ids = np.arange(1000)
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_ids, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
validation_ids = np.arange(1000, 2000)
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_ids, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
test_ids = np.arange(2000, 3000)
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_ids, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
        # Case 1:
        # All TVT sets are specified.
        # ``type_name`` is either not specified or specified as ``null``.
        # ``in_memory`` can be either ``true`` or ``false``.
yaml_content = f"""
train_sets:
- - type_name: null
format: numpy
in_memory: true
path: {train_path}
- - type_name: null
format: numpy
path: {train_path}
validation_sets:
- - format: numpy
path: {validation_path}
- - type_name: null
format: numpy
path: {validation_path}
test_sets:
- - type_name: null
format: numpy
in_memory: false
path: {test_path}
- - type_name: null
format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets()
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (id, label) in enumerate(train_set):
assert id == train_ids[i]
assert label == train_labels[i]
train_sets = None
# Verify validation set.
validation_sets = dataset.validation_sets()
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (id, label) in enumerate(validation_set):
assert id == validation_ids[i]
assert label == validation_labels[i]
validation_sets = None
# Verify test set.
test_sets = dataset.test_sets()
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (id, label) in enumerate(test_set):
assert id == test_ids[i]
assert label == test_labels[i]
test_sets = None
dataset = None
# Case 2: Some TVT sets are None.
yaml_content = f"""
train_sets:
- - type_name: null
format: numpy
path: {train_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
assert dataset.train_sets() is not None
assert dataset.validation_sets() is None
assert dataset.test_sets() is None
dataset = None
def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
"""Test TVTSet which returns ItemSet with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
yaml_content = f"""
train_sets:
- - type_name: null
format: numpy
in_memory: true
path: {train_path}
- - type_name: null
format: numpy
path: {train_path}
validation_sets:
- - format: numpy
path: {validation_path}
- - type_name: null
format: numpy
path: {validation_path}
test_sets:
- - type_name: null
format: numpy
in_memory: false
path: {test_path}
- - type_name: null
format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets()
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(train_set):
assert src == train_pairs[0][i]
assert dst == train_pairs[1][i]
assert label == train_labels[i]
train_sets = None
# Verify validation set.
validation_sets = dataset.validation_sets()
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(validation_set):
assert src == validation_pairs[0][i]
assert dst == validation_pairs[1][i]
assert label == validation_labels[i]
validation_sets = None
# Verify test set.
test_sets = dataset.test_sets()
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(test_set):
assert src == test_pairs[0][i]
assert dst == test_pairs[1][i]
assert label == test_labels[i]
test_sets = None
dataset = None
def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
"""Test TVTSet which returns ItemSetDict with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_ids = np.arange(1000)
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_ids, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
validation_ids = np.arange(1000, 2000)
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_ids, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
test_ids = np.arange(2000, 3000)
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_ids, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
yaml_content = f"""
train_sets:
- - type_name: paper
format: numpy
in_memory: true
path: {train_path}
- - type_name: author
format: numpy
path: {train_path}
validation_sets:
- - type_name: paper
format: numpy
path: {validation_path}
- - type_name: author
format: numpy
path: {validation_path}
test_sets:
- - type_name: paper
format: numpy
in_memory: false
path: {test_path}
- - type_name: author
format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets()
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(train_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == train_ids[i]
assert label == train_labels[i]
train_sets = None
# Verify validation set.
validation_sets = dataset.validation_sets()
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
            assert isinstance(validation_set, gb.ItemSetDict)
for i, item in enumerate(validation_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == validation_ids[i]
assert label == validation_labels[i]
validation_sets = None
# Verify test set.
test_sets = dataset.test_sets()
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
            assert isinstance(test_set, gb.ItemSetDict)
for i, item in enumerate(test_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == test_ids[i]
assert label == test_labels[i]
test_sets = None
dataset = None
def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
"""Test TVTSet which returns ItemSetDict with node pairs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
yaml_content = f"""
train_sets:
- - type_name: paper
format: numpy
in_memory: true
path: {train_path}
- - type_name: author
format: numpy
path: {train_path}
validation_sets:
- - type_name: paper
format: numpy
path: {validation_path}
- - type_name: author
format: numpy
path: {validation_path}
test_sets:
- - type_name: paper
format: numpy
in_memory: false
path: {test_path}
- - type_name: author
format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets()
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(train_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == train_pairs[0][i]
assert dst == train_pairs[1][i]
assert label == train_labels[i]
train_sets = None
# Verify validation set.
validation_sets = dataset.validation_sets()
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
            assert isinstance(validation_set, gb.ItemSetDict)
for i, item in enumerate(validation_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == validation_pairs[0][i]
assert dst == validation_pairs[1][i]
assert label == validation_labels[i]
validation_sets = None
# Verify test set.
test_sets = dataset.test_sets()
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
            assert isinstance(test_set, gb.ItemSetDict)
for i, item in enumerate(test_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == test_pairs[0][i]
assert dst == test_pairs[1][i]
assert label == test_labels[i]
test_sets = None
dataset = None
import os
import tempfile
import numpy as np
import pydantic
import pytest
from dgl import graphbolt as gb
def test_OnDiskDataset_TVTSet_exceptions():
"""Test excpetions thrown when parsing TVTSet."""
with tempfile.TemporaryDirectory() as test_dir:
yaml_file = os.path.join(test_dir, "test.yaml")
# Case 1: ``format`` is invalid.
yaml_content = """
train_sets:
- - type_name: paper
format: torch_invalid
path: set/paper-train.pt
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
with pytest.raises(pydantic.ValidationError):
_ = gb.OnDiskDataset(yaml_file)
# Case 2: ``type_name`` is not specified while multiple TVT sets are specified.
yaml_content = """
train_sets:
- - type_name: null
format: numpy
path: set/train.npy
- type_name: null
format: numpy
path: set/train.npy
"""
with open(yaml_file, "w") as f:
f.write(yaml_content)
with pytest.raises(
AssertionError,
match=r"Only one TVT set is allowed if type_name is not specified.",
):
_ = gb.OnDiskDataset(yaml_file)
def test_OnDiskDataset_TVTSet_ItemSet_id_label():
"""Test TVTSet which returns ItemSet with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_ids = np.arange(1000)
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_ids, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
validation_ids = np.arange(1000, 2000)
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_ids, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
test_ids = np.arange(2000, 3000)
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_ids, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
        # Case 1:
        # All TVT sets are specified.
        # ``type_name`` is either not specified or specified as ``null``.
        # ``in_memory`` can be either ``true`` or ``false``.
yaml_content = f"""
train_sets:
- - type_name: null
format: numpy
in_memory: true
path: {train_path}
- - type_name: null
format: numpy
path: {train_path}
validation_sets:
- - format: numpy
path: {validation_path}
- - type_name: null
format: numpy
path: {validation_path}
test_sets:
- - type_name: null
format: numpy
in_memory: false
path: {test_path}
- - type_name: null
format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets()
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (id, label) in enumerate(train_set):
assert id == train_ids[i]
assert label == train_labels[i]
train_sets = None
# Verify validation set.
validation_sets = dataset.validation_sets()
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (id, label) in enumerate(validation_set):
assert id == validation_ids[i]
assert label == validation_labels[i]
validation_sets = None
# Verify test set.
test_sets = dataset.test_sets()
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (id, label) in enumerate(test_set):
assert id == test_ids[i]
assert label == test_labels[i]
test_sets = None
dataset = None
# Case 2: Some TVT sets are None.
yaml_content = f"""
train_sets:
- - type_name: null
format: numpy
path: {train_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
assert dataset.train_sets() is not None
assert dataset.validation_sets() is None
assert dataset.test_sets() is None
dataset = None
def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
"""Test TVTSet which returns ItemSet with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
yaml_content = f"""
train_sets:
- - type_name: null
format: numpy
in_memory: true
path: {train_path}
- - type_name: null
format: numpy
path: {train_path}
validation_sets:
- - format: numpy
path: {validation_path}
- - type_name: null
format: numpy
path: {validation_path}
test_sets:
- - type_name: null
format: numpy
in_memory: false
path: {test_path}
- - type_name: null
format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets()
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(train_set):
assert src == train_pairs[0][i]
assert dst == train_pairs[1][i]
assert label == train_labels[i]
train_sets = None
# Verify validation set.
validation_sets = dataset.validation_sets()
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(validation_set):
assert src == validation_pairs[0][i]
assert dst == validation_pairs[1][i]
assert label == validation_labels[i]
validation_sets = None
# Verify test set.
test_sets = dataset.test_sets()
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(test_set):
assert src == test_pairs[0][i]
assert dst == test_pairs[1][i]
assert label == test_labels[i]
test_sets = None
dataset = None
def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
"""Test TVTSet which returns ItemSetDict with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_ids = np.arange(1000)
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_ids, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
validation_ids = np.arange(1000, 2000)
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_ids, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
test_ids = np.arange(2000, 3000)
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_ids, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
yaml_content = f"""
train_sets:
- - type_name: paper
format: numpy
in_memory: true
path: {train_path}
- - type_name: author
format: numpy
path: {train_path}
validation_sets:
- - type_name: paper
format: numpy
path: {validation_path}
- - type_name: author
format: numpy
path: {validation_path}
test_sets:
- - type_name: paper
format: numpy
in_memory: false
path: {test_path}
- - type_name: author
format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets()
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(train_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == train_ids[i]
assert label == train_labels[i]
train_sets = None
# Verify validation set.
validation_sets = dataset.validation_sets()
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
            assert isinstance(validation_set, gb.ItemSetDict)
for i, item in enumerate(validation_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == validation_ids[i]
assert label == validation_labels[i]
validation_sets = None
# Verify test set.
test_sets = dataset.test_sets()
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
            assert isinstance(test_set, gb.ItemSetDict)
for i, item in enumerate(test_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == test_ids[i]
assert label == test_labels[i]
test_sets = None
dataset = None
def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
"""Test TVTSet which returns ItemSetDict with node pairs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
yaml_content = f"""
train_sets:
- - type_name: paper
format: numpy
in_memory: true
path: {train_path}
- - type_name: author
format: numpy
path: {train_path}
validation_sets:
- - type_name: paper
format: numpy
path: {validation_path}
- - type_name: author
format: numpy
path: {validation_path}
test_sets:
- - type_name: paper
format: numpy
in_memory: false
path: {test_path}
- - type_name: author
format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets()
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(train_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == train_pairs[0][i]
assert dst == train_pairs[1][i]
assert label == train_labels[i]
train_sets = None
# Verify validation set.
validation_sets = dataset.validation_sets()
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
            assert isinstance(validation_set, gb.ItemSetDict)
for i, item in enumerate(validation_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == validation_pairs[0][i]
assert dst == validation_pairs[1][i]
assert label == validation_labels[i]
validation_sets = None
# Verify test set.
test_sets = dataset.test_sets()
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
            assert isinstance(test_set, gb.ItemSetDict)
for i, item in enumerate(test_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == test_pairs[0][i]
assert dst == test_pairs[1][i]
assert label == test_labels[i]
test_sets = None
dataset = None