Unverified Commit a67d9e6f authored by Andrei Ivanov, committed by GitHub

Improved GraphBolt `ondisk_dataset` tests. (#6641)


Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>
parent e02caa67
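
This change factors the write-the-YAML-then-load boilerplate repeated across the `ondisk_dataset` tests into three module-level helpers (`write_yaml_file`, `load_dataset`, `write_yaml_and_load_dataset`) and silences `UserWarning`s raised while a dataset loads. A minimal sketch of the before/after pattern, assuming `test_dir` and `yaml_content` as set up in the tests below:

    # Before: repeated in nearly every test case.
    os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
    yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
    with open(yaml_file, "w") as f:
        f.write(yaml_content)
    dataset = gb.OnDiskDataset(test_dir).load()

    # After: one call, with UserWarnings suppressed during load().
    dataset = write_yaml_and_load_dataset(yaml_content, test_dir)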
@@ -4,6 +4,7 @@ import random
 import re
 import tempfile
 import unittest
+import warnings
 import gb_test_utils as gbt
 import numpy as np
@@ -16,12 +17,27 @@ import yaml
 from dgl import graphbolt as gb
 
 
+def write_yaml_file(yaml_content, dir):
+    os.makedirs(os.path.join(dir, "preprocessed"), exist_ok=True)
+    yaml_file = os.path.join(dir, "preprocessed/metadata.yaml")
+    with open(yaml_file, "w") as f:
+        f.write(yaml_content)
+
+
+def load_dataset(dataset):
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        return dataset.load()
+
+
+def write_yaml_and_load_dataset(yaml_content, dir):
+    write_yaml_file(yaml_content, dir)
+    return load_dataset(gb.OnDiskDataset(dir))
+
+
 def test_OnDiskDataset_TVTSet_exceptions():
     """Test exceptions thrown when parsing TVTSet."""
     with tempfile.TemporaryDirectory() as test_dir:
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
         # Case 1: ``format`` is invalid.
         yaml_content = """
             tasks:
@@ -32,9 +48,7 @@ def test_OnDiskDataset_TVTSet_exceptions():
                       - format: torch_invalid
                         path: set/paper-train.pt
         """
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
+        write_yaml_file(yaml_content, test_dir)
         with pytest.raises(pydantic.ValidationError):
             _ = gb.OnDiskDataset(test_dir).load()
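
For context, the `pydantic.ValidationError` expected here comes from validating the metadata YAML against GraphBolt's schema. A hypothetical mirror of the failing case, assuming the schema constrains `format` to the supported serialization formats:

    from typing import Literal

    import pydantic

    class DataSpec(pydantic.BaseModel):
        # Hypothetical stand-in for the on-disk metadata schema.
        format: Literal["torch", "numpy"]
        path: str

    # Raises pydantic.ValidationError: "torch_invalid" is not permitted.
    DataSpec(format="torch_invalid", path="set/paper-train.pt")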
@@ -53,8 +67,7 @@ def test_OnDiskDataset_TVTSet_exceptions():
                       - format: numpy
                         path: set/train.npy
         """
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
+        write_yaml_file(yaml_content, test_dir)
         with pytest.raises(
             AssertionError,
             match=r"Only one TVT set is allowed if type is not specified.",
@@ -107,12 +120,7 @@ def test_OnDiskDataset_multiple_tasks():
               in_memory: true
               path: {train_labels_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         assert len(dataset.tasks) == 2
         for task_id in range(2):
@@ -162,12 +170,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_names():
               in_memory: true
               path: {train_labels_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify train set.
         train_set = dataset.tasks[0].train_set
@@ -209,12 +212,7 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_names():
               in_memory: true
               path: {train_labels_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify train set.
         train_set = dataset.tasks[0].train_set
@@ -295,12 +293,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
               in_memory: true
               path: {test_labels_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify tasks.
         assert len(dataset.tasks) == 1
@@ -348,11 +341,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
               - format: numpy
                 path: {train_ids_path}
         """
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
         assert dataset.tasks[0].train_set is not None
         assert dataset.tasks[0].validation_set is None
         assert dataset.tasks[0].test_set is None
@@ -421,12 +410,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pairs_labels():
               in_memory: true
               path: {test_labels_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify train set.
         train_set = dataset.tasks[0].train_set
@@ -529,12 +513,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pairs_negs():
               in_memory: true
               path: {test_neg_dst_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify train set.
         train_set = dataset.tasks[0].train_set
@@ -631,12 +610,7 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
                 format: numpy
                 path: {test_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify train set.
         train_set = dataset.tasks[0].train_set
@@ -772,12 +746,7 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pairs_labels():
               in_memory: true
              path: {test_labels_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify train set.
         train_set = dataset.tasks[0].train_set
@@ -882,12 +851,7 @@ def test_OnDiskDataset_Feature_heterograph():
               in_memory: true
               path: {edge_data_label_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify feature data storage.
         feature_data = dataset.feature
@@ -982,12 +946,7 @@ def test_OnDiskDataset_Feature_homograph():
               in_memory: true
               path: {edge_data_label_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         # Verify feature data storage.
         feature_data = dataset.feature
@@ -1034,10 +993,7 @@ def test_OnDiskDataset_Graph_Exceptions():
             type: CSRSamplingGraph
             path: /path/to/graph
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
+        write_yaml_file(yaml_content, test_dir)
 
         with pytest.raises(
             pydantic.ValidationError,
@@ -1060,12 +1016,7 @@ def test_OnDiskDataset_Graph_homogeneous():
             type: FusedCSCSamplingGraph
             path: {graph_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         graph2 = dataset.graph
         assert graph.total_num_nodes == graph2.total_num_nodes
@@ -1103,12 +1054,7 @@ def test_OnDiskDataset_Graph_heterogeneous():
             type: FusedCSCSamplingGraph
             path: {graph_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
 
         graph2 = dataset.graph
         assert graph.total_num_nodes == graph2.total_num_nodes
@@ -1130,23 +1076,14 @@ def test_OnDiskDataset_Metadata():
         yaml_content = f"""
             dataset_name: {dataset_name}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
         assert dataset.dataset_name == dataset_name
 
         # Only dataset_name is specified.
         yaml_content = f"""
             dataset_name: {dataset_name}
         """
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
         assert dataset.dataset_name == dataset_name
@@ -1645,7 +1582,7 @@ def test_OnDiskDataset_load_feature():
         dataset = gb.OnDiskDataset(test_dir).load()
         original_feature_data = dataset.feature
         dataset.yaml_data["feature_data"][0]["in_memory"] = True
-        dataset.load()
+        load_dataset(dataset)
         modify_feature_data = dataset.feature
         # After modifying the `in_memory` field, the feature data should be
         # equal.
@@ -1664,7 +1601,7 @@ def test_OnDiskDataset_load_feature():
             AssertionError,
             match="^Pytorch tensor can only be loaded in memory,",
         ):
-            dataset.load()
+            load_dataset(dataset)
 
         dataset = gb.OnDiskDataset(test_dir)
         dataset.yaml_data["feature_data"][0]["in_memory"] = True
@@ -1672,7 +1609,7 @@ def test_OnDiskDataset_load_feature():
         # If `format` is torch and `in_memory` is True, it will
         # raise an UnpicklingError.
         with pytest.raises(pickle.UnpicklingError):
-            dataset.load()
+            load_dataset(dataset)
 
         # Case 3: Test modifying the `path` field.
         dataset = gb.OnDiskDataset(test_dir)
@@ -1682,18 +1619,18 @@ def test_OnDiskDataset_load_feature():
             FileNotFoundError,
             match=r"\[Errno 2\] No such file or directory:",
         ):
-            dataset.load()
+            load_dataset(dataset)
 
         # Modifying the `path` field to an absolute path should work.
         # In os.path.join, if a segment is an absolute path (which
         # on Windows requires both a drive and a root), then all
         # previous segments are ignored and joining continues from
         # the absolute path segment.
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = load_dataset(gb.OnDiskDataset(test_dir))
         original_feature_data = dataset.feature
         dataset.yaml_data["feature_data"][0]["path"] = os.path.join(
             test_dir, dataset.yaml_data["feature_data"][0]["path"]
         )
-        dataset.load()
+        load_dataset(dataset)
         modify_feature_data = dataset.feature
         assert torch.equal(
             original_feature_data.read("node", None, "feat"),
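
The `os.path.join` behavior described in the comment above is standard-library behavior and can be checked in isolation; a minimal sketch with illustrative paths:

    import os.path

    # A relative second segment is appended to the first...
    os.path.join("/data/test_dir", "features/feat.npy")
    # -> '/data/test_dir/features/feat.npy'

    # ...while an absolute second segment discards all earlier segments
    # (on Windows it needs both a drive and a root to count as absolute).
    os.path.join("/data/test_dir", "/elsewhere/feat.npy")
    # -> '/elsewhere/feat.npy'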
@@ -1907,12 +1844,7 @@ def test_OnDiskDataset_all_nodes_set_homo():
             type: FusedCSCSamplingGraph
             path: {graph_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
         all_nodes_set = dataset.all_nodes_set
         assert isinstance(all_nodes_set, gb.ItemSet)
         assert all_nodes_set.names == ("seed_nodes",)
@@ -1949,12 +1881,7 @@ def test_OnDiskDataset_all_nodes_set_hetero():
             type: FusedCSCSamplingGraph
             path: {graph_path}
         """
-        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
-        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-
-        dataset = gb.OnDiskDataset(test_dir).load()
+        dataset = write_yaml_and_load_dataset(yaml_content, test_dir)
         all_nodes_set = dataset.all_nodes_set
         assert isinstance(all_nodes_set, gb.ItemSetDict)
         assert all_nodes_set.names == ("seed_nodes",)
...
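
A note on the `load_dataset` helper introduced at the top of the diff: `warnings.catch_warnings()` saves the interpreter's warning-filter state on entry and restores it on exit, so the `ignore` filter applies only while `dataset.load()` runs and warnings elsewhere in the test session stay visible. A self-contained sketch; `noisy_load` is a hypothetical stand-in for a loader that emits a `UserWarning`:

    import warnings

    def noisy_load():
        warnings.warn("falling back to in-memory mode", UserWarning)
        return "dataset"

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        noisy_load()  # UserWarning suppressed here.

    noisy_load()  # Global filters restored; warning is visible again.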