Unverified Commit 47d37e91 authored by Rhett Ying, committed by GitHub

[GraphBolt] convert TVT from list of list to list (#6080)

parent 12ade95c
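This commit flattens the train/validation/test (TVT) containers from a list of lists to a flat list, so each split is exposed as a single ItemSet or ItemSetDict rather than a list of them. A minimal before/after sketch of how call sites change (illustrative only, assuming `dataset` is an already-constructed OnDiskDataset):

# Before this commit: each split was a list of item sets.
for train_set in dataset.train_sets:
    for item in train_set:
        pass

# After this commit: each split is a single ItemSet or ItemSetDict.
for item in dataset.train_set:
    pass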
"""GraphBolt Dataset."""
from typing import Dict, List
from typing import Dict
from .feature_store import FeatureStore
from .itemset import ItemSet, ItemSetDict
@@ -32,18 +32,18 @@ class Dataset:
"""
@property
def train_sets(self) -> List[ItemSet] or List[ItemSetDict]:
"""Return the training sets."""
def train_set(self) -> ItemSet or ItemSetDict:
"""Return the training set."""
raise NotImplementedError
@property
def validation_sets(self) -> List[ItemSet] or List[ItemSetDict]:
"""Return the validation sets."""
def validation_set(self) -> ItemSet or ItemSetDict:
"""Return the validation set."""
raise NotImplementedError
@property
def test_sets(self) -> List[ItemSet] or List[ItemSetDict]:
"""Return the test sets."""
def test_set(self) -> ItemSet or ItemSetDict:
"""Return the test set."""
raise NotImplementedError
@property
......
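For implementors, the new contract is one set per split. A hypothetical subclass sketch (the class and attribute names are illustrative, not from this PR; the `ItemSet or ItemSetDict` annotation style mirrors the codebase's own):

class InMemoryDataset(Dataset):
    """A toy Dataset backed by a pre-built training item set."""

    def __init__(self, train_set):
        self._train_set = train_set

    @property
    def train_set(self) -> ItemSet or ItemSetDict:
        """Return the training set (one set, not a list of sets)."""
        return self._train_set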
@@ -165,45 +165,42 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
)
# 7. Save the train/val/test split according to the output_config.
for set_name in ["train_sets", "validation_sets", "test_sets"]:
for set_name in ["train_set", "validation_set", "test_set"]:
if set_name not in input_config:
continue
for intput_set_split, output_set_split in zip(
for input_set_per_type, output_set_per_type in zip(
input_config[set_name], output_config[set_name]
):
for input_set_per_type, output_set_per_type in zip(
intput_set_split, output_set_split
for input_data, output_data in zip(
input_set_per_type["data"], output_set_per_type["data"]
):
for input_data, output_data in zip(
input_set_per_type["data"], output_set_per_type["data"]
):
# Always save the feature in numpy format.
output_data["format"] = "numpy"
output_data["path"] = str(
processed_dir_prefix
/ input_data["path"].replace("pt", "npy")
# Always save the feature in numpy format.
output_data["format"] = "numpy"
output_data["path"] = str(
processed_dir_prefix
/ input_data["path"].replace("pt", "npy")
)
if input_data["format"] == "numpy":
# If the original format is numpy, just copy the file.
os.makedirs(
dataset_path / os.path.dirname(output_data["path"]),
exist_ok=True,
)
shutil.copy(
dataset_path / input_data["path"],
dataset_path / output_data["path"],
)
else:
# If the original format is not numpy, convert it to numpy.
input_set = read_data(
dataset_path / input_data["path"],
input_data["format"],
)
save_data(
input_set,
dataset_path / output_data["path"],
output_set_per_type["format"],
)
if input_data["format"] == "numpy":
# If the original format is numpy, just copy the file.
os.makedirs(
dataset_path / os.path.dirname(output_data["path"]),
exist_ok=True,
)
shutil.copy(
dataset_path / input_data["path"],
dataset_path / output_data["path"],
)
else:
# If the original format is not numpy, convert it to numpy.
input_set = read_data(
dataset_path / input_data["path"],
input_data["format"],
)
save_data(
input_set,
dataset_path / output_data["path"],
output_set_per_type["format"],
)
# 8. Save the output_config.
output_config_path = dataset_path / "output_config.yaml"
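One caveat in the hunk above: `input_data["path"].replace("pt", "npy")` substitutes every occurrence of the substring "pt" in the path, not only the extension (a directory named, say, "pt_sets" would be mangled). A hedged sketch of an extension-safe variant, not part of this PR:

import os

def to_npy_path(path: str) -> str:
    # Replace only the file extension, leaving any "pt" elsewhere intact.
    root, _ext = os.path.splitext(path)
    return root + ".npy"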
@@ -245,27 +242,27 @@ class OnDiskDataset(Dataset):
format: numpy
in_memory: false
path: edge_data/author-writes-paper-feat.npy
train_sets:
- - type: paper # could be null for homogeneous graph.
data: # multiple data sources could be specified.
- format: numpy
in_memory: true # If not specified, default to true.
path: set/paper-train-src.npy
- format: numpy
in_memory: false
path: set/paper-train-dst.npy
validation_sets:
- - type: paper
data:
- format: numpy
in_memory: true
path: set/paper-validation.npy
test_sets:
- - type: paper
data:
- format: numpy
in_memory: true
path: set/paper-test.npy
train_set:
- type: paper # could be null for homogeneous graph.
data: # multiple data sources could be specified.
- format: numpy
in_memory: true # If not specified, default to true.
path: set/paper-train-src.npy
- format: numpy
in_memory: false
path: set/paper-train-dst.npy
validation_set:
- type: paper
data:
- format: numpy
in_memory: true
path: set/paper-validation.npy
test_set:
- type: paper
data:
- format: numpy
in_memory: true
path: set/paper-test.npy
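The example above shows the flattened schema: each split key now maps directly to a list of per-type entries. A quick sanity check of that shape, assuming PyYAML and a config saved as metadata.yaml (a hypothetical file name):

import yaml

with open("metadata.yaml") as f:  # hypothetical file name
    config = yaml.safe_load(f)
# Each split is now a flat list of per-type entries, not a list of lists.
assert isinstance(config["train_set"], list)
assert isinstance(config["train_set"][0], dict)
assert "data" in config["train_set"][0]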
Parameters
----------
@@ -285,24 +282,24 @@ class OnDiskDataset(Dataset):
self._num_labels = self._meta.num_labels
self._graph = self._load_graph(self._meta.graph_topology)
self._feature = load_feature_stores(self._meta.feature_data)
self._train_sets = self._init_tvt_sets(self._meta.train_sets)
self._validation_sets = self._init_tvt_sets(self._meta.validation_sets)
self._test_sets = self._init_tvt_sets(self._meta.test_sets)
self._train_set = self._init_tvt_set(self._meta.train_set)
self._validation_set = self._init_tvt_set(self._meta.validation_set)
self._test_set = self._init_tvt_set(self._meta.test_set)
@property
def train_sets(self) -> List[ItemSet] or List[ItemSetDict]:
def train_set(self) -> ItemSet or ItemSetDict:
"""Return the training set."""
return self._train_sets
return self._train_set
@property
def validation_sets(self) -> List[ItemSet] or List[ItemSetDict]:
def validation_set(self) -> ItemSet or ItemSetDict:
"""Return the validation set."""
return self._validation_sets
return self._validation_set
@property
def test_sets(self) -> List[ItemSet] or List[ItemSetDict]:
def test_set(self) -> ItemSet or ItemSetDict:
"""Return the test set."""
return self._test_sets
return self._test_set
@property
def graph(self) -> object:
@@ -341,36 +338,31 @@ class OnDiskDataset(Dataset):
f"Graph topology type {graph_topology.type} is not supported."
)
def _init_tvt_sets(
self, tvt_sets: List[List[OnDiskTVTSet]]
) -> List[ItemSet] or List[ItemSetDict]:
"""Initialize the TVT sets."""
if (tvt_sets is None) or (len(tvt_sets) == 0):
return None
ret = []
for tvt_set in tvt_sets:
if (tvt_set is None) or (len(tvt_set) == 0):
ret.append(None)
if tvt_set[0].type is None:
assert (
len(tvt_set) == 1
), "Only one TVT set is allowed if type is not specified."
ret.append(
ItemSet(
tuple(
read_data(data.path, data.format, data.in_memory)
for data in tvt_set[0].data
)
)
def _init_tvt_set(
self, tvt_set: List[OnDiskTVTSet]
) -> ItemSet or ItemSetDict:
"""Initialize the TVT set."""
ret = None
if (tvt_set is None) or (len(tvt_set) == 0):
return ret
if tvt_set[0].type is None:
assert (
len(tvt_set) == 1
), "Only one TVT set is allowed if type is not specified."
ret = ItemSet(
tuple(
read_data(data.path, data.format, data.in_memory)
for data in tvt_set[0].data
)
else:
data = {}
for tvt in tvt_set:
data[tvt.type] = ItemSet(
tuple(
read_data(data.path, data.format, data.in_memory)
for data in tvt.data
)
)
else:
data = {}
for tvt in tvt_set:
data[tvt.type] = ItemSet(
tuple(
read_data(data.path, data.format, data.in_memory)
for data in tvt.data
)
ret.append(ItemSetDict(data))
)
ret = ItemSetDict(data)
return ret
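After this change, `_init_tvt_set` yields one of three shapes: None when the split is absent, an ItemSet for the homogeneous case (`type: null`), or an ItemSetDict keyed by node/edge type. An illustrative sketch of handling all three:

from dgl import graphbolt as gb  # as in the tests below

def describe_split(split):
    """Illustrative only: classify the shape a split can now take."""
    if split is None:
        return "absent"           # split missing from the YAML config
    if isinstance(split, gb.ItemSet):
        return "homogeneous"      # YAML entry used `type: null`
    return "heterogeneous"        # gb.ItemSetDict keyed by type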
@@ -83,6 +83,6 @@ class OnDiskMetaData(pydantic.BaseModel):
num_labels: Optional[int] = None
graph_topology: Optional[OnDiskGraphTopology] = None
feature_data: Optional[List[OnDiskFeatureData]] = []
train_sets: Optional[List[List[OnDiskTVTSet]]] = []
validation_sets: Optional[List[List[OnDiskTVTSet]]] = []
test_sets: Optional[List[List[OnDiskTVTSet]]] = []
train_set: Optional[List[OnDiskTVTSet]] = []
validation_set: Optional[List[OnDiskTVTSet]] = []
test_set: Optional[List[OnDiskTVTSet]] = []
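Since these are pydantic models, the flattened field shapes can be exercised directly. A self-contained sketch using hypothetical stand-in models that mirror the shapes above (pydantic v1-style parse_obj is an assumption; the real OnDiskMetaData may have additional required fields not shown in this hunk):

import pydantic
from typing import List, Optional

class _TVTData(pydantic.BaseModel):   # stand-in for the per-file entry
    format: str
    path: str
    in_memory: bool = True

class _TVTSet(pydantic.BaseModel):    # stand-in for OnDiskTVTSet
    type: Optional[str] = None
    data: List[_TVTData] = []

class _Meta(pydantic.BaseModel):
    train_set: Optional[List[_TVTSet]] = []  # flat list, no longer nested

meta = _Meta.parse_obj(
    {"train_set": [{"type": None,
                    "data": [{"format": "numpy", "path": "set/train.npy"}]}]}
)
assert meta.train_set[0].type is None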
import os
import tempfile
import numpy as np
import pydantic
import pytest
from dgl import graphbolt as gb
@@ -11,15 +5,15 @@ from dgl import graphbolt as gb
def test_Dataset():
dataset = gb.Dataset()
with pytest.raises(NotImplementedError):
_ = dataset.train_sets()
_ = dataset.train_set
with pytest.raises(NotImplementedError):
_ = dataset.validation_sets()
_ = dataset.validation_set
with pytest.raises(NotImplementedError):
_ = dataset.test_sets()
_ = dataset.test_set
with pytest.raises(NotImplementedError):
_ = dataset.graph()
_ = dataset.graph
with pytest.raises(NotImplementedError):
_ = dataset.feature()
_ = dataset.feature
with pytest.raises(NotImplementedError):
_ = dataset.dataset_name
with pytest.raises(NotImplementedError):
......
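The test updates above reflect that these are now accessed as properties, not called as methods: the NotImplementedError fires on attribute access. A minimal standalone illustration:

import pytest

class _Demo:
    @property
    def train_set(self):
        raise NotImplementedError

with pytest.raises(NotImplementedError):
    _ = _Demo().train_set  # raised on access; no call parentheses needed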
@@ -20,11 +20,11 @@ def test_OnDiskDataset_TVTSet_exceptions():
# Case 1: ``format`` is invalid.
yaml_content = """
train_sets:
- - type: paper
data:
- format: torch_invalid
path: set/paper-train.pt
train_set:
- type: paper
data:
- format: torch_invalid
path: set/paper-train.pt
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
@@ -34,15 +34,15 @@ def test_OnDiskDataset_TVTSet_exceptions():
# Case 2: ``type`` is not specified while multiple TVT sets are specified.
yaml_content = """
train_sets:
- - type: null
data:
- format: numpy
path: set/train.npy
- type: null
data:
- format: numpy
path: set/train.npy
train_set:
- type: null
data:
- format: numpy
path: set/train.npy
- type: null
data:
- format: numpy
path: set/train.npy
"""
with open(yaml_file, "w") as f:
f.write(yaml_content)
@@ -82,32 +82,32 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
# ``type`` is not specified or specified as ``null``.
# ``in_memory`` could be ``true`` and ``false``.
yaml_content = f"""
train_sets:
- - type: null
data:
- format: numpy
in_memory: true
path: {train_ids_path}
- format: numpy
in_memory: true
path: {train_labels_path}
validation_sets:
- - data:
- format: numpy
in_memory: true
path: {validation_ids_path}
- format: numpy
in_memory: true
path: {validation_labels_path}
test_sets:
- - type: null
data:
- format: numpy
in_memory: true
path: {test_ids_path}
- format: numpy
in_memory: true
path: {test_labels_path}
train_set:
- type: null
data:
- format: numpy
in_memory: true
path: {train_ids_path}
- format: numpy
in_memory: true
path: {train_labels_path}
validation_set:
- data:
- format: numpy
in_memory: true
path: {validation_ids_path}
- format: numpy
in_memory: true
path: {validation_labels_path}
test_set:
- type: null
data:
- format: numpy
in_memory: true
path: {test_ids_path}
- format: numpy
in_memory: true
path: {test_labels_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
@@ -116,55 +116,49 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets
assert len(train_sets) == 1
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (id, label) in enumerate(train_set):
assert id == train_ids[i]
assert label == train_labels[i]
train_sets = None
train_set = dataset.train_set
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (id, label) in enumerate(train_set):
assert id == train_ids[i]
assert label == train_labels[i]
train_set = None
# Verify validation set.
validation_sets = dataset.validation_sets
assert len(validation_sets) == 1
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (id, label) in enumerate(validation_set):
assert id == validation_ids[i]
assert label == validation_labels[i]
validation_sets = None
validation_set = dataset.validation_set
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (id, label) in enumerate(validation_set):
assert id == validation_ids[i]
assert label == validation_labels[i]
validation_set = None
# Verify test set.
test_sets = dataset.test_sets
assert len(test_sets) == 1
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (id, label) in enumerate(test_set):
assert id == test_ids[i]
assert label == test_labels[i]
test_sets = None
test_set = dataset.test_set
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (id, label) in enumerate(test_set):
assert id == test_ids[i]
assert label == test_labels[i]
test_set = None
dataset = None
# Case 2: Some TVT sets are None.
yaml_content = f"""
train_sets:
- - type: null
data:
- format: numpy
path: {train_ids_path}
train_set:
- type: null
data:
- format: numpy
path: {train_ids_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
dataset = gb.OnDiskDataset(yaml_file)
assert dataset.train_sets is not None
assert dataset.validation_sets is None
assert dataset.test_sets is None
assert dataset.train_set is not None
assert dataset.validation_set is None
assert dataset.test_set is None
dataset = None
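Because omitted splits now come back as None rather than an empty list, consumers should guard before iterating. A small illustrative sketch, assuming `dataset` as above:

for split in (dataset.train_set, dataset.validation_set, dataset.test_set):
    if split is None:
        continue  # split was not specified in the YAML config
    for item in split:
        pass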
@@ -202,41 +196,41 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
np.save(test_labels_path, test_labels)
yaml_content = f"""
train_sets:
- - type: null
data:
- format: numpy
in_memory: true
path: {train_src_path}
- format: numpy
in_memory: true
path: {train_dst_path}
- format: numpy
in_memory: true
path: {train_labels_path}
validation_sets:
- - data:
- format: numpy
in_memory: true
path: {validation_src_path}
- format: numpy
in_memory: true
path: {validation_dst_path}
- format: numpy
in_memory: true
path: {validation_labels_path}
test_sets:
- - type: null
data:
- format: numpy
in_memory: true
path: {test_src_path}
- format: numpy
in_memory: true
path: {test_dst_path}
- format: numpy
in_memory: true
path: {test_labels_path}
train_set:
- type: null
data:
- format: numpy
in_memory: true
path: {train_src_path}
- format: numpy
in_memory: true
path: {train_dst_path}
- format: numpy
in_memory: true
path: {train_labels_path}
validation_set:
- data:
- format: numpy
in_memory: true
path: {validation_src_path}
- format: numpy
in_memory: true
path: {validation_dst_path}
- format: numpy
in_memory: true
path: {validation_labels_path}
test_set:
- type: null
data:
- format: numpy
in_memory: true
path: {test_src_path}
- format: numpy
in_memory: true
path: {test_dst_path}
- format: numpy
in_memory: true
path: {test_labels_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
@@ -245,40 +239,34 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets
assert len(train_sets) == 1
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(train_set):
assert src == train_src[i]
assert dst == train_dst[i]
assert label == train_labels[i]
train_sets = None
train_set = dataset.train_set
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(train_set):
assert src == train_src[i]
assert dst == train_dst[i]
assert label == train_labels[i]
train_set = None
# Verify validation set.
validation_sets = dataset.validation_sets
assert len(validation_sets) == 1
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(validation_set):
assert src == validation_src[i]
assert dst == validation_dst[i]
assert label == validation_labels[i]
validation_sets = None
validation_set = dataset.validation_set
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(validation_set):
assert src == validation_src[i]
assert dst == validation_dst[i]
assert label == validation_labels[i]
validation_set = None
# Verify test set.
test_sets = dataset.test_sets
assert len(test_sets) == 1
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(test_set):
assert src == test_src[i]
assert dst == test_dst[i]
assert label == test_labels[i]
test_sets = None
test_set = dataset.test_set
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(test_set):
assert src == test_src[i]
assert dst == test_dst[i]
assert label == test_labels[i]
test_set = None
dataset = None
@@ -320,41 +308,41 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
np.save(test_neg_dst_path, test_neg_dst)
yaml_content = f"""
train_sets:
- - type: null
data:
- format: numpy
in_memory: true
path: {train_src_path}
- format: numpy
in_memory: true
path: {train_dst_path}
- format: numpy
in_memory: true
path: {train_neg_dst_path}
validation_sets:
- - data:
- format: numpy
in_memory: true
path: {validation_src_path}
- format: numpy
in_memory: true
path: {validation_dst_path}
- format: numpy
in_memory: true
path: {validation_neg_dst_path}
test_sets:
- - type: null
data:
- format: numpy
in_memory: true
path: {test_src_path}
- format: numpy
in_memory: true
path: {test_dst_path}
- format: numpy
in_memory: true
path: {test_neg_dst_path}
train_set:
- type: null
data:
- format: numpy
in_memory: true
path: {train_src_path}
- format: numpy
in_memory: true
path: {train_dst_path}
- format: numpy
in_memory: true
path: {train_neg_dst_path}
validation_set:
- data:
- format: numpy
in_memory: true
path: {validation_src_path}
- format: numpy
in_memory: true
path: {validation_dst_path}
- format: numpy
in_memory: true
path: {validation_neg_dst_path}
test_set:
- type: null
data:
- format: numpy
in_memory: true
path: {test_src_path}
- format: numpy
in_memory: true
path: {test_dst_path}
- format: numpy
in_memory: true
path: {test_neg_dst_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
@@ -363,42 +351,34 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets
assert len(train_sets) == 1
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(train_set):
assert src == train_src[i]
assert dst == train_dst[i]
assert torch.equal(negs, torch.from_numpy(train_neg_dst[i]))
train_sets = None
train_set = dataset.train_set
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(train_set):
assert src == train_src[i]
assert dst == train_dst[i]
assert torch.equal(negs, torch.from_numpy(train_neg_dst[i]))
train_set = None
# Verify validation set.
validation_sets = dataset.validation_sets
assert len(validation_sets) == 1
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(validation_set):
assert src == validation_src[i]
assert dst == validation_dst[i]
assert torch.equal(
negs, torch.from_numpy(validation_neg_dst[i])
)
validation_sets = None
validation_set = dataset.validation_set
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(validation_set):
assert src == validation_src[i]
assert dst == validation_dst[i]
assert torch.equal(negs, torch.from_numpy(validation_neg_dst[i]))
validation_set = None
# Verify test set.
test_sets = dataset.test_sets
assert len(test_sets) == 1
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(test_set):
assert src == test_src[i]
assert dst == test_dst[i]
assert torch.equal(negs, torch.from_numpy(test_neg_dst[i]))
test_sets = None
test_set = dataset.test_set
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(test_set):
assert src == test_src[i]
assert dst == test_dst[i]
assert torch.equal(negs, torch.from_numpy(test_neg_dst[i]))
test_set = None
dataset = None
@@ -424,35 +404,35 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
np.save(test_path, test_data)
yaml_content = f"""
train_sets:
- - type: paper
data:
- format: numpy
in_memory: true
path: {train_path}
- - type: author
data:
- format: numpy
path: {train_path}
validation_sets:
- - type: paper
data:
- format: numpy
path: {validation_path}
- - type: author
data:
- format: numpy
path: {validation_path}
test_sets:
- - type: paper
data:
- format: numpy
in_memory: false
path: {test_path}
- - type: author
data:
- format: numpy
path: {test_path}
train_set:
- type: paper
data:
- format: numpy
in_memory: true
path: {train_path}
- type: author
data:
- format: numpy
path: {train_path}
validation_set:
- type: paper
data:
- format: numpy
path: {validation_path}
- type: author
data:
- format: numpy
path: {validation_path}
test_set:
- type: paper
data:
- format: numpy
in_memory: false
path: {test_path}
- type: author
data:
- format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
@@ -461,52 +441,46 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(train_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == train_ids[i]
assert label == train_labels[i]
train_sets = None
train_set = dataset.train_set
assert len(train_set) == 2000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(train_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == train_ids[i % 1000]
assert label == train_labels[i % 1000]
train_set = None
# Verify validation set.
validation_sets = dataset.validation_sets
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(validation_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == validation_ids[i]
assert label == validation_labels[i]
validation_sets = None
validation_set = dataset.validation_set
assert len(validation_set) == 2000
assert isinstance(validation_set, gb.ItemSetDict)
for i, item in enumerate(validation_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == validation_ids[i % 1000]
assert label == validation_labels[i % 1000]
validation_set = None
# Verify test set.
test_sets = dataset.test_sets
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(test_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == test_ids[i]
assert label == test_labels[i]
test_sets = None
test_set = dataset.test_set
assert len(test_set) == 2000
assert isinstance(test_set, gb.ItemSetDict)
for i, item in enumerate(test_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
id, label = item[key]
assert id == test_ids[i % 1000]
assert label == test_labels[i % 1000]
test_set = None
dataset = None
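With the per-type entries merged into one ItemSetDict, a split's length is the sum across types (2000 here) and iteration yields each type's items in turn, which is why the updated assertions index with `i % 1000`. A rough analogue using itertools.chain, not GraphBolt's actual implementation:

from itertools import chain

paper = [{"paper": i} for i in range(1000)]
author = [{"author": i} for i in range(1000)]
merged = list(chain(paper, author))  # behaves like one 2000-item set
assert len(merged) == 2000
assert merged[1500] == {"author": 500}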
@@ -532,35 +506,35 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
np.save(test_path, test_data)
yaml_content = f"""
train_sets:
- - type: paper
data:
- format: numpy
in_memory: true
path: {train_path}
- - type: author
data:
- format: numpy
path: {train_path}
validation_sets:
- - type: paper
data:
- format: numpy
path: {validation_path}
- - type: author
data:
- format: numpy
path: {validation_path}
test_sets:
- - type: paper
data:
- format: numpy
in_memory: false
path: {test_path}
- - type: author
data:
- format: numpy
path: {test_path}
train_set:
- type: paper
data:
- format: numpy
in_memory: true
path: {train_path}
- type: author
data:
- format: numpy
path: {train_path}
validation_set:
- type: paper
data:
- format: numpy
path: {validation_path}
- type: author
data:
- format: numpy
path: {validation_path}
test_set:
- type: paper
data:
- format: numpy
in_memory: false
path: {test_path}
- type: author
data:
- format: numpy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
@@ -569,55 +543,49 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
dataset = gb.OnDiskDataset(yaml_file)
# Verify train set.
train_sets = dataset.train_sets
assert len(train_sets) == 2
for train_set in train_sets:
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(train_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == train_pairs[0][i]
assert dst == train_pairs[1][i]
assert label == train_labels[i]
train_sets = None
train_set = dataset.train_set
assert len(train_set) == 2000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(train_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == train_pairs[0][i % 1000]
assert dst == train_pairs[1][i % 1000]
assert label == train_labels[i % 1000]
train_set = None
# Verify validation set.
validation_sets = dataset.validation_sets
assert len(validation_sets) == 2
for validation_set in validation_sets:
assert len(validation_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(validation_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == validation_pairs[0][i]
assert dst == validation_pairs[1][i]
assert label == validation_labels[i]
validation_sets = None
validation_set = dataset.validation_set
assert len(validation_set) == 2000
assert isinstance(validation_set, gb.ItemSetDict)
for i, item in enumerate(validation_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == validation_pairs[0][i % 1000]
assert dst == validation_pairs[1][i % 1000]
assert label == validation_labels[i % 1000]
validation_set = None
# Verify test set.
test_sets = dataset.test_sets
assert len(test_sets) == 2
for test_set in test_sets:
assert len(test_set) == 1000
assert isinstance(train_set, gb.ItemSetDict)
for i, item in enumerate(test_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == test_pairs[0][i]
assert dst == test_pairs[1][i]
assert label == test_labels[i]
test_sets = None
test_set = dataset.test_set
assert len(test_set) == 2000
assert isinstance(test_set, gb.ItemSetDict)
for i, item in enumerate(test_set):
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == test_pairs[0][i % 1000]
assert dst == test_pairs[1][i % 1000]
assert label == test_labels[i % 1000]
test_set = None
dataset = None
@@ -995,21 +963,21 @@ def test_OnDiskDataset_preprocess_homogeneous():
format: numpy
in_memory: false
path: data/node-feat.npy
train_sets:
- - type_name: null
data:
- format: numpy
path: set/train.npy
validation_sets:
- - type_name: null
data:
- format: numpy
path: set/validation.npy
test_sets:
- - type_name: null
data:
- format: numpy
path: set/test.npy
train_set:
- type_name: null
data:
- format: numpy
path: set/train.npy
validation_set:
- type_name: null
data:
- format: numpy
path: set/validation.npy
test_set:
- type_name: null
data:
- format: numpy
path: set/test.npy
"""
yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f:
......
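A sketch of driving the preprocessing step on such a config; the import path here is an assumption, since only the function signature appears in this diff, and `yaml_file` is the path written in the test above:

# Hypothetical import path; only the function itself is shown in this diff.
from dgl.graphbolt.ondisk_dataset import preprocess_ondisk_dataset

output_config_path = preprocess_ondisk_dataset(yaml_file)
# Per the hunks above, TVT data is re-saved in numpy format and the
# rewritten config is stored as output_config.yaml under the dataset path.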