OpenDAS / dgl, commit 14f396d0 (unverified)

[GraphBolt] change TVT format of OnDiskDataset (#6076)

Authored Aug 01, 2023 by Rhett Ying, committed via GitHub on Aug 01, 2023. Parent: 17f6c4c9

Showing 4 changed files with 347 additions and 163 deletions:

  python/dgl/graphbolt/impl/ondisk_dataset.py             +55   -42
  python/dgl/graphbolt/impl/ondisk_metadata.py            +10   -3
  python/dgl/graphbolt/utils/internal.py                  +0    -6
  tests/python/pytorch/graphbolt/test_ondisk_dataset.py   +282  -112
python/dgl/graphbolt/impl/ondisk_dataset.py

@@ -15,7 +15,7 @@ import dgl
 from ..dataset import Dataset
 from ..itemset import ItemSet, ItemSetDict
-from ..utils import read_data, save_data, tensor_to_tuple
+from ..utils import read_data, save_data
 from .csc_sampling_graph import (
     CSCSamplingGraph,
@@ -173,33 +173,35 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
         ):
             for input_set_per_type, output_set_per_type in zip(
                 intput_set_split, output_set_split
             ):
-                # Always save the feature in numpy format.
-                output_set_per_type["format"] = "numpy"
-                output_set_per_type["path"] = str(
-                    processed_dir_prefix
-                    / input_set_per_type["path"].replace("pt", "npy")
-                )
-                if input_set_per_type["format"] == "numpy":
-                    # If the original format is numpy, just copy the file.
-                    os.makedirs(
-                        dataset_path
-                        / os.path.dirname(output_set_per_type["path"]),
-                        exist_ok=True,
-                    )
-                    shutil.copy(
-                        dataset_path / input_set_per_type["path"],
-                        dataset_path / output_set_per_type["path"],
-                    )
-                else:
-                    # If the original format is not numpy, convert it to numpy.
-                    input_set = read_data(
-                        dataset_path / input_set_per_type["path"],
-                        input_set_per_type["format"],
-                    )
-                    save_data(
-                        input_set,
-                        dataset_path / output_set_per_type["path"],
-                        output_set_per_type["format"],
-                    )
+                for input_data, output_data in zip(
+                    input_set_per_type["data"], output_set_per_type["data"]
+                ):
+                    # Always save the feature in numpy format.
+                    output_data["format"] = "numpy"
+                    output_data["path"] = str(
+                        processed_dir_prefix
+                        / input_data["path"].replace("pt", "npy")
+                    )
+                    if input_data["format"] == "numpy":
+                        # If the original format is numpy, just copy the file.
+                        os.makedirs(
+                            dataset_path
+                            / os.path.dirname(output_data["path"]),
+                            exist_ok=True,
+                        )
+                        shutil.copy(
+                            dataset_path / input_data["path"],
+                            dataset_path / output_data["path"],
+                        )
+                    else:
+                        # If the original format is not numpy, convert it to numpy.
+                        input_set = read_data(
+                            dataset_path / input_data["path"],
+                            input_data["format"],
+                        )
+                        save_data(
+                            input_set,
+                            dataset_path / output_data["path"],
+                            output_set_per_type["format"],
+                        )
@@ -245,17 +247,23 @@ class OnDiskDataset(Dataset):
             path: edge_data/author-writes-paper-feat.npy
     train_sets:
       - - type: paper # could be null for homogeneous graph.
-          format: numpy
-          in_memory: true # If not specified, default to true.
-          path: set/paper-train.npy
+          data: # multiple data sources could be specified.
+            - format: numpy
+              in_memory: true # If not specified, default to true.
+              path: set/paper-train-src.npy
+            - format: numpy
+              in_memory: false
+              path: set/paper-train-dst.npy
     validation_sets:
       - - type: paper
-          format: numpy
-          in_memory: true
-          path: set/paper-validation.npy
+          data:
+            - format: numpy
+              in_memory: true
+              path: set/paper-validation.npy
     test_sets:
       - - type: paper
-          format: numpy
-          in_memory: true
-          path: set/paper-test.npy
+          data:
+            - format: numpy
+              in_memory: true
+              path: set/paper-test.npy
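
For context, a short usage sketch of the layout documented above (not part of this commit; the file names, array sizes, and the gb alias are illustrative and mirror the tests further down):

    import numpy as np
    import dgl.graphbolt as gb

    # One .npy file per field, each referenced from the "data" list of a single
    # train_sets entry (as in the docstring above).
    np.save("set/paper-train-src.npy", np.arange(1000))
    np.save("set/paper-train-dst.npy", np.arange(1000, 2000))

    dataset = gb.OnDiskDataset("metadata.yaml")  # YAML as sketched in the docstring
    train_set = dataset.train_sets[0]            # one ItemSet per train_sets entry
    for src, dst in train_set:                   # each item zips the listed files
        break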
@@ -347,16 +355,21 @@ class OnDiskDataset(Dataset):
                 assert (
                     len(tvt_set) == 1
                 ), "Only one TVT set is allowed if type is not specified."
-                data = read_data(
-                    tvt_set[0].path, tvt_set[0].format, tvt_set[0].in_memory
-                )
-                ret.append(ItemSet(tensor_to_tuple(data)))
+                ret.append(
+                    ItemSet(
+                        tuple(
+                            read_data(data.path, data.format, data.in_memory)
+                            for data in tvt_set[0].data
+                        )
+                    )
+                )
             else:
                 data = {}
                 for tvt in tvt_set:
                     data[tvt.type] = ItemSet(
-                        tensor_to_tuple(
-                            read_data(tvt.path, tvt.format, tvt.in_memory)
-                        )
+                        tuple(
+                            read_data(data.path, data.format, data.in_memory)
+                            for data in tvt.data
+                        )
                     )
                 ret.append(ItemSetDict(data))
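
The gist of the new loading path, as a standalone sketch (not library code; the ItemSet import path and its tuple-of-tensors behavior are assumed from the usage in this file): each file listed under data: is read into its own tensor, and the tensors are zipped into one ItemSet, replacing the old column-wise split of a single packed tensor.

    import torch
    from dgl.graphbolt import ItemSet  # import path assumed for this sketch

    src = torch.arange(5)        # e.g. read_data(...) on paper-train-src.npy
    dst = torch.arange(5, 10)    # e.g. read_data(...) on paper-train-dst.npy
    item_set = ItemSet((src, dst))
    for s, d in item_set:        # each item is one (src, dst) pair
        print(int(s), int(d))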
python/dgl/graphbolt/impl/ondisk_metadata.py

@@ -8,6 +8,7 @@ import pydantic
 __all__ = [
     "OnDiskFeatureDataFormat",
+    "OnDiskTVTSetData",
     "OnDiskTVTSet",
     "OnDiskFeatureDataDomain",
     "OnDiskFeatureData",
@@ -24,15 +25,21 @@ class OnDiskFeatureDataFormat(str, Enum):
     NUMPY = "numpy"


-class OnDiskTVTSet(pydantic.BaseModel):
-    """Train-Validation-Test set."""
+class OnDiskTVTSetData(pydantic.BaseModel):
+    """Train-Validation-Test set data."""

-    type: Optional[str] = None
     format: OnDiskFeatureDataFormat
     in_memory: Optional[bool] = True
     path: str


+class OnDiskTVTSet(pydantic.BaseModel):
+    """Train-Validation-Test set."""
+
+    type: Optional[str] = None
+    data: List[OnDiskTVTSetData]
+
+
 class OnDiskFeatureDataDomain(str, Enum):
     """Enum of feature data domain."""
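
For illustration only (not part of the diff; the import path follows the file location above, and the YAML snippet is made up): how the two models consume one entry of the new format, with in_memory falling back to its True default when omitted.

    import yaml
    from dgl.graphbolt.impl.ondisk_metadata import OnDiskTVTSet

    entry = yaml.safe_load(
        """
        type: paper
        data:
          - format: numpy
            in_memory: false
            path: set/paper-train-src.npy
          - format: numpy
            path: set/paper-train-dst.npy
        """
    )
    tvt = OnDiskTVTSet(**entry)            # validates type plus the nested data list
    assert tvt.data[0].in_memory is False
    assert tvt.data[1].in_memory is True   # default applies when the key is omitted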
python/dgl/graphbolt/utils/internal.py

@@ -45,9 +45,3 @@ def save_data(data, path, fmt):
         np.save(path, data)
     elif fmt == "torch":
         torch.save(data, path)
-
-
-def tensor_to_tuple(data):
-    """Split a torch.Tensor in column-wise to a tuple."""
-    assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
-    return tuple(data.t())
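
To see why the helper could be dropped (illustrative arrays, not library code): the old layout packed IDs and labels into one (N, 2) array that tensor_to_tuple() split column-wise, whereas the new data: list stores each field in its own file, so nothing needs splitting.

    import numpy as np
    import torch

    # Old layout: one packed (N, 2) array per split, split column-wise on load.
    packed = np.vstack([np.arange(5), np.arange(5) * 10]).T
    ids_old, labels_old = tuple(torch.from_numpy(packed).t())  # what tensor_to_tuple() did

    # New layout: one file (and one tensor) per field, used as-is.
    ids_new = torch.arange(5)
    labels_new = torch.arange(5) * 10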
tests/python/pytorch/graphbolt/test_ondisk_dataset.py

@@ -22,7 +22,8 @@ def test_OnDiskDataset_TVTSet_exceptions():
         yaml_content = """
         train_sets:
           - - type: paper
-              format: torch_invalid
-              path: set/paper-train.pt
+              data:
+                - format: torch_invalid
+                  path: set/paper-train.pt
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
@@ -35,10 +36,12 @@ def test_OnDiskDataset_TVTSet_exceptions():
         yaml_content = """
         train_sets:
           - - type: null
-              format: numpy
-              path: set/train.npy
+              data:
+                - format: numpy
+                  path: set/train.npy
             - type: null
-              format: numpy
-              path: set/train.npy
+              data:
+                - format: numpy
+                  path: set/train.npy
         """
         with open(yaml_file, "w") as f:
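
A minimal sketch of why the torch_invalid entry above is expected to fail (import path assumed from the metadata module; exact wording depends on the installed pydantic version): format is validated against the OnDiskFeatureDataFormat enum.

    import pydantic
    from dgl.graphbolt.impl.ondisk_metadata import OnDiskTVTSetData

    try:
        OnDiskTVTSetData(format="torch_invalid", path="set/paper-train.pt")
    except pydantic.ValidationError as err:
        print(err)  # "torch_invalid" is not a valid OnDiskFeatureDataFormat member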
@@ -54,22 +57,25 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
     """Test TVTSet which returns ItemSet with IDs and labels."""
     with tempfile.TemporaryDirectory() as test_dir:
         train_ids = np.arange(1000)
+        train_ids_path = os.path.join(test_dir, "train_ids.npy")
+        np.save(train_ids_path, train_ids)
         train_labels = np.random.randint(0, 10, size=1000)
-        train_data = np.vstack([train_ids, train_labels]).T
-        train_path = os.path.join(test_dir, "train.npy")
-        np.save(train_path, train_data)
+        train_labels_path = os.path.join(test_dir, "train_labels.npy")
+        np.save(train_labels_path, train_labels)

         validation_ids = np.arange(1000, 2000)
+        validation_ids_path = os.path.join(test_dir, "validation_ids.npy")
+        np.save(validation_ids_path, validation_ids)
         validation_labels = np.random.randint(0, 10, size=1000)
-        validation_data = np.vstack([validation_ids, validation_labels]).T
-        validation_path = os.path.join(test_dir, "validation.npy")
-        np.save(validation_path, validation_data)
+        validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
+        np.save(validation_labels_path, validation_labels)

         test_ids = np.arange(2000, 3000)
+        test_ids_path = os.path.join(test_dir, "test_ids.npy")
+        np.save(test_ids_path, test_ids)
         test_labels = np.random.randint(0, 10, size=1000)
-        test_data = np.vstack([test_ids, test_labels]).T
-        test_path = os.path.join(test_dir, "test.npy")
-        np.save(test_path, test_data)
+        test_labels_path = os.path.join(test_dir, "test_labels.npy")
+        np.save(test_labels_path, test_labels)

         # Case 1:
         #   all TVT sets are specified.
@@ -78,26 +84,30 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         yaml_content = f"""
         train_sets:
           - - type: null
-              format: numpy
-              in_memory: true
-              path: {train_path}
-          - - type: null
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_ids_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_labels_path}
         validation_sets:
-          - - format: numpy
-              path: {validation_path}
-          - - type: null
-              format: numpy
-              path: {validation_path}
+          - - data:
+                - format: numpy
+                  in_memory: true
+                  path: {validation_ids_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_labels_path}
         test_sets:
           - - type: null
-              format: numpy
-              in_memory: false
-              path: {test_path}
-          - - type: null
-              format: numpy
-              path: {test_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {test_ids_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_labels_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
         with open(yaml_file, "w") as f:
@@ -107,7 +117,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         # Verify train set.
         train_sets = dataset.train_sets
-        assert len(train_sets) == 2
+        assert len(train_sets) == 1
         for train_set in train_sets:
             assert len(train_set) == 1000
             assert isinstance(train_set, gb.ItemSet)
@@ -118,7 +128,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         # Verify validation set.
         validation_sets = dataset.validation_sets
-        assert len(validation_sets) == 2
+        assert len(validation_sets) == 1
         for validation_set in validation_sets:
             assert len(validation_set) == 1000
             assert isinstance(validation_set, gb.ItemSet)
@@ -129,7 +139,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         # Verify test set.
         test_sets = dataset.test_sets
-        assert len(test_sets) == 2
+        assert len(test_sets) == 1
         for test_set in test_sets:
             assert len(test_set) == 1000
             assert isinstance(test_set, gb.ItemSet)
@@ -143,8 +153,9 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         yaml_content = f"""
         train_sets:
           - - type: null
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  path: {train_ids_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
         with open(yaml_file, "w") as f:
@@ -160,47 +171,72 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
 def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
     """Test TVTSet which returns ItemSet with IDs and labels."""
     with tempfile.TemporaryDirectory() as test_dir:
-        train_pairs = (np.arange(1000), np.arange(1000, 2000))
+        train_src = np.arange(1000)
+        train_src_path = os.path.join(test_dir, "train_src.npy")
+        np.save(train_src_path, train_src)
+        train_dst = np.arange(1000, 2000)
+        train_dst_path = os.path.join(test_dir, "train_dst.npy")
+        np.save(train_dst_path, train_dst)
         train_labels = np.random.randint(0, 10, size=1000)
-        train_data = np.vstack([train_pairs, train_labels]).T
-        train_path = os.path.join(test_dir, "train.npy")
-        np.save(train_path, train_data)
+        train_labels_path = os.path.join(test_dir, "train_labels.npy")
+        np.save(train_labels_path, train_labels)

-        validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
+        validation_src = np.arange(1000, 2000)
+        validation_src_path = os.path.join(test_dir, "validation_src.npy")
+        np.save(validation_src_path, validation_src)
+        validation_dst = np.arange(2000, 3000)
+        validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
+        np.save(validation_dst_path, validation_dst)
         validation_labels = np.random.randint(0, 10, size=1000)
-        validation_data = np.vstack([validation_pairs, validation_labels]).T
-        validation_path = os.path.join(test_dir, "validation.npy")
-        np.save(validation_path, validation_data)
+        validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
+        np.save(validation_labels_path, validation_labels)

-        test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
+        test_src = np.arange(2000, 3000)
+        test_src_path = os.path.join(test_dir, "test_src.npy")
+        np.save(test_src_path, test_src)
+        test_dst = np.arange(3000, 4000)
+        test_dst_path = os.path.join(test_dir, "test_dst.npy")
+        np.save(test_dst_path, test_dst)
         test_labels = np.random.randint(0, 10, size=1000)
-        test_data = np.vstack([test_pairs, test_labels]).T
-        test_path = os.path.join(test_dir, "test.npy")
-        np.save(test_path, test_data)
+        test_labels_path = os.path.join(test_dir, "test_labels.npy")
+        np.save(test_labels_path, test_labels)

         yaml_content = f"""
         train_sets:
           - - type: null
-              format: numpy
-              in_memory: true
-              path: {train_path}
-          - - type: null
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_labels_path}
         validation_sets:
-          - - format: numpy
-              path: {validation_path}
-          - - type: null
-              format: numpy
-              path: {validation_path}
+          - - data:
+                - format: numpy
+                  in_memory: true
+                  path: {validation_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_labels_path}
         test_sets:
           - - type: null
-              format: numpy
-              in_memory: false
-              path: {test_path}
-          - - type: null
-              format: numpy
-              path: {test_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {test_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_labels_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
         with open(yaml_file, "w") as f:
@@ -210,42 +246,162 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
         # Verify train set.
         train_sets = dataset.train_sets
-        assert len(train_sets) == 2
+        assert len(train_sets) == 1
         for train_set in train_sets:
             assert len(train_set) == 1000
             assert isinstance(train_set, gb.ItemSet)
             for i, (src, dst, label) in enumerate(train_set):
-                assert src == train_pairs[0][i]
-                assert dst == train_pairs[1][i]
+                assert src == train_src[i]
+                assert dst == train_dst[i]
                 assert label == train_labels[i]
         train_sets = None

         # Verify validation set.
         validation_sets = dataset.validation_sets
-        assert len(validation_sets) == 2
+        assert len(validation_sets) == 1
         for validation_set in validation_sets:
             assert len(validation_set) == 1000
             assert isinstance(validation_set, gb.ItemSet)
             for i, (src, dst, label) in enumerate(validation_set):
-                assert src == validation_pairs[0][i]
-                assert dst == validation_pairs[1][i]
+                assert src == validation_src[i]
+                assert dst == validation_dst[i]
                 assert label == validation_labels[i]
         validation_sets = None

         # Verify test set.
         test_sets = dataset.test_sets
-        assert len(test_sets) == 2
+        assert len(test_sets) == 1
         for test_set in test_sets:
             assert len(test_set) == 1000
             assert isinstance(test_set, gb.ItemSet)
             for i, (src, dst, label) in enumerate(test_set):
-                assert src == test_pairs[0][i]
-                assert dst == test_pairs[1][i]
+                assert src == test_src[i]
+                assert dst == test_dst[i]
                 assert label == test_labels[i]
         test_sets = None
     dataset = None


+def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
+    """Test TVTSet which returns ItemSet with node pairs and negative ones."""
+    with tempfile.TemporaryDirectory() as test_dir:
+        train_src = np.arange(1000)
+        train_src_path = os.path.join(test_dir, "train_src.npy")
+        np.save(train_src_path, train_src)
+        train_dst = np.arange(1000, 2000)
+        train_dst_path = os.path.join(test_dir, "train_dst.npy")
+        np.save(train_dst_path, train_dst)
+        train_neg_dst = np.random.choice(1000 * 10, size=1000 * 10).reshape(1000, 10)
+        train_neg_dst_path = os.path.join(test_dir, "train_neg_dst.npy")
+        np.save(train_neg_dst_path, train_neg_dst)
+
+        validation_src = np.arange(1000, 2000)
+        validation_src_path = os.path.join(test_dir, "validation_src.npy")
+        np.save(validation_src_path, validation_src)
+        validation_dst = np.arange(2000, 3000)
+        validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
+        np.save(validation_dst_path, validation_dst)
+        validation_neg_dst = train_neg_dst + 1
+        validation_neg_dst_path = os.path.join(test_dir, "validation_neg_dst.npy")
+        np.save(validation_neg_dst_path, validation_neg_dst)
+
+        test_src = np.arange(2000, 3000)
+        test_src_path = os.path.join(test_dir, "test_src.npy")
+        np.save(test_src_path, test_src)
+        test_dst = np.arange(3000, 4000)
+        test_dst_path = os.path.join(test_dir, "test_dst.npy")
+        np.save(test_dst_path, test_dst)
+        test_neg_dst = train_neg_dst + 2
+        test_neg_dst_path = os.path.join(test_dir, "test_neg_dst.npy")
+        np.save(test_neg_dst_path, test_neg_dst)
+
+        yaml_content = f"""
+        train_sets:
+          - - type: null
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_neg_dst_path}
+        validation_sets:
+          - - data:
+                - format: numpy
+                  in_memory: true
+                  path: {validation_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_neg_dst_path}
+        test_sets:
+          - - type: null
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {test_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_neg_dst_path}
+        """
+        yaml_file = os.path.join(test_dir, "test.yaml")
+        with open(yaml_file, "w") as f:
+            f.write(yaml_content)
+
+        dataset = gb.OnDiskDataset(yaml_file)
+
+        # Verify train set.
+        train_sets = dataset.train_sets
+        assert len(train_sets) == 1
+        for train_set in train_sets:
+            assert len(train_set) == 1000
+            assert isinstance(train_set, gb.ItemSet)
+            for i, (src, dst, negs) in enumerate(train_set):
+                assert src == train_src[i]
+                assert dst == train_dst[i]
+                assert torch.equal(negs, torch.from_numpy(train_neg_dst[i]))
+        train_sets = None
+
+        # Verify validation set.
+        validation_sets = dataset.validation_sets
+        assert len(validation_sets) == 1
+        for validation_set in validation_sets:
+            assert len(validation_set) == 1000
+            assert isinstance(validation_set, gb.ItemSet)
+            for i, (src, dst, negs) in enumerate(validation_set):
+                assert src == validation_src[i]
+                assert dst == validation_dst[i]
+                assert torch.equal(negs, torch.from_numpy(validation_neg_dst[i]))
+        validation_sets = None
+
+        # Verify test set.
+        test_sets = dataset.test_sets
+        assert len(test_sets) == 1
+        for test_set in test_sets:
+            assert len(test_set) == 1000
+            assert isinstance(test_set, gb.ItemSet)
+            for i, (src, dst, negs) in enumerate(test_set):
+                assert src == test_src[i]
+                assert dst == test_dst[i]
+                assert torch.equal(negs, torch.from_numpy(test_neg_dst[i]))
+        test_sets = None
+    dataset = None
+
+
 def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
     """Test TVTSet which returns ItemSetDict with IDs and labels."""
     with tempfile.TemporaryDirectory() as test_dir:
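
The new test above keeps ten negative destinations per positive edge; a shrunken sketch of the layout it checks (sizes reduced from the test's 1000 x 10):

    import numpy as np
    import torch

    neg_dst = np.random.choice(50, size=5 * 3).reshape(5, 3)  # 5 edges, 3 negatives each
    row = torch.from_numpy(neg_dst[0])                        # per-item negatives tensor
    assert row.shape == (3,)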
@@ -270,26 +426,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
         yaml_content = f"""
         train_sets:
           - - type: paper
-              format: numpy
-              in_memory: true
-              path: {train_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_path}
           - - type: author
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  path: {train_path}
         validation_sets:
           - - type: paper
-              format: numpy
-              path: {validation_path}
+              data:
+                - format: numpy
+                  path: {validation_path}
           - - type: author
-              format: numpy
-              path: {validation_path}
+              data:
+                - format: numpy
+                  path: {validation_path}
         test_sets:
           - - type: paper
-              format: numpy
-              in_memory: false
-              path: {test_path}
+              data:
+                - format: numpy
+                  in_memory: false
+                  path: {test_path}
           - - type: author
-              format: numpy
-              path: {test_path}
+              data:
+                - format: numpy
+                  path: {test_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
@@ -372,26 +534,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
         yaml_content = f"""
         train_sets:
           - - type: paper
-              format: numpy
-              in_memory: true
-              path: {train_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_path}
           - - type: author
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  path: {train_path}
         validation_sets:
           - - type: paper
-              format: numpy
-              path: {validation_path}
+              data:
+                - format: numpy
+                  path: {validation_path}
           - - type: author
-              format: numpy
-              path: {validation_path}
+              data:
+                - format: numpy
+                  path: {validation_path}
         test_sets:
           - - type: paper
-              format: numpy
-              in_memory: false
-              path: {test_path}
+              data:
+                - format: numpy
+                  in_memory: false
+                  path: {test_path}
           - - type: author
-              format: numpy
-              path: {test_path}
+              data:
+                - format: numpy
+                  path: {test_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
@@ -829,16 +997,18 @@ def test_OnDiskDataset_preprocess_homogeneous():
           path: data/node-feat.npy
     train_sets:
       - - type_name: null
-          format: numpy
-          path: set/train.npy
+          # shape: (num_trains, 3), 3 for (src, dst, label).
+          data:
+            - format: numpy
+              path: set/train.npy
     validation_sets:
       - - type_name: null
-          format: numpy
-          path: set/validation.npy
+          data:
+            - format: numpy
+              path: set/validation.npy
     test_sets:
       - - type_name: null
-          format: numpy
-          path: set/test.npy
+          data:
+            - format: numpy
+              path: set/test.npy
     """
     yaml_file = os.path.join(test_dir, "test.yaml")