[GraphBolt] update names of ItemSet in OnDiskDataset testcases (#6289)

50b05723 · Rhett Ying · GitHub · 19d63943 · 50b05723 · 50b05723
Commit 50b05723 (unverified) · authored Sep 06, 2023 by Rhett Ying · committed by GitHub on Sep 06, 2023
2 changed files
--- a/python/dgl/graphbolt/impl/ondisk_dataset.py
+++ b/python/dgl/graphbolt/impl/ondisk_dataset.py
@@ -313,24 +313,36 @@ class OnDiskDataset(Dataset):
            train_set:
              - type: paper # could be null for homogeneous graph.
                data: # multiple data sources could be specified.
-                  - format: numpy
+                  - name: node_pairs
+                    format: numpy
                    in_memory: true # If not specified, default to true.
-                    path: set/paper-train-src.npy
-                  - format: numpy
+                    path: set/paper-train-node_pairs.npy
+                  - name: labels
+                    format: numpy
                    in_memory: false
-                    path: set/paper-train-dst.npy
+                    path: set/paper-train-labels.npy
            validation_set:
              - type: paper
                data:
-                  - format: numpy
+                  - name: node_pairs
+                    format: numpy
+                    in_memory: true
+                    path: set/paper-validation-node_pairs.npy
+                  - name: labels
+                    format: numpy
                    in_memory: true
-                    path: set/paper-validation.npy
+                    path: set/paper-validation-labels.npy
            test_set:
              - type: paper
                data:
-                  - format: numpy
+                  - name: node_pairs
+                    format: numpy
+                    in_memory: true
+                    path: set/paper-test-node_pairs.npy
+                  - name: labels
+                    format: numpy
                    in_memory: true
-                    path: set/paper-test.npy
+                    path: set/paper-test-labels.npy

    Parameters
    ----------

--- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
+++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
@@ -287,35 +287,28 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
        dataset = None


-def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
-    """Test TVTSet which returns ItemSet with IDs and labels."""
+def test_OnDiskDataset_TVTSet_ItemSet_node_pairs_labels():
+    """Test TVTSet which returns ItemSet with node pairs and labels."""
    with tempfile.TemporaryDirectory() as test_dir:
-        train_src = np.arange(1000)
-        train_src_path = os.path.join(test_dir, "train_src.npy")
-        np.save(train_src_path, train_src)
-        train_dst = np.arange(1000, 2000)
-        train_dst_path = os.path.join(test_dir, "train_dst.npy")
-        np.save(train_dst_path, train_dst)
+        train_node_pairs = np.arange(2000).reshape(1000, 2)
+        train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
+        np.save(train_node_pairs_path, train_node_pairs)
        train_labels = np.random.randint(0, 10, size=1000)
        train_labels_path = os.path.join(test_dir, "train_labels.npy")
        np.save(train_labels_path, train_labels)

-        validation_src = np.arange(1000, 2000)
-        validation_src_path = os.path.join(test_dir, "validation_src.npy")
-        np.save(validation_src_path, validation_src)
-        validation_dst = np.arange(2000, 3000)
-        validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
-        np.save(validation_dst_path, validation_dst)
+        validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
+        validation_node_pairs_path = os.path.join(
+            test_dir, "validation_node_pairs.npy"
+        )
+        np.save(validation_node_pairs_path, validation_node_pairs)
        validation_labels = np.random.randint(0, 10, size=1000)
        validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
        np.save(validation_labels_path, validation_labels)

-        test_src = np.arange(2000, 3000)
-        test_src_path = os.path.join(test_dir, "test_src.npy")
-        np.save(test_src_path, test_src)
-        test_dst = np.arange(3000, 4000)
-        test_dst_path = os.path.join(test_dir, "test_dst.npy")
-        np.save(test_dst_path, test_dst)
+        test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
+        test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
+        np.save(test_node_pairs_path, test_node_pairs)
        test_labels = np.random.randint(0, 10, size=1000)
        test_labels_path = os.path.join(test_dir, "test_labels.npy")
        np.save(test_labels_path, test_labels)
@@ -326,28 +319,20 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
                train_set:
                  - type: null
                    data:
-                      - name: src
+                      - name: node_pairs
                        format: numpy
                        in_memory: true
-                        path: {train_src_path}
-                      - name: dst
-                        format: numpy
-                        in_memory: true
-                        path: {train_dst_path}
+                        path: {train_node_pairs_path}
                      - name: labels
                        format: numpy
                        in_memory: true
                        path: {train_labels_path}
                validation_set:
                  - data:
-                      - name: src
-                        format: numpy
-                        in_memory: true
-                        path: {validation_src_path}
-                      - name: dst
+                      - name: node_pairs
                        format: numpy
                        in_memory: true
-                        path: {validation_dst_path}
+                        path: {validation_node_pairs_path}
                      - name: labels
                        format: numpy
                        in_memory: true
@@ -355,14 +340,10 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
                test_set:
                  - type: null
                    data:
-                      - name: src
-                        format: numpy
-                        in_memory: true
-                        path: {test_src_path}
-                      - name: dst
+                      - name: node_pairs
                        format: numpy
                        in_memory: true
-                        path: {test_dst_path}
+                        path: {test_node_pairs_path}
                      - name: labels
                        format: numpy
                        in_memory: true
@@ -379,70 +360,63 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
        train_set = dataset.tasks[0].train_set
        assert len(train_set) == 1000
        assert isinstance(train_set, gb.ItemSet)
-        for i, (src, dst, label) in enumerate(train_set):
-            assert src == train_src[i]
-            assert dst == train_dst[i]
+        for i, (node_pair, label) in enumerate(train_set):
+            assert node_pair[0] == train_node_pairs[i][0]
+            assert node_pair[1] == train_node_pairs[i][1]
            assert label == train_labels[i]
-        assert train_set.names == ("src", "dst", "labels")
+        assert train_set.names == ("node_pairs", "labels")
        train_set = None

        # Verify validation set.
        validation_set = dataset.tasks[0].validation_set
        assert len(validation_set) == 1000
        assert isinstance(validation_set, gb.ItemSet)
-        for i, (src, dst, label) in enumerate(validation_set):
-            assert src == validation_src[i]
-            assert dst == validation_dst[i]
+        for i, (node_pair, label) in enumerate(validation_set):
+            assert node_pair[0] == validation_node_pairs[i][0]
+            assert node_pair[1] == validation_node_pairs[i][1]
            assert label == validation_labels[i]
-        assert validation_set.names == ("src", "dst", "labels")
+        assert validation_set.names == ("node_pairs", "labels")
        validation_set = None

        # Verify test set.
        test_set = dataset.tasks[0].test_set
        assert len(test_set) == 1000
        assert isinstance(test_set, gb.ItemSet)
-        for i, (src, dst, label) in enumerate(test_set):
-            assert src == test_src[i]
-            assert dst == test_dst[i]
+        for i, (node_pair, label) in enumerate(test_set):
+            assert node_pair[0] == test_node_pairs[i][0]
+            assert node_pair[1] == test_node_pairs[i][1]
            assert label == test_labels[i]
-        assert test_set.names == ("src", "dst", "labels")
+        assert test_set.names == ("node_pairs", "labels")
        test_set = None
        dataset = None


-def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
+def test_OnDiskDataset_TVTSet_ItemSet_node_pairs_negs():
    """Test TVTSet which returns ItemSet with node pairs and negative ones."""
    with tempfile.TemporaryDirectory() as test_dir:
-        train_src = np.arange(1000)
-        train_src_path = os.path.join(test_dir, "train_src.npy")
-        np.save(train_src_path, train_src)
-        train_dst = np.arange(1000, 2000)
-        train_dst_path = os.path.join(test_dir, "train_dst.npy")
-        np.save(train_dst_path, train_dst)
+        train_node_pairs = np.arange(2000).reshape(1000, 2)
+        train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
+        np.save(train_node_pairs_path, train_node_pairs)
        train_neg_dst = np.random.choice(1000 * 10, size=1000 * 10).reshape(
            1000, 10
        )
        train_neg_dst_path = os.path.join(test_dir, "train_neg_dst.npy")
        np.save(train_neg_dst_path, train_neg_dst)

-        validation_src = np.arange(1000, 2000)
-        validation_src_path = os.path.join(test_dir, "validation_src.npy")
-        np.save(validation_src_path, validation_src)
-        validation_dst = np.arange(2000, 3000)
-        validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
-        np.save(validation_dst_path, validation_dst)
+        validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
+        validation_node_pairs_path = os.path.join(
+            test_dir, "validation_node_pairs.npy"
+        )
+        np.save(validation_node_pairs_path, validation_node_pairs)
        validation_neg_dst = train_neg_dst + 1
        validation_neg_dst_path = os.path.join(
            test_dir, "validation_neg_dst.npy"
        )
        np.save(validation_neg_dst_path, validation_neg_dst)

-        test_src = np.arange(2000, 3000)
-        test_src_path = os.path.join(test_dir, "test_src.npy")
-        np.save(test_src_path, test_src)
-        test_dst = np.arange(3000, 4000)
-        test_dst_path = os.path.join(test_dir, "test_dst.npy")
-        np.save(test_dst_path, test_dst)
+        test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
+        test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
+        np.save(test_node_pairs_path, test_node_pairs)
        test_neg_dst = train_neg_dst + 2
        test_neg_dst_path = os.path.join(test_dir, "test_neg_dst.npy")
        np.save(test_neg_dst_path, test_neg_dst)
@@ -453,44 +427,32 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
                train_set:
                  - type: null
                    data:
-                      - name: src
+                      - name: node_pairs
                        format: numpy
                        in_memory: true
-                        path: {train_src_path}
-                      - name: dst
-                        format: numpy
-                        in_memory: true
-                        path: {train_dst_path}
-                      - name: negative_dst
+                        path: {train_node_pairs_path}
+                      - name: negative_dsts
                        format: numpy
                        in_memory: true
                        path: {train_neg_dst_path}
                validation_set:
                  - data:
-                      - name: src
-                        format: numpy
-                        in_memory: true
-                        path: {validation_src_path}
-                      - name: dst
+                      - name: node_pairs
                        format: numpy
                        in_memory: true
-                        path: {validation_dst_path}
-                      - name: negative_dst
+                        path: {validation_node_pairs_path}
+                      - name: negative_dsts
                        format: numpy
                        in_memory: true
                        path: {validation_neg_dst_path}
                test_set:
                  - type: null
                    data:
-                      - name: src
-                        format: numpy
-                        in_memory: true
-                        path: {test_src_path}
-                      - name: dst
+                      - name: node_pairs
                        format: numpy
                        in_memory: true
-                        path: {test_dst_path}
-                      - name: negative_dst
+                        path: {test_node_pairs_path}
+                      - name: negative_dsts
                        format: numpy
                        in_memory: true
                        path: {test_neg_dst_path}
@@ -506,33 +468,33 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
        train_set = dataset.tasks[0].train_set
        assert len(train_set) == 1000
        assert isinstance(train_set, gb.ItemSet)
-        for i, (src, dst, negs) in enumerate(train_set):
-            assert src == train_src[i]
-            assert dst == train_dst[i]
+        for i, (node_pair, negs) in enumerate(train_set):
+            assert node_pair[0] == train_node_pairs[i][0]
+            assert node_pair[1] == train_node_pairs[i][1]
            assert torch.equal(negs, torch.from_numpy(train_neg_dst[i]))
-        assert train_set.names == ("src", "dst", "negative_dst")
+        assert train_set.names == ("node_pairs", "negative_dsts")
        train_set = None

        # Verify validation set.
        validation_set = dataset.tasks[0].validation_set
        assert len(validation_set) == 1000
        assert isinstance(validation_set, gb.ItemSet)
-        for i, (src, dst, negs) in enumerate(validation_set):
-            assert src == validation_src[i]
-            assert dst == validation_dst[i]
+        for i, (node_pair, negs) in enumerate(validation_set):
+            assert node_pair[0] == validation_node_pairs[i][0]
+            assert node_pair[1] == validation_node_pairs[i][1]
            assert torch.equal(negs, torch.from_numpy(validation_neg_dst[i]))
-        assert validation_set.names == ("src", "dst", "negative_dst")
+        assert validation_set.names == ("node_pairs", "negative_dsts")
        validation_set = None

        # Verify test set.
        test_set = dataset.tasks[0].test_set
        assert len(test_set) == 1000
        assert isinstance(test_set, gb.ItemSet)
-        for i, (src, dst, negs) in enumerate(test_set):
-            assert src == test_src[i]
-            assert dst == test_dst[i]
+        for i, (node_pair, negs) in enumerate(test_set):
+            assert node_pair[0] == test_node_pairs[i][0]
+            assert node_pair[1] == test_node_pairs[i][1]
            assert torch.equal(negs, torch.from_numpy(test_neg_dst[i]))
-        assert test_set.names == ("src", "dst", "negative_dst")
+        assert test_set.names == ("node_pairs", "negative_dsts")
        test_set = None
        dataset = None

@@ -651,65 +613,92 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
        dataset = None


-def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
+def test_OnDiskDataset_TVTSet_ItemSetDict_node_pairs_labels():
    """Test TVTSet which returns ItemSetDict with node pairs and labels."""
    with tempfile.TemporaryDirectory() as test_dir:
-        train_pairs = (np.arange(1000), np.arange(1000, 2000))
+        train_node_pairs = np.arange(2000).reshape(1000, 2)
+        train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
+        np.save(train_node_pairs_path, train_node_pairs)
        train_labels = np.random.randint(0, 10, size=1000)
-        train_data = np.vstack([train_pairs, train_labels]).T
-        train_path = os.path.join(test_dir, "train.npy")
-        np.save(train_path, train_data)
+        train_labels_path = os.path.join(test_dir, "train_labels.npy")
+        np.save(train_labels_path, train_labels)

-        validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
+        validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
+        validation_node_pairs_path = os.path.join(
+            test_dir, "validation_node_pairs.npy"
+        )
+        np.save(validation_node_pairs_path, validation_node_pairs)
        validation_labels = np.random.randint(0, 10, size=1000)
-        validation_data = np.vstack([validation_pairs, validation_labels]).T
-        validation_path = os.path.join(test_dir, "validation.npy")
-        np.save(validation_path, validation_data)
+        validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
+        np.save(validation_labels_path, validation_labels)

-        test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
+        test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
+        test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
+        np.save(test_node_pairs_path, test_node_pairs)
        test_labels = np.random.randint(0, 10, size=1000)
-        test_data = np.vstack([test_pairs, test_labels]).T
-        test_path = os.path.join(test_dir, "test.npy")
-        np.save(test_path, test_data)
+        test_labels_path = os.path.join(test_dir, "test_labels.npy")
+        np.save(test_labels_path, test_labels)

        yaml_content = f"""
            tasks:
              - name: edge_classification
                train_set:
-                  - type: paper
+                  - type: paper:cites:paper
                    data:
-                      - name: node_pair
+                      - name: node_pairs
                        format: numpy
                        in_memory: true
-                        path: {train_path}
-                  - type: author
+                        path: {train_node_pairs_path}
+                      - name: labels
+                        format: numpy
+                        in_memory: true
+                        path: {train_labels_path}
+                  - type: author:writes:paper
                    data:
-                      - name: node_pair
+                      - name: node_pairs
                        format: numpy
-                        path: {train_path}
+                        path: {train_node_pairs_path}
+                      - name: labels
+                        format: numpy
+                        path: {train_labels_path}
                validation_set:
-                  - type: paper
+                  - type: paper:cites:paper
                    data:
-                      - name: node_pair
+                      - name: node_pairs
                        format: numpy
-                        path: {validation_path}
-                  - type: author
+                        path: {validation_node_pairs_path}
+                      - name: labels
+                        format: numpy
+                        path: {validation_labels_path}
+                  - type: author:writes:paper
                    data:
-                      - name: node_pair
+                      - name: node_pairs
                        format: numpy
-                        path: {validation_path}
+                        path: {validation_node_pairs_path}
+                      - name: labels
+                        format: numpy
+                        path: {validation_labels_path}
                test_set:
-                  - type: paper
+                  - type: paper:cites:paper
                    data:
-                      - name: node_pair
+                      - name: node_pairs
                        format: numpy
-                        in_memory: false
-                        path: {test_path}
-                  - type: author
+                        in_memory: true
+                        path: {test_node_pairs_path}
+                      - name: labels
+                        format: numpy
+                        in_memory: true
+                        path: {test_labels_path}
+                  - type: author:writes:paper
                    data:
-                      - name: node_pair
+                      - name: node_pairs
                        format: numpy
-                        path: {test_path}
+                        in_memory: true
+                        path: {test_node_pairs_path}
+                      - name: labels
+                        format: numpy
+                        in_memory: true
+                        path: {test_labels_path}
        """
        os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
        yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
@@ -726,12 +715,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
            assert isinstance(item, dict)
            assert len(item) == 1
            key = list(item.keys())[0]
-            assert key in ["paper", "author"]
-            src, dst, label = item[key]
-            assert src == train_pairs[0][i % 1000]
-            assert dst == train_pairs[1][i % 1000]
+            assert key in ["paper:cites:paper", "author:writes:paper"]
+            node_pair, label = item[key]
+            assert node_pair[0] == train_node_pairs[i % 1000][0]
+            assert node_pair[1] == train_node_pairs[i % 1000][1]
            assert label == train_labels[i % 1000]
-        assert train_set.names == ("node_pair",)
+        assert train_set.names == ("node_pairs", "labels")
        train_set = None

        # Verify validation set.
@@ -742,12 +731,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
            assert isinstance(item, dict)
            assert len(item) == 1
            key = list(item.keys())[0]
-            assert key in ["paper", "author"]
-            src, dst, label = item[key]
-            assert src == validation_pairs[0][i % 1000]
-            assert dst == validation_pairs[1][i % 1000]
+            assert key in ["paper:cites:paper", "author:writes:paper"]
+            node_pair, label = item[key]
+            assert node_pair[0] == validation_node_pairs[i % 1000][0]
+            assert node_pair[1] == validation_node_pairs[i % 1000][1]
            assert label == validation_labels[i % 1000]
-        assert validation_set.names == ("node_pair",)
+        assert validation_set.names == ("node_pairs", "labels")
        validation_set = None

        # Verify test set.
@@ -758,12 +747,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
            assert isinstance(item, dict)
            assert len(item) == 1
            key = list(item.keys())[0]
-            assert key in ["paper", "author"]
-            src, dst, label = item[key]
-            assert src == test_pairs[0][i % 1000]
-            assert dst == test_pairs[1][i % 1000]
+            assert key in ["paper:cites:paper", "author:writes:paper"]
+            node_pair, label = item[key]
+            assert node_pair[0] == test_node_pairs[i % 1000][0]
+            assert node_pair[1] == test_node_pairs[i % 1000][1]
            assert label == test_labels[i % 1000]
-        assert test_set.names == ("node_pair",)
+        assert test_set.names == ("node_pairs", "labels")
        test_set = None
        dataset = None