Unverified Commit 85c030f6 authored by Andrei Ivanov's avatar Andrei Ivanov Committed by GitHub
Browse files

[GraphBolt] Combining `test_OnDiskDataset_preprocess_yaml_content` tests for...

[GraphBolt] Combining `test_OnDiskDataset_preprocess_yaml_content` tests for Windows and Unix. (#7148)
parent 5d2d1453
......@@ -1625,8 +1625,7 @@ def test_OnDiskDataset_preprocess_path():
_ = gb.OnDiskDataset(fake_dir)
@unittest.skipIf(os.name == "nt", "Skip on Windows")
def test_OnDiskDataset_preprocess_yaml_content_unix():
def test_OnDiskDataset_preprocess_yaml_content():
"""Test if the preprocessed metadata.yaml is correct."""
with tempfile.TemporaryDirectory() as test_dir:
# All metadata fields are specified.
......@@ -1640,196 +1639,45 @@ def test_OnDiskDataset_preprocess_yaml_content_unix():
neighbors = np.random.randint(0, num_nodes, size=(num_edges))
edges = np.stack([nodes, neighbors], axis=1)
# Write into edges/edge.csv
os.makedirs(os.path.join(test_dir, "edges/"), exist_ok=True)
edges = pd.DataFrame(edges, columns=["src", "dst"])
edges.to_csv(
os.path.join(test_dir, "edges/edge.csv"),
index=False,
header=False,
)
# Generate random graph edge-feats.
edge_feats = np.random.rand(num_edges, 5)
os.makedirs(os.path.join(test_dir, "data/"), exist_ok=True)
np.save(os.path.join(test_dir, "data/edge-feat.npy"), edge_feats)
# Generate random node-feats.
node_feats = np.random.rand(num_nodes, 10)
np.save(os.path.join(test_dir, "data/node-feat.npy"), node_feats)
# Generate train/test/valid set.
os.makedirs(os.path.join(test_dir, "set/"), exist_ok=True)
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "set/train.npy")
np.save(train_path, train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "set/validation.npy")
np.save(validation_path, validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "set/test.npy")
np.save(test_path, test_data)
yaml_content = f"""
dataset_name: {dataset_name}
graph: # graph structure and required attributes.
nodes:
- num: {num_nodes}
edges:
- format: csv
path: edges/edge.csv
feature_data:
- domain: edge
type: null
name: feat
format: numpy
path: data/edge-feat.npy
feature_data:
- domain: node
type: null
name: feat
format: numpy
in_memory: false
path: data/node-feat.npy
tasks:
- name: node_classification
num_classes: {num_classes}
train_set:
- type: null
data:
- format: numpy
path: set/train.npy
validation_set:
- type: null
data:
- format: numpy
path: set/validation.npy
test_set:
- type: null
data:
- format: numpy
path: set/test.npy
"""
yaml_file = os.path.join(test_dir, "metadata.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
preprocessed_metadata_path = gb.preprocess_ondisk_dataset(test_dir)
with open(preprocessed_metadata_path, "r") as f:
yaml_data = yaml.safe_load(f)
target_yaml_content = f"""
dataset_name: {dataset_name}
graph_topology:
type: FusedCSCSamplingGraph
path: preprocessed/fused_csc_sampling_graph.pt
feature_data:
- domain: node
type: null
name: feat
format: numpy
in_memory: false
path: preprocessed/data/node-feat.npy
tasks:
- name: node_classification
num_classes: {num_classes}
train_set:
- type: null
data:
- format: numpy
path: preprocessed/set/train.npy
validation_set:
- type: null
data:
- format: numpy
path: preprocessed/set/validation.npy
test_set:
- type: null
data:
- format: numpy
path: preprocessed/set/test.npy
include_original_edge_id: False
"""
target_yaml_data = yaml.safe_load(target_yaml_content)
# Check yaml content.
assert (
yaml_data == target_yaml_data
), "The preprocessed metadata.yaml is not correct."
# Check file existence.
assert os.path.exists(
os.path.join(test_dir, yaml_data["graph_topology"]["path"])
)
assert os.path.exists(
os.path.join(test_dir, yaml_data["feature_data"][0]["path"])
)
for set_name in ["train_set", "validation_set", "test_set"]:
assert os.path.exists(
os.path.join(
test_dir,
yaml_data["tasks"][0][set_name][0]["data"][0]["path"],
)
)
@unittest.skipIf(os.name != "nt", "Skip on Unix")
def test_OnDiskDataset_preprocess_yaml_content_windows():
"""Test if the preprocessed metadata.yaml is correct."""
with tempfile.TemporaryDirectory() as test_dir:
# All metadata fields are specified.
dataset_name = "graphbolt_test"
num_nodes = 4000
num_edges = 20000
num_classes = 10
# Generate random edges.
nodes = np.repeat(np.arange(num_nodes), 5)
neighbors = np.random.randint(0, num_nodes, size=(num_edges))
edges = np.stack([nodes, neighbors], axis=1)
# Write into edges/edge.csv
os.makedirs(os.path.join(test_dir, "edges\\"), exist_ok=True)
os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
edges = pd.DataFrame(edges, columns=["src", "dst"])
edge_path = os.path.join("edges", "edge.csv")
edges.to_csv(
os.path.join(test_dir, "edges\\edge.csv"),
os.path.join(test_dir, edge_path),
index=False,
header=False,
)
# Generate random graph edge-feats.
edge_feats = np.random.rand(num_edges, 5)
os.makedirs(os.path.join(test_dir, "data\\"), exist_ok=True)
np.save(os.path.join(test_dir, "data\\edge-feat.npy"), edge_feats)
os.makedirs(os.path.join(test_dir, "data"), exist_ok=True)
feature_edge = os.path.join("data", "edge-feat.npy")
np.save(os.path.join(test_dir, feature_edge), edge_feats)
# Generate random node-feats.
node_feats = np.random.rand(num_nodes, 10)
np.save(os.path.join(test_dir, "data\\node-feat.npy"), node_feats)
feature_node = os.path.join("data", "node-feat.npy")
np.save(os.path.join(test_dir, feature_node), node_feats)
# Generate train/test/valid set.
os.makedirs(os.path.join(test_dir, "set\\"), exist_ok=True)
os.makedirs(os.path.join(test_dir, "set"), exist_ok=True)
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "set\\train.npy")
np.save(train_path, train_data)
train_path = os.path.join("set", "train.npy")
np.save(os.path.join(test_dir, train_path), train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "set\\validation.npy")
np.save(validation_path, validation_data)
validation_path = os.path.join("set", "validation.npy")
np.save(os.path.join(test_dir, validation_path), validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "set\\test.npy")
np.save(test_path, test_data)
test_path = os.path.join("set", "test.npy")
np.save(os.path.join(test_dir, test_path), test_data)
yaml_content = f"""
dataset_name: {dataset_name}
......@@ -1838,21 +1686,21 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
- num: {num_nodes}
edges:
- format: csv
path: edges\\edge.csv
path: {edge_path}
feature_data:
- domain: edge
type: null
name: feat
format: numpy
in_memory: true
path: data\\edge-feat.npy
path: {feature_edge}
feature_data:
- domain: node
type: null
name: feat
format: numpy
in_memory: false
path: data\\node-feat.npy
path: {feature_node}
tasks:
- name: node_classification
num_classes: {num_classes}
......@@ -1860,17 +1708,17 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
- type: null
data:
- format: numpy
path: set\\train.npy
path: {train_path}
validation_set:
- type: null
data:
- format: numpy
path: set\\validation.npy
path: {validation_path}
test_set:
- type: null
data:
- format: numpy
path: set\\test.npy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "metadata.yaml")
with open(yaml_file, "w") as f:
......@@ -1880,18 +1728,19 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
with open(preprocessed_metadata_path, "r") as f:
yaml_data = yaml.safe_load(f)
topo_path = os.path.join("preprocessed", "fused_csc_sampling_graph.pt")
target_yaml_content = f"""
dataset_name: {dataset_name}
graph_topology:
type: FusedCSCSamplingGraph
path: preprocessed\\fused_csc_sampling_graph.pt
path: {topo_path}
feature_data:
- domain: node
type: null
name: feat
format: numpy
in_memory: false
path: preprocessed\\data\\node-feat.npy
path: {os.path.join("preprocessed", feature_node)}
tasks:
- name: node_classification
num_classes: {num_classes}
......@@ -1899,17 +1748,17 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
- type: null
data:
- format: numpy
path: preprocessed\\set\\train.npy
path: {os.path.join("preprocessed", train_path)}
validation_set:
- type: null
data:
- format: numpy
path: preprocessed\\set\\validation.npy
path: {os.path.join("preprocessed", validation_path)}
test_set:
- type: null
data:
- format: numpy
path: preprocessed\\set\\test.npy
path: {os.path.join("preprocessed", test_path)}
include_original_edge_id: False
"""
target_yaml_data = yaml.safe_load(target_yaml_content)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment