[GraphBolt] Support numpy for edges when constructing graph in `preprocess_ondisk_dataset`. (#6916)

Co-authored-by: Ubuntu <ubuntu@ip-172-31-0-133.us-west-2.compute.internal>

[GraphBolt] Support numpy for edges when constructing graph in `preprocess_ondisk_dataset`. (#6916)
Co-authored-by: Ubuntu <ubuntu@ip-172-31-0-133.us-west-2.compute.internal>
a697e791 · yxy235 · GitHub · b3224ce8 · a697e791 · a697e791
Unverified Commit a697e791 authored Jan 09, 2024 by yxy235 Committed by GitHub Jan 09, 2024
5 changed files
--- a/python/dgl/graphbolt/impl/ondisk_dataset.py
+++ b/python/dgl/graphbolt/impl/ondisk_dataset.py
@@ -4,7 +4,6 @@ import os
 from copy import deepcopy
 from typing import Dict, List, Union

-import pandas as pd
 import torch
 import yaml

@@ -14,7 +13,12 @@ from ...base import dgl_warning
 from ...data.utils import download, extract_archive
 from ..base import etype_str_to_tuple
 from ..dataset import Dataset, Task
-from ..internal import copy_or_convert_data, get_attributes, read_data
+from ..internal import (
+    copy_or_convert_data,
+    get_attributes,
+    read_data,
+    read_edges,
+)
 from ..itemset import ItemSet, ItemSetDict
 from ..sampling_graph import SamplingGraph
 from .fused_csc_sampling_graph import from_dglgraph, FusedCSCSamplingGraph
@@ -86,14 +90,9 @@ def preprocess_ondisk_dataset(
    if is_homogeneous:
        # Homogeneous graph.
        num_nodes = input_config["graph"]["nodes"][0]["num"]
-        edge_data = pd.read_csv(
-            os.path.join(
-                dataset_dir, input_config["graph"]["edges"][0]["path"]
-            ),
-            names=["src", "dst"],
-        )
-        src, dst = edge_data["src"].to_numpy(), edge_data["dst"].to_numpy()
-
+        edge_fmt = input_config["graph"]["edges"][0]["format"]
+        edge_path = input_config["graph"]["edges"][0]["path"]
+        src, dst = read_edges(dataset_dir, edge_fmt, edge_path)
        g = dgl.graph((src, dst), num_nodes=num_nodes)
    else:
        # Heterogeneous graph.
@@ -104,12 +103,9 @@ def preprocess_ondisk_dataset(
        # Construct the data dict.
        data_dict = {}
        for edge_info in input_config["graph"]["edges"]:
-            edge_data = pd.read_csv(
-                os.path.join(dataset_dir, edge_info["path"]),
-                names=["src", "dst"],
-            )
-            src = torch.tensor(edge_data["src"])
-            dst = torch.tensor(edge_data["dst"])
+            edge_fmt = edge_info["format"]
+            edge_path = edge_info["path"]
+            src, dst = read_edges(dataset_dir, edge_fmt, edge_path)
            data_dict[etype_str_to_tuple(edge_info["type"])] = (src, dst)
        # Construct the heterograph.
        g = dgl.heterograph(data_dict, num_nodes_dict)

--- a/python/dgl/graphbolt/internal/utils.py
+++ b/python/dgl/graphbolt/internal/utils.py
@@ -4,6 +4,7 @@ import os
 import shutil

 import numpy as np
+import pandas as pd
 import torch
 from numpy.lib.format import read_array_header_1_0, read_array_header_2_0

@@ -120,3 +121,27 @@ def get_attributes(_obj) -> list:
        and not callable(getattr(_obj, attribute))
    ]
    return attributes
+
+
+def read_edges(dataset_dir, edge_fmt, edge_path):
+    """Read edge data from a numpy or csv file.
+
+    Parameters
+    ----------
+    dataset_dir : str
+        Directory that `edge_path` is joined onto.
+    edge_fmt : str
+        Format of the edge file, either "numpy" or "csv".
+    edge_path : str
+        Path of the edge file, relative to `dataset_dir`.
+
+    Returns
+    -------
+    tuple
+        A `(src, dst)` pair of numpy arrays.
+
+    Raises
+    ------
+    AssertionError
+        If `edge_fmt` is neither "numpy" nor "csv", or if the data loaded
+        from a numpy file does not have shape (2, N).
+    """
+    assert edge_fmt in [
+        "numpy",
+        "csv",
+    ], f"`numpy` or `csv` is expected when reading edges but got `{edge_fmt}`."
+    full_path = os.path.join(dataset_dir, edge_path)
+    if edge_fmt == "numpy":
+        edge_data = read_data(full_path, edge_fmt)
+        # Check the rank before indexing into the shape, so that a 0-d array
+        # fails with the intended message instead of an IndexError.
+        assert (
+            len(edge_data.shape) == 2 and edge_data.shape[0] == 2
+        ), f"The shape of edges should be (2, N), but got {edge_data.shape}."
+        # Row 0 holds the source nodes, row 1 the destination nodes.
+        src, dst = edge_data.numpy()
+    else:
+        # Column names are supplied explicitly, so the first row of the file
+        # is read as data rather than as a header.
+        edge_data = pd.read_csv(full_path, names=["src", "dst"])
+        src, dst = edge_data["src"].to_numpy(), edge_data["dst"].to_numpy()
+    return (src, dst)
--- a/tests/python/pytorch/graphbolt/gb_test_utils.py
+++ b/tests/python/pytorch/graphbolt/gb_test_utils.py
@@ -88,15 +88,19 @@ def random_hetero_graph(num_nodes, num_edges, num_ntypes, num_etypes):


 def random_homo_graphbolt_graph(
-    test_dir, dataset_name, num_nodes, num_edges, num_classes
+    test_dir, dataset_name, num_nodes, num_edges, num_classes, edge_fmt="csv"
 ):
    """Generate random graphbolt version homograph"""
    # Generate random edges.
    nodes = np.repeat(np.arange(num_nodes), 5)
    neighbors = np.random.randint(0, num_nodes, size=(num_edges))
    edges = np.stack([nodes, neighbors], axis=1)
-    # Wrtie into edges/edge.csv
    os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
+    assert edge_fmt in ["numpy", "csv"], print(
+        "only numpy and csv are supported for edges."
+    )
+    if edge_fmt == "csv":
+        # Write into edges/edge.csv
        edges = pd.DataFrame(edges, columns=["src", "dst"])
        edge_path = os.path.join("edges", "edge.csv")
        edges.to_csv(
@@ -104,6 +108,11 @@ def random_homo_graphbolt_graph(
            index=False,
            header=False,
        )
+    else:
+        # Write into edges/edge.npy
+        edges = edges.T
+        edge_path = os.path.join("edges", "edge.npy")
+        np.save(os.path.join(test_dir, edge_path), edges)

    # Generate random graph edge-feats.
    edge_feats = np.random.rand(num_edges, num_classes)
@@ -153,7 +162,7 @@ def random_homo_graphbolt_graph(
            nodes:
                - num: {num_nodes}
            edges:
-                - format: csv
+                - format: {edge_fmt}
                  path: {edge_path}
            feature_data:
                - domain: edge
@@ -203,7 +212,7 @@ def random_homo_graphbolt_graph(


 def genereate_raw_data_for_hetero_dataset(
-    test_dir, dataset_name, num_nodes, num_edges, num_classes
+    test_dir, dataset_name, num_nodes, num_edges, num_classes, edge_fmt="csv"
 ):
    # Generate edges.
    edges_path = {}
@@ -211,8 +220,12 @@ def genereate_raw_data_for_hetero_dataset(
        src_ntype, etype_str, dst_ntype = etype
        src = torch.randint(0, num_nodes[src_ntype], (num_edge,))
        dst = torch.randint(0, num_nodes[dst_ntype], (num_edge,))
-        # Write into edges/edge.csv
        os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
+        assert edge_fmt in ["numpy", "csv"], print(
+            "only numpy and csv are supported for edges."
+        )
+        if edge_fmt == "csv":
+            # Write into edges/edge.csv
            edges = pd.DataFrame(
                np.stack([src, dst], axis=1), columns=["src", "dst"]
            )
@@ -222,6 +235,10 @@ def genereate_raw_data_for_hetero_dataset(
                index=False,
                header=False,
            )
+        else:
+            edges = np.stack([src, dst], axis=1).T
+            edge_path = os.path.join("edges", f"{etype_str}.npy")
+            np.save(os.path.join(test_dir, edge_path), edges)
        edges_path[etype_str] = edge_path

    # Generate node features.
@@ -263,10 +280,10 @@ def genereate_raw_data_for_hetero_dataset(
              num: {num_nodes["item"]}
          edges:
            - type: "user:follow:user"
-              format: csv
+              format: {edge_fmt}
              path: {edges_path["follow"]}
            - type: "user:click:item"
-              format: csv
+              format: {edge_fmt}
              path: {edges_path["click"]}
        feature_data:
          - domain: node

--- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
+++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
@@ -1095,7 +1095,8 @@ def test_OnDiskDataset_Metadata():
        assert dataset.dataset_name == dataset_name


-def test_OnDiskDataset_preprocess_homogeneous():
+@pytest.mark.parametrize("edge_fmt", ["csv", "numpy"])
+def test_OnDiskDataset_preprocess_homogeneous(edge_fmt):
    """Test preprocess of OnDiskDataset."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
@@ -1111,6 +1112,7 @@ def test_OnDiskDataset_preprocess_homogeneous():
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -1160,6 +1162,7 @@ def test_OnDiskDataset_preprocess_homogeneous():
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -1527,7 +1530,8 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
            )


-def test_OnDiskDataset_load_name():
+@pytest.mark.parametrize("edge_fmt", ["csv", "numpy"])
+def test_OnDiskDataset_load_name(edge_fmt):
    """Test preprocess of OnDiskDataset."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
@@ -1543,6 +1547,7 @@ def test_OnDiskDataset_load_name():
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -1556,7 +1561,8 @@ def test_OnDiskDataset_load_name():
        dataset = None


-def test_OnDiskDataset_load_feature():
+@pytest.mark.parametrize("edge_fmt", ["csv", "numpy"])
+def test_OnDiskDataset_load_feature(edge_fmt):
    """Test preprocess of OnDiskDataset."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
@@ -1572,6 +1578,7 @@ def test_OnDiskDataset_load_feature():
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -1640,7 +1647,8 @@ def test_OnDiskDataset_load_feature():
        dataset = None


-def test_OnDiskDataset_load_graph():
+@pytest.mark.parametrize("edge_fmt", ["csv", "numpy"])
+def test_OnDiskDataset_load_graph(edge_fmt):
    """Test preprocess of OnDiskDataset."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
@@ -1656,6 +1664,7 @@ def test_OnDiskDataset_load_graph():
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -1723,6 +1732,7 @@ def test_OnDiskDataset_load_graph():
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -1739,7 +1749,8 @@ def test_OnDiskDataset_load_graph():
        dataset = None


-def test_OnDiskDataset_load_tasks():
+@pytest.mark.parametrize("edge_fmt", ["csv", "numpy"])
+def test_OnDiskDataset_load_tasks(edge_fmt):
    """Test preprocess of OnDiskDataset."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
@@ -1755,6 +1766,7 @@ def test_OnDiskDataset_load_tasks():
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -2028,7 +2040,8 @@ def test_BuiltinDataset():


 @pytest.mark.parametrize("include_original_edge_id", [True, False])
-def test_OnDiskDataset_homogeneous(include_original_edge_id):
+@pytest.mark.parametrize("edge_fmt", ["csv", "numpy"])
+def test_OnDiskDataset_homogeneous(include_original_edge_id, edge_fmt):
    """Preprocess and instantiate OnDiskDataset for homogeneous graph."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
@@ -2044,6 +2057,7 @@ def test_OnDiskDataset_homogeneous(include_original_edge_id):
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -2095,7 +2109,8 @@ def test_OnDiskDataset_homogeneous(include_original_edge_id):


 @pytest.mark.parametrize("include_original_edge_id", [True, False])
-def test_OnDiskDataset_heterogeneous(include_original_edge_id):
+@pytest.mark.parametrize("edge_fmt", ["csv", "numpy"])
+def test_OnDiskDataset_heterogeneous(include_original_edge_id, edge_fmt):
    """Preprocess and instantiate OnDiskDataset for heterogeneous graph."""
    with tempfile.TemporaryDirectory() as test_dir:
        dataset_name = "OnDiskDataset_hetero"
@@ -2114,6 +2129,7 @@ def test_OnDiskDataset_heterogeneous(include_original_edge_id):
            num_nodes,
            num_edges,
            num_classes,
+            edge_fmt=edge_fmt,
        )

        dataset = gb.OnDiskDataset(

--- a/tests/python/pytorch/graphbolt/utils/test_internal.py
+++ b/tests/python/pytorch/graphbolt/utils/test_internal.py
 import os
+import re
 import tempfile

 import dgl.graphbolt.internal as internal
 import numpy as np
+import pandas as pd
 import pytest
 import torch

@@ -141,3 +143,60 @@ def test_copy_or_convert_data(data_fmt, save_fmt, is_feature):
        data = None
        tensor_data = None
        out_data = None
+
+
+@pytest.mark.parametrize("edge_fmt", ["csv", "numpy"])
+def test_read_edges(edge_fmt):
+    """Check that `read_edges` returns the edges written as csv or numpy."""
+    with tempfile.TemporaryDirectory() as test_dir:
+        num_nodes = 40
+        num_edges = 200
+        # Every node is the source of 5 edges, giving `num_edges` in total.
+        nodes = np.repeat(np.arange(num_nodes), 5)
+        neighbors = np.random.randint(0, num_nodes, size=(num_edges))
+        edges = np.stack([nodes, neighbors], axis=1)
+        os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
+        if edge_fmt == "csv":
+            # Write into edges/edge.csv
+            edges = pd.DataFrame(edges, columns=["src", "dst"])
+            edge_path = os.path.join("edges", "edge.csv")
+            edges.to_csv(
+                os.path.join(test_dir, edge_path),
+                index=False,
+                header=False,
+            )
+        else:
+            # Write into edges/edge.npy; the numpy layout is (2, N).
+            edges = edges.T
+            edge_path = os.path.join("edges", "edge.npy")
+            np.save(os.path.join(test_dir, edge_path), edges)
+        src, dst = internal.read_edges(test_dir, edge_fmt, edge_path)
+        # Compare element-wise: `a.all() == b.all()` would only compare two
+        # booleans and pass for almost any pair of arrays.
+        assert np.array_equal(src, nodes)
+        assert np.array_equal(dst, neighbors)
+
+
+def test_read_edges_error():
+    """Check the assertion messages raised by `read_edges` on bad input."""
+    # 1. Unsupported file format.
+    # The message is escaped so it is matched literally, not as a regex.
+    with pytest.raises(
+        AssertionError,
+        match=re.escape(
+            "`numpy` or `csv` is expected when reading edges but got `fake-type`."
+        ),
+    ):
+        internal.read_edges("test_dir", "fake-type", "edge_path")
+
+    # 2. Unexpected shape of numpy array
+    with tempfile.TemporaryDirectory() as test_dir:
+        num_nodes = 40
+        num_edges = 200
+        nodes = np.repeat(np.arange(num_nodes), 5)
+        neighbors = np.random.randint(0, num_nodes, size=(num_edges))
+        # Three columns instead of two, so the saved array is (3, N).
+        edges = np.stack([nodes, neighbors, nodes], axis=1)
+        os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
+        # Write into edges/edge.npy
+        edges = edges.T
+        edge_path = os.path.join("edges", "edge.npy")
+        np.save(os.path.join(test_dir, edge_path), edges)
+        with pytest.raises(
+            AssertionError,
+            match=re.escape(
+                "The shape of edges should be (2, N), but got torch.Size([3, 200])."
+            ),
+        ):
+            internal.read_edges(test_dir, "numpy", edge_path)