Unverified Commit 523bbb4c authored by Rhett Ying's avatar Rhett Ying Committed by GitHub
Browse files

[GraphBolt] fix preprocess issue for single ntype/etype graph (#7011)

parent 0bfe34d9
...@@ -118,7 +118,18 @@ def preprocess_ondisk_dataset( ...@@ -118,7 +118,18 @@ def preprocess_ondisk_dataset(
# 2. Load the edge data and create a DGLGraph. # 2. Load the edge data and create a DGLGraph.
if "graph" not in input_config: if "graph" not in input_config:
raise RuntimeError("Invalid config: does not contain graph field.") raise RuntimeError("Invalid config: does not contain graph field.")
is_homogeneous = "type" not in input_config["graph"]["nodes"][0] # For any graph that node/edge types are specified, we construct DGLGraph
# with `dgl.heterograph()` even there's only one node/edge type. This is
# because we want to save the node/edge types in the graph. So the logic of
# checking whether the graph is homogeneous is different from the logic in
# `DGLGraph.is_homogeneous()`. Otherwise, we construct DGLGraph with
# `dgl.graph()`.
is_homogeneous = (
len(input_config["graph"]["nodes"]) == 1
and len(input_config["graph"]["edges"]) == 1
and "type" not in input_config["graph"]["nodes"][0]
and "type" not in input_config["graph"]["edges"][0]
)
if is_homogeneous: if is_homogeneous:
# Homogeneous graph. # Homogeneous graph.
num_nodes = input_config["graph"]["nodes"][0]["num"] num_nodes = input_config["graph"]["nodes"][0]["num"]
...@@ -178,20 +189,24 @@ def preprocess_ondisk_dataset( ...@@ -178,20 +189,24 @@ def preprocess_ondisk_dataset(
if not is_homogeneous: if not is_homogeneous:
# For heterogenous graph, a node/edge feature must cover all # For heterogenous graph, a node/edge feature must cover all
# node/edge types. # node/edge types.
for feat_name, feat_data in g.ndata.items(): ntypes = g.ntypes
existing_types = set(feat_data.keys()) assert all(
assert existing_types == set(g.ntypes), ( set(g.nodes[ntypes[0]].data.keys())
f"Node feature {feat_name} does not cover all node types." == set(g.nodes[ntype].data.keys())
+ f"Existing types: {existing_types}." for ntype in ntypes
+ f"Expected types: {g.ntypes}." ), (
) "Node feature does not cover all node types: "
for feat_name, feat_data in g.edata.items(): + f"{set(g.nodes[ntype].data.keys() for ntype in ntypes)}."
existing_types = set(feat_data.keys()) )
assert existing_types == set(g.canonical_etypes), ( etypes = g.canonical_etypes
f"Edge feature {feat_name} does not cover all edge types." assert all(
+ f"Existing types: {existing_types}." set(g.edges[etypes[0]].data.keys())
+ f"Expected types: {g.etypes}." == set(g.edges[etype].data.keys())
) for etype in etypes
), (
"Edge feature does not cover all edge types: "
+ f"{set(g.edges[etype].data.keys() for etype in etypes)}."
)
# 4. Convert the DGLGraph to a FusedCSCSamplingGraph. # 4. Convert the DGLGraph to a FusedCSCSamplingGraph.
fused_csc_sampling_graph = from_dglgraph( fused_csc_sampling_graph = from_dglgraph(
......
...@@ -2742,3 +2742,83 @@ def test_OnDiskDataset_load_tasks_selectively(): ...@@ -2742,3 +2742,83 @@ def test_OnDiskDataset_load_tasks_selectively():
dataset = gb.OnDiskDataset(test_dir).load(tasks=2) dataset = gb.OnDiskDataset(test_dir).load(tasks=2)
dataset = None dataset = None
def test_OnDiskDataset_preprocess_graph_with_single_type():
    """Test for graph with single node/edge type."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
        dataset_name = "graphbolt_test"
        num_nodes = 4000
        num_edges = 20000

        # Generate random edges: 5 outgoing edges per node, with
        # destinations drawn uniformly at random.
        src = np.repeat(np.arange(num_nodes), 5)
        dst = np.random.randint(0, num_nodes, size=(num_edges))
        # Write into edges/edge.csv
        os.makedirs(os.path.join(test_dir, "edges/"), exist_ok=True)
        edge_frame = pd.DataFrame(
            np.stack([src, dst], axis=1), columns=["src", "dst"]
        )
        edge_frame.to_csv(
            os.path.join(test_dir, "edges/edge.csv"),
            index=False,
            header=False,
        )

        # Generate random edge/node features and save them as .npy files.
        os.makedirs(os.path.join(test_dir, "data/"), exist_ok=True)
        np.save(
            os.path.join(test_dir, "data/edge-feat.npy"),
            np.random.rand(num_edges, 5),
        )
        np.save(
            os.path.join(test_dir, "data/node-feat.npy"),
            np.random.rand(num_nodes, 10),
        )

        yaml_content = f"""
            dataset_name: {dataset_name}
            graph: # graph structure and required attributes.
              nodes:
                - num: {num_nodes}
                  type: author
              edges:
                - type: author:collab:author
                  format: csv
                  path: edges/edge.csv
            feature_data:
              - domain: edge
                type: author:collab:author
                name: feat
                format: numpy
                path: data/edge-feat.npy
              - domain: node
                type: author
                name: feat
                format: numpy
                path: data/node-feat.npy
        """
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)

        dataset = gb.OnDiskDataset(test_dir).load()
        assert dataset.dataset_name == dataset_name

        graph = dataset.graph
        assert isinstance(graph, gb.FusedCSCSamplingGraph)
        assert graph.total_num_nodes == num_nodes
        assert graph.total_num_edges == num_edges
        # Features declared in the yaml must survive preprocessing.
        assert graph.node_attributes is not None
        assert "feat" in graph.node_attributes
        assert graph.edge_attributes is not None
        assert "feat" in graph.edge_attributes
        # Single-type graphs should still carry explicit type metadata
        # (this is the behavior the preprocess fix is meant to guarantee).
        assert torch.equal(
            graph.node_type_offset, torch.tensor([0, num_nodes])
        )
        assert torch.equal(graph.type_per_edge, torch.zeros(num_edges))
        assert graph.node_type_to_id == {"author": 0}
        assert graph.edge_type_to_id == {"author:collab:author": 0}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment