Unverified Commit 8b37564b authored by yxy235, committed by GitHub

[GraphBolt] Modify preprocess to support ondisk datasets with 1D features. (#6433)


Co-authored-by: Ubuntu <ubuntu@ip-172-31-0-133.us-west-2.compute.internal>
Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>
parent 4c883d89
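In effect, preprocessing now stores 1D feature arrays as column vectors of shape (N, 1), so feature reads on the preprocessed dataset always return 2D data. A minimal sketch of the intended shape change, using plain numpy with illustrative file names (not taken from this commit):

import numpy as np

# A node-feature file stored as a 1D array of shape (num_nodes,).
np.save("node-feat.npy", np.random.rand(4000))

# During preprocessing, a 1D array is reshaped to (num_nodes, 1) before it is
# written to the preprocessed dataset directory.
reshaped = np.load("node-feat.npy").reshape(-1, 1)
assert reshaped.shape == (4000, 1)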
@@ -16,7 +16,7 @@ from ..base import etype_str_to_tuple
 from ..dataset import Dataset, Task
 from ..itemset import ItemSet, ItemSetDict
 from ..sampling_graph import SamplingGraph
-from ..utils import read_data, save_data
+from ..utils import get_npy_dim, read_data, save_data
 from .csc_sampling_graph import (
     CSCSamplingGraph,
     from_dglgraph,
@@ -43,12 +43,20 @@ def _copy_or_convert_data(
 ):
     """Copy or convert the data from input_path to output_path."""
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    # If the original format is numpy, just copy the file.
     if input_format == "numpy":
-        # If the original format is numpy, just copy the file.
-        shutil.copyfile(input_path, output_path)
+        # If the data is 1D, reshape it to (n, 1) and save it to output_path.
+        if get_npy_dim(input_path) == 1:
+            data = read_data(input_path, input_format, in_memory)
+            data = data.reshape(-1, 1)
+            save_data(data, output_path, output_format)
+        else:
+            shutil.copyfile(input_path, output_path)
     else:
         # If the original format is not numpy, convert it to numpy.
         data = read_data(input_path, input_format, in_memory)
+        if data.dim() == 1:
+            data = data.reshape(-1, 1)
         save_data(data, output_path, output_format)
...
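The dimensionality check differs between the two branches above: for numpy input the new get_npy_dim() helper inspects the .npy header, so an already-2D file can be copied without being loaded, while the torch branch already has the tensor in memory and checks data.dim(). A rough sketch of the two checks, with purely illustrative data:

import numpy as np
import torch

# Hypothetical feature data, not taken from the commit.
np_feat = np.random.rand(10)    # numpy branch: ndim comes from the .npy header
torch_feat = torch.rand(10)     # torch branch: tensor is already loaded

assert np_feat.ndim == 1        # get_npy_dim(path) == 1 -> read, reshape(-1, 1), save
assert torch_feat.dim() == 1    # data.dim() == 1 -> reshape(-1, 1) before save_data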
@@ -4,6 +4,7 @@ import os
 import numpy as np
 import torch
+from numpy.lib.format import read_array_header_1_0, read_array_header_2_0


 def _read_torch_data(path):
@@ -57,3 +58,23 @@ def save_data(data, path, fmt):
         )
         data = data.contiguous()
         torch.save(data, path)
+
+
+def get_npy_dim(npy_path):
+    """Get the dim of the numpy file."""
+    with open(npy_path, "rb") as f:
+        # The read_array_header APIs provided by numpy only parse the header
+        # itself; they fail with a parsing error if the first 8 bytes, which
+        # contain the magic string and version, have not been read beforehand.
+        # So we need to make sure these 8 bytes are skipped.
+        f.seek(8, 0)
+        try:
+            shape, _, _ = read_array_header_1_0(f)
+        except ValueError:
+            try:
+                shape, _, _ = read_array_header_2_0(f)
+            except ValueError:
+                raise ValueError("Invalid file format")
+        return len(shape)
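A quick way to sanity-check the header-only approach used by get_npy_dim (purely illustrative, not part of the commit): save an array with numpy and read its dimensionality from the .npy header without loading the array body.

import os
import tempfile

import numpy as np
from numpy.lib.format import read_array_header_1_0

with tempfile.TemporaryDirectory() as tmp_dir:
    path = os.path.join(tmp_dir, "example.npy")
    np.save(path, np.random.rand(1000))  # 1D array, shape (1000,)
    with open(path, "rb") as f:
        f.seek(8, 0)  # skip the 6-byte magic string and 2-byte version
        shape, fortran_order, dtype = read_array_header_1_0(f)
    assert len(shape) == 1  # a 1D feature, so preprocess would reshape it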
@@ -88,7 +88,10 @@ def random_homo_graphbolt_graph(
     np.save(os.path.join(test_dir, edge_feat_path), edge_feats)

     # Generate random node-feats.
-    node_feats = np.random.rand(num_nodes, num_classes)
+    if num_classes == 1:
+        node_feats = np.random.rand(num_nodes)
+    else:
+        node_feats = np.random.rand(num_nodes, num_classes)
     node_feat_path = os.path.join("data", "node-feat.npy")
     np.save(os.path.join(test_dir, node_feat_path), node_feats)
...
@@ -1849,6 +1849,40 @@ def test_OnDiskDataset_all_nodes_set_hetero():
         dataset = None


+def test_OnDiskDataset_load_1D_feature():
+    with tempfile.TemporaryDirectory() as test_dir:
+        # All metadata fields are specified.
+        dataset_name = "graphbolt_test"
+        num_nodes = 4000
+        num_edges = 20000
+        num_classes = 1
+
+        # Generate random graph.
+        yaml_content = gbt.random_homo_graphbolt_graph(
+            test_dir,
+            dataset_name,
+            num_nodes,
+            num_edges,
+            num_classes,
+        )
+        yaml_file = os.path.join(test_dir, "metadata.yaml")
+        with open(yaml_file, "w") as f:
+            f.write(yaml_content)
+
+        with open(yaml_file, "r") as f:
+            input_config = yaml.safe_load(f)
+
+        node_feat = np.load(
+            os.path.join(test_dir, input_config["feature_data"][0]["path"])
+        )
+
+        dataset = gb.OnDiskDataset(test_dir).load()
+        feature = dataset.feature.read("node", None, "feat")
+        assert torch.equal(torch.from_numpy(node_feat.reshape(-1, 1)), feature)
+        dataset = None
+        node_feat = None
+        feature = None
+
+
 def test_BuiltinDataset():
     """Test BuiltinDataset."""
     with tempfile.TemporaryDirectory() as test_dir:
@@ -1869,6 +1903,8 @@ def test_BuiltinDataset():
         assert dataset.tasks is not None
         assert dataset.dataset_name == dataset_name

+        dataset = None
+
         # Case 3: dataset is not available.
         dataset_name = "fake_name"
         with pytest.raises(
...
@@ -81,3 +81,19 @@ def test_save_data(data_fmt, save_fmt, contiguous):
     )
     assert np.array_equal(tensor_data.numpy(), loaded_data)
     data = tensor_data = loaded_data = None
+
+
+@pytest.mark.parametrize("fmt", ["torch", "numpy"])
+def test_get_npy_dim(fmt):
+    with tempfile.TemporaryDirectory() as test_dir:
+        data = np.array([[1, 2, 4], [2, 5, 3]])
+        type_name = "pt" if fmt == "torch" else "npy"
+        file_name = os.path.join(test_dir, f"save_data.{type_name}")
+        if fmt == "numpy":
+            np.save(file_name, data)
+            assert utils.get_npy_dim(file_name) == 2
+        elif fmt == "torch":
+            torch.save(torch.from_numpy(data), file_name)
+            with pytest.raises(ValueError):
+                utils.get_npy_dim(file_name)
+        data = None