Unverified Commit 8b37564b authored by yxy235, committed by GitHub

[GraphBolt] Modify preprocess to support ondisk datasets with 1D features. (#6433)


Co-authored-by: Ubuntu <ubuntu@ip-172-31-0-133.us-west-2.compute.internal>
Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>
parent 4c883d89
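In effect, preprocessing now stores 1D feature arrays as column vectors of shape (N, 1), so feature reads on the preprocessed dataset always return 2D data. A minimal sketch of the intended shape change, using plain numpy with illustrative file names (not taken from this commit):

import numpy as np

# A node-feature file stored as a 1D array of shape (num_nodes,).
np.save("node-feat.npy", np.random.rand(4000))

# During preprocessing, a 1D array is reshaped to (num_nodes, 1) before it is
# written to the preprocessed dataset directory.
reshaped = np.load("node-feat.npy").reshape(-1, 1)
assert reshaped.shape == (4000, 1)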
@@ -16,7 +16,7 @@ from ..base import etype_str_to_tuple
 from ..dataset import Dataset, Task
 from ..itemset import ItemSet, ItemSetDict
 from ..sampling_graph import SamplingGraph
-from ..utils import read_data, save_data
+from ..utils import get_npy_dim, read_data, save_data
 from .csc_sampling_graph import (
     CSCSamplingGraph,
     from_dglgraph,
@@ -43,12 +43,20 @@ def _copy_or_convert_data(
 ):
     """Copy or convert the data from input_path to output_path."""
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    # If the original format is numpy, just copy the file.
     if input_format == "numpy":
-        # If the original format is numpy, just copy the file.
-        shutil.copyfile(input_path, output_path)
+        # If the data is 1D, reshape it to (n, 1) and save it to output_path.
+        if get_npy_dim(input_path) == 1:
+            data = read_data(input_path, input_format, in_memory)
+            data = data.reshape(-1, 1)
+            save_data(data, output_path, output_format)
+        else:
+            shutil.copyfile(input_path, output_path)
     else:
         # If the original format is not numpy, convert it to numpy.
         data = read_data(input_path, input_format, in_memory)
+        if data.dim() == 1:
+            data = data.reshape(-1, 1)
         save_data(data, output_path, output_format)
...
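The dimensionality check differs between the two branches above: for numpy input the new get_npy_dim() helper inspects the .npy header, so an already-2D file can be copied without being loaded, while the torch branch already has the tensor in memory and checks data.dim(). A rough sketch of the two checks, with purely illustrative data:

import numpy as np
import torch

# Hypothetical feature data, not taken from the commit.
np_feat = np.random.rand(10)    # numpy branch: ndim comes from the .npy header
torch_feat = torch.rand(10)     # torch branch: tensor is already loaded

assert np_feat.ndim == 1        # get_npy_dim(path) == 1 -> read, reshape(-1, 1), save
assert torch_feat.dim() == 1    # data.dim() == 1 -> reshape(-1, 1) before save_data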
@@ -4,6 +4,7 @@ import os
 import numpy as np
 import torch
+from numpy.lib.format import read_array_header_1_0, read_array_header_2_0


 def _read_torch_data(path):
@@ -57,3 +58,23 @@ def save_data(data, path, fmt):
         )
         data = data.contiguous()
         torch.save(data, path)
+
+
+def get_npy_dim(npy_path):
+    """Get the dim of the numpy file."""
+    with open(npy_path, "rb") as f:
+        # The read_array_header APIs provided by numpy only parse the header
+        # itself; they fail with a parsing error if the first 8 bytes, which
+        # contain the magic string and version, have not been read beforehand.
+        # So we need to make sure these 8 bytes are skipped.
+        f.seek(8, 0)
+        try:
+            shape, _, _ = read_array_header_1_0(f)
+        except ValueError:
+            try:
+                shape, _, _ = read_array_header_2_0(f)
+            except ValueError:
+                raise ValueError("Invalid file format")
+        return len(shape)
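A quick way to sanity-check the header-only approach used by get_npy_dim (purely illustrative, not part of the commit): save an array with numpy and read its dimensionality from the .npy header without loading the array body.

import os
import tempfile

import numpy as np
from numpy.lib.format import read_array_header_1_0

with tempfile.TemporaryDirectory() as tmp_dir:
    path = os.path.join(tmp_dir, "example.npy")
    np.save(path, np.random.rand(1000))  # 1D array, shape (1000,)
    with open(path, "rb") as f:
        f.seek(8, 0)  # skip the 6-byte magic string and 2-byte version
        shape, fortran_order, dtype = read_array_header_1_0(f)
    assert len(shape) == 1  # a 1D feature, so preprocess would reshape it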
@@ -88,7 +88,10 @@ def random_homo_graphbolt_graph(
     np.save(os.path.join(test_dir, edge_feat_path), edge_feats)

     # Generate random node-feats.
-    node_feats = np.random.rand(num_nodes, num_classes)
+    if num_classes == 1:
+        node_feats = np.random.rand(num_nodes)
+    else:
+        node_feats = np.random.rand(num_nodes, num_classes)
     node_feat_path = os.path.join("data", "node-feat.npy")
     np.save(os.path.join(test_dir, node_feat_path), node_feats)
...
@@ -1849,6 +1849,40 @@ def test_OnDiskDataset_all_nodes_set_hetero():
         dataset = None


+def test_OnDiskDataset_load_1D_feature():
+    with tempfile.TemporaryDirectory() as test_dir:
+        # All metadata fields are specified.
+        dataset_name = "graphbolt_test"
+        num_nodes = 4000
+        num_edges = 20000
+        num_classes = 1
+
+        # Generate random graph.
+        yaml_content = gbt.random_homo_graphbolt_graph(
+            test_dir,
+            dataset_name,
+            num_nodes,
+            num_edges,
+            num_classes,
+        )
+        yaml_file = os.path.join(test_dir, "metadata.yaml")
+        with open(yaml_file, "w") as f:
+            f.write(yaml_content)
+
+        with open(yaml_file, "r") as f:
+            input_config = yaml.safe_load(f)
+
+        node_feat = np.load(
+            os.path.join(test_dir, input_config["feature_data"][0]["path"])
+        )
+
+        dataset = gb.OnDiskDataset(test_dir).load()
+        feature = dataset.feature.read("node", None, "feat")
+        assert torch.equal(torch.from_numpy(node_feat.reshape(-1, 1)), feature)
+        dataset = None
+        node_feat = None
+        feature = None
+
+
 def test_BuiltinDataset():
     """Test BuiltinDataset."""
     with tempfile.TemporaryDirectory() as test_dir:
@@ -1869,6 +1903,8 @@ def test_BuiltinDataset():
         assert dataset.tasks is not None
         assert dataset.dataset_name == dataset_name

+        dataset = None
+
         # Case 3: dataset is not available.
         dataset_name = "fake_name"
         with pytest.raises(
...
@@ -81,3 +81,19 @@ def test_save_data(data_fmt, save_fmt, contiguous):
     )
     assert np.array_equal(tensor_data.numpy(), loaded_data)
     data = tensor_data = loaded_data = None
+
+
+@pytest.mark.parametrize("fmt", ["torch", "numpy"])
+def test_get_npy_dim(fmt):
+    with tempfile.TemporaryDirectory() as test_dir:
+        data = np.array([[1, 2, 4], [2, 5, 3]])
+        type_name = "pt" if fmt == "torch" else "npy"
+        file_name = os.path.join(test_dir, f"save_data.{type_name}")
+        if fmt == "numpy":
+            np.save(file_name, data)
+            assert utils.get_npy_dim(file_name) == 2
+        elif fmt == "torch":
+            torch.save(torch.from_numpy(data), file_name)
+            with pytest.raises(ValueError):
+                utils.get_npy_dim(file_name)
+        data = None