Unverified Commit 85c030f6 authored by Andrei Ivanov's avatar Andrei Ivanov Committed by GitHub
Browse files

[GraphBolt] Combining `test_OnDiskDataset_preprocess_yaml_content` tests for...

[GraphBolt] Combining `test_OnDiskDataset_preprocess_yaml_content` tests for Windows and Unix. (#7148)
parent 5d2d1453
......@@ -1625,8 +1625,7 @@ def test_OnDiskDataset_preprocess_path():
_ = gb.OnDiskDataset(fake_dir)
@unittest.skipIf(os.name == "nt", "Skip on Windows")
def test_OnDiskDataset_preprocess_yaml_content_unix():
def test_OnDiskDataset_preprocess_yaml_content():
"""Test if the preprocessed metadata.yaml is correct."""
with tempfile.TemporaryDirectory() as test_dir:
# All metadata fields are specified.
......@@ -1640,196 +1639,45 @@ def test_OnDiskDataset_preprocess_yaml_content_unix():
neighbors = np.random.randint(0, num_nodes, size=(num_edges))
edges = np.stack([nodes, neighbors], axis=1)
# Write into edges/edge.csv
os.makedirs(os.path.join(test_dir, "edges/"), exist_ok=True)
edges = pd.DataFrame(edges, columns=["src", "dst"])
edges.to_csv(
os.path.join(test_dir, "edges/edge.csv"),
index=False,
header=False,
)
# Generate random graph edge-feats.
edge_feats = np.random.rand(num_edges, 5)
os.makedirs(os.path.join(test_dir, "data/"), exist_ok=True)
np.save(os.path.join(test_dir, "data/edge-feat.npy"), edge_feats)
# Generate random node-feats.
node_feats = np.random.rand(num_nodes, 10)
np.save(os.path.join(test_dir, "data/node-feat.npy"), node_feats)
# Generate train/test/valid set.
os.makedirs(os.path.join(test_dir, "set/"), exist_ok=True)
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "set/train.npy")
np.save(train_path, train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "set/validation.npy")
np.save(validation_path, validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "set/test.npy")
np.save(test_path, test_data)
yaml_content = f"""
dataset_name: {dataset_name}
graph: # graph structure and required attributes.
nodes:
- num: {num_nodes}
edges:
- format: csv
path: edges/edge.csv
feature_data:
- domain: edge
type: null
name: feat
format: numpy
path: data/edge-feat.npy
feature_data:
- domain: node
type: null
name: feat
format: numpy
in_memory: false
path: data/node-feat.npy
tasks:
- name: node_classification
num_classes: {num_classes}
train_set:
- type: null
data:
- format: numpy
path: set/train.npy
validation_set:
- type: null
data:
- format: numpy
path: set/validation.npy
test_set:
- type: null
data:
- format: numpy
path: set/test.npy
"""
yaml_file = os.path.join(test_dir, "metadata.yaml")
with open(yaml_file, "w") as f:
f.write(yaml_content)
preprocessed_metadata_path = gb.preprocess_ondisk_dataset(test_dir)
with open(preprocessed_metadata_path, "r") as f:
yaml_data = yaml.safe_load(f)
target_yaml_content = f"""
dataset_name: {dataset_name}
graph_topology:
type: FusedCSCSamplingGraph
path: preprocessed/fused_csc_sampling_graph.pt
feature_data:
- domain: node
type: null
name: feat
format: numpy
in_memory: false
path: preprocessed/data/node-feat.npy
tasks:
- name: node_classification
num_classes: {num_classes}
train_set:
- type: null
data:
- format: numpy
path: preprocessed/set/train.npy
validation_set:
- type: null
data:
- format: numpy
path: preprocessed/set/validation.npy
test_set:
- type: null
data:
- format: numpy
path: preprocessed/set/test.npy
include_original_edge_id: False
"""
target_yaml_data = yaml.safe_load(target_yaml_content)
# Check yaml content.
assert (
yaml_data == target_yaml_data
), "The preprocessed metadata.yaml is not correct."
# Check file existence.
assert os.path.exists(
os.path.join(test_dir, yaml_data["graph_topology"]["path"])
)
assert os.path.exists(
os.path.join(test_dir, yaml_data["feature_data"][0]["path"])
)
for set_name in ["train_set", "validation_set", "test_set"]:
assert os.path.exists(
os.path.join(
test_dir,
yaml_data["tasks"][0][set_name][0]["data"][0]["path"],
)
)
@unittest.skipIf(os.name != "nt", "Skip on Unix")
def test_OnDiskDataset_preprocess_yaml_content_windows():
"""Test if the preprocessed metadata.yaml is correct."""
with tempfile.TemporaryDirectory() as test_dir:
# All metadata fields are specified.
dataset_name = "graphbolt_test"
num_nodes = 4000
num_edges = 20000
num_classes = 10
# Generate random edges.
nodes = np.repeat(np.arange(num_nodes), 5)
neighbors = np.random.randint(0, num_nodes, size=(num_edges))
edges = np.stack([nodes, neighbors], axis=1)
# Write into edges/edge.csv
os.makedirs(os.path.join(test_dir, "edges\\"), exist_ok=True)
os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
edges = pd.DataFrame(edges, columns=["src", "dst"])
edge_path = os.path.join("edges", "edge.csv")
edges.to_csv(
os.path.join(test_dir, "edges\\edge.csv"),
os.path.join(test_dir, edge_path),
index=False,
header=False,
)
# Generate random graph edge-feats.
edge_feats = np.random.rand(num_edges, 5)
os.makedirs(os.path.join(test_dir, "data\\"), exist_ok=True)
np.save(os.path.join(test_dir, "data\\edge-feat.npy"), edge_feats)
os.makedirs(os.path.join(test_dir, "data"), exist_ok=True)
feature_edge = os.path.join("data", "edge-feat.npy")
np.save(os.path.join(test_dir, feature_edge), edge_feats)
# Generate random node-feats.
node_feats = np.random.rand(num_nodes, 10)
np.save(os.path.join(test_dir, "data\\node-feat.npy"), node_feats)
feature_node = os.path.join("data", "node-feat.npy")
np.save(os.path.join(test_dir, feature_node), node_feats)
# Generate train/test/valid set.
os.makedirs(os.path.join(test_dir, "set\\"), exist_ok=True)
os.makedirs(os.path.join(test_dir, "set"), exist_ok=True)
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "set\\train.npy")
np.save(train_path, train_data)
train_path = os.path.join("set", "train.npy")
np.save(os.path.join(test_dir, train_path), train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "set\\validation.npy")
np.save(validation_path, validation_data)
validation_path = os.path.join("set", "validation.npy")
np.save(os.path.join(test_dir, validation_path), validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "set\\test.npy")
np.save(test_path, test_data)
test_path = os.path.join("set", "test.npy")
np.save(os.path.join(test_dir, test_path), test_data)
yaml_content = f"""
dataset_name: {dataset_name}
......@@ -1838,21 +1686,21 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
- num: {num_nodes}
edges:
- format: csv
path: edges\\edge.csv
path: {edge_path}
feature_data:
- domain: edge
type: null
name: feat
format: numpy
in_memory: true
path: data\\edge-feat.npy
path: {feature_edge}
feature_data:
- domain: node
type: null
name: feat
format: numpy
in_memory: false
path: data\\node-feat.npy
path: {feature_node}
tasks:
- name: node_classification
num_classes: {num_classes}
......@@ -1860,17 +1708,17 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
- type: null
data:
- format: numpy
path: set\\train.npy
path: {train_path}
validation_set:
- type: null
data:
- format: numpy
path: set\\validation.npy
path: {validation_path}
test_set:
- type: null
data:
- format: numpy
path: set\\test.npy
path: {test_path}
"""
yaml_file = os.path.join(test_dir, "metadata.yaml")
with open(yaml_file, "w") as f:
......@@ -1880,18 +1728,19 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
with open(preprocessed_metadata_path, "r") as f:
yaml_data = yaml.safe_load(f)
topo_path = os.path.join("preprocessed", "fused_csc_sampling_graph.pt")
target_yaml_content = f"""
dataset_name: {dataset_name}
graph_topology:
type: FusedCSCSamplingGraph
path: preprocessed\\fused_csc_sampling_graph.pt
path: {topo_path}
feature_data:
- domain: node
type: null
name: feat
format: numpy
in_memory: false
path: preprocessed\\data\\node-feat.npy
path: {os.path.join("preprocessed", feature_node)}
tasks:
- name: node_classification
num_classes: {num_classes}
......@@ -1899,17 +1748,17 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
- type: null
data:
- format: numpy
path: preprocessed\\set\\train.npy
path: {os.path.join("preprocessed", train_path)}
validation_set:
- type: null
data:
- format: numpy
path: preprocessed\\set\\validation.npy
path: {os.path.join("preprocessed", validation_path)}
test_set:
- type: null
data:
- format: numpy
path: preprocessed\\set\\test.npy
path: {os.path.join("preprocessed", test_path)}
include_original_edge_id: False
"""
target_yaml_data = yaml.safe_load(target_yaml_content)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment