Unverified commit 85c030f6, authored by Andrei Ivanov and committed by GitHub

[GraphBolt] Combining `test_OnDiskDataset_preprocess_yaml_content` tests for Windows and Unix. (#7148)
parent 5d2d1453
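
The point of the merge: every relative path in the test, both the file written to disk and the string interpolated into metadata.yaml, now comes from the same os.path.join call, so the separators agree on whichever platform runs the test. A minimal standalone sketch of the pattern (illustrative, the names mirror the test but none of this code is taken from the diff):

import os

# os.path.join uses the platform's separator, so this is
# "set/train.npy" on Unix and "set\\train.npy" on Windows.
train_path = os.path.join("set", "train.npy")

# Interpolating the same variable into both the input YAML and the
# expected (target) YAML keeps the two documents in sync, which is
# what lets a single test replace the per-platform pair below.
yaml_content = f"""
train_set:
  - data:
      - format: numpy
        path: {train_path}
"""
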
@@ -1625,8 +1625,7 @@ def test_OnDiskDataset_preprocess_path():
         _ = gb.OnDiskDataset(fake_dir)


-@unittest.skipIf(os.name == "nt", "Skip on Windows")
-def test_OnDiskDataset_preprocess_yaml_content_unix():
+def test_OnDiskDataset_preprocess_yaml_content():
     """Test if the preprocessed metadata.yaml is correct."""
     with tempfile.TemporaryDirectory() as test_dir:
         # All metadata fields are specified.
@@ -1640,196 +1639,45 @@ def test_OnDiskDataset_preprocess_yaml_content_unix():
         neighbors = np.random.randint(0, num_nodes, size=(num_edges))
         edges = np.stack([nodes, neighbors], axis=1)
         # Write into edges/edge.csv
-        os.makedirs(os.path.join(test_dir, "edges/"), exist_ok=True)
-        edges = pd.DataFrame(edges, columns=["src", "dst"])
-        edges.to_csv(
-            os.path.join(test_dir, "edges/edge.csv"),
-            index=False,
-            header=False,
-        )
-        # Generate random graph edge-feats.
-        edge_feats = np.random.rand(num_edges, 5)
-        os.makedirs(os.path.join(test_dir, "data/"), exist_ok=True)
-        np.save(os.path.join(test_dir, "data/edge-feat.npy"), edge_feats)
-        # Generate random node-feats.
-        node_feats = np.random.rand(num_nodes, 10)
-        np.save(os.path.join(test_dir, "data/node-feat.npy"), node_feats)
-        # Generate train/test/valid set.
-        os.makedirs(os.path.join(test_dir, "set/"), exist_ok=True)
-        train_pairs = (np.arange(1000), np.arange(1000, 2000))
-        train_labels = np.random.randint(0, 10, size=1000)
-        train_data = np.vstack([train_pairs, train_labels]).T
-        train_path = os.path.join(test_dir, "set/train.npy")
-        np.save(train_path, train_data)
-        validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
-        validation_labels = np.random.randint(0, 10, size=1000)
-        validation_data = np.vstack([validation_pairs, validation_labels]).T
-        validation_path = os.path.join(test_dir, "set/validation.npy")
-        np.save(validation_path, validation_data)
-        test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
-        test_labels = np.random.randint(0, 10, size=1000)
-        test_data = np.vstack([test_pairs, test_labels]).T
-        test_path = os.path.join(test_dir, "set/test.npy")
-        np.save(test_path, test_data)
-        yaml_content = f"""
-            dataset_name: {dataset_name}
-            graph: # graph structure and required attributes.
-              nodes:
-                - num: {num_nodes}
-              edges:
-                - format: csv
-                  path: edges/edge.csv
-              feature_data:
-                - domain: edge
-                  type: null
-                  name: feat
-                  format: numpy
-                  path: data/edge-feat.npy
-            feature_data:
-              - domain: node
-                type: null
-                name: feat
-                format: numpy
-                in_memory: false
-                path: data/node-feat.npy
-            tasks:
-              - name: node_classification
-                num_classes: {num_classes}
-                train_set:
-                  - type: null
-                    data:
-                      - format: numpy
-                        path: set/train.npy
-                validation_set:
-                  - type: null
-                    data:
-                      - format: numpy
-                        path: set/validation.npy
-                test_set:
-                  - type: null
-                    data:
-                      - format: numpy
-                        path: set/test.npy
-        """
-        yaml_file = os.path.join(test_dir, "metadata.yaml")
-        with open(yaml_file, "w") as f:
-            f.write(yaml_content)
-        preprocessed_metadata_path = gb.preprocess_ondisk_dataset(test_dir)
-        with open(preprocessed_metadata_path, "r") as f:
-            yaml_data = yaml.safe_load(f)
-        target_yaml_content = f"""
-            dataset_name: {dataset_name}
-            graph_topology:
-              type: FusedCSCSamplingGraph
-              path: preprocessed/fused_csc_sampling_graph.pt
-            feature_data:
-              - domain: node
-                type: null
-                name: feat
-                format: numpy
-                in_memory: false
-                path: preprocessed/data/node-feat.npy
-            tasks:
-              - name: node_classification
-                num_classes: {num_classes}
-                train_set:
-                  - type: null
-                    data:
-                      - format: numpy
-                        path: preprocessed/set/train.npy
-                validation_set:
-                  - type: null
-                    data:
-                      - format: numpy
-                        path: preprocessed/set/validation.npy
-                test_set:
-                  - type: null
-                    data:
-                      - format: numpy
-                        path: preprocessed/set/test.npy
-            include_original_edge_id: False
-        """
-        target_yaml_data = yaml.safe_load(target_yaml_content)
-        # Check yaml content.
-        assert (
-            yaml_data == target_yaml_data
-        ), "The preprocessed metadata.yaml is not correct."
-        # Check file existence.
-        assert os.path.exists(
-            os.path.join(test_dir, yaml_data["graph_topology"]["path"])
-        )
-        assert os.path.exists(
-            os.path.join(test_dir, yaml_data["feature_data"][0]["path"])
-        )
-        for set_name in ["train_set", "validation_set", "test_set"]:
-            assert os.path.exists(
-                os.path.join(
-                    test_dir,
-                    yaml_data["tasks"][0][set_name][0]["data"][0]["path"],
-                )
-            )
-
-
-@unittest.skipIf(os.name != "nt", "Skip on Unix")
-def test_OnDiskDataset_preprocess_yaml_content_windows():
-    """Test if the preprocessed metadata.yaml is correct."""
-    with tempfile.TemporaryDirectory() as test_dir:
-        # All metadata fields are specified.
-        dataset_name = "graphbolt_test"
-        num_nodes = 4000
-        num_edges = 20000
-        num_classes = 10
-        # Generate random edges.
-        nodes = np.repeat(np.arange(num_nodes), 5)
-        neighbors = np.random.randint(0, num_nodes, size=(num_edges))
-        edges = np.stack([nodes, neighbors], axis=1)
-        # Write into edges/edge.csv
-        os.makedirs(os.path.join(test_dir, "edges\\"), exist_ok=True)
+        os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
         edges = pd.DataFrame(edges, columns=["src", "dst"])
+        edge_path = os.path.join("edges", "edge.csv")
         edges.to_csv(
-            os.path.join(test_dir, "edges\\edge.csv"),
+            os.path.join(test_dir, edge_path),
             index=False,
             header=False,
         )
         # Generate random graph edge-feats.
         edge_feats = np.random.rand(num_edges, 5)
-        os.makedirs(os.path.join(test_dir, "data\\"), exist_ok=True)
-        np.save(os.path.join(test_dir, "data\\edge-feat.npy"), edge_feats)
+        os.makedirs(os.path.join(test_dir, "data"), exist_ok=True)
+        feature_edge = os.path.join("data", "edge-feat.npy")
+        np.save(os.path.join(test_dir, feature_edge), edge_feats)
         # Generate random node-feats.
         node_feats = np.random.rand(num_nodes, 10)
-        np.save(os.path.join(test_dir, "data\\node-feat.npy"), node_feats)
+        feature_node = os.path.join("data", "node-feat.npy")
+        np.save(os.path.join(test_dir, feature_node), node_feats)
         # Generate train/test/valid set.
-        os.makedirs(os.path.join(test_dir, "set\\"), exist_ok=True)
+        os.makedirs(os.path.join(test_dir, "set"), exist_ok=True)
         train_pairs = (np.arange(1000), np.arange(1000, 2000))
         train_labels = np.random.randint(0, 10, size=1000)
         train_data = np.vstack([train_pairs, train_labels]).T
-        train_path = os.path.join(test_dir, "set\\train.npy")
-        np.save(train_path, train_data)
+        train_path = os.path.join("set", "train.npy")
+        np.save(os.path.join(test_dir, train_path), train_data)
         validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
         validation_labels = np.random.randint(0, 10, size=1000)
         validation_data = np.vstack([validation_pairs, validation_labels]).T
-        validation_path = os.path.join(test_dir, "set\\validation.npy")
-        np.save(validation_path, validation_data)
+        validation_path = os.path.join("set", "validation.npy")
+        np.save(os.path.join(test_dir, validation_path), validation_data)
         test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
         test_labels = np.random.randint(0, 10, size=1000)
         test_data = np.vstack([test_pairs, test_labels]).T
-        test_path = os.path.join(test_dir, "set\\test.npy")
-        np.save(test_path, test_data)
+        test_path = os.path.join("set", "test.npy")
+        np.save(os.path.join(test_dir, test_path), test_data)
         yaml_content = f"""
             dataset_name: {dataset_name}
@@ -1838,21 +1686,21 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
                - num: {num_nodes}
              edges:
                - format: csv
-                  path: edges\\edge.csv
+                  path: {edge_path}
              feature_data:
                - domain: edge
                  type: null
                  name: feat
                  format: numpy
                  in_memory: true
-                  path: data\\edge-feat.npy
+                  path: {feature_edge}
            feature_data:
              - domain: node
                type: null
                name: feat
                format: numpy
                in_memory: false
-                path: data\\node-feat.npy
+                path: {feature_node}
            tasks:
              - name: node_classification
                num_classes: {num_classes}
@@ -1860,17 +1708,17 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
                  - type: null
                    data:
                      - format: numpy
-                        path: set\\train.npy
+                        path: {train_path}
                validation_set:
                  - type: null
                    data:
                      - format: numpy
-                        path: set\\validation.npy
+                        path: {validation_path}
                test_set:
                  - type: null
                    data:
                      - format: numpy
-                        path: set\\test.npy
+                        path: {test_path}
        """
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
@@ -1880,18 +1728,19 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
        with open(preprocessed_metadata_path, "r") as f:
            yaml_data = yaml.safe_load(f)

+        topo_path = os.path.join("preprocessed", "fused_csc_sampling_graph.pt")
        target_yaml_content = f"""
            dataset_name: {dataset_name}
            graph_topology:
              type: FusedCSCSamplingGraph
-              path: preprocessed\\fused_csc_sampling_graph.pt
+              path: {topo_path}
            feature_data:
              - domain: node
                type: null
                name: feat
                format: numpy
                in_memory: false
-                path: preprocessed\\data\\node-feat.npy
+                path: {os.path.join("preprocessed", feature_node)}
            tasks:
              - name: node_classification
                num_classes: {num_classes}
@@ -1899,17 +1748,17 @@ def test_OnDiskDataset_preprocess_yaml_content_windows():
                  - type: null
                    data:
                      - format: numpy
-                        path: preprocessed\\set\\train.npy
+                        path: {os.path.join("preprocessed", train_path)}
                validation_set:
                  - type: null
                    data:
                      - format: numpy
-                        path: preprocessed\\set\\validation.npy
+                        path: {os.path.join("preprocessed", validation_path)}
                test_set:
                  - type: null
                    data:
                      - format: numpy
-                        path: preprocessed\\set\\test.npy
+                        path: {os.path.join("preprocessed", test_path)}
            include_original_edge_id: False
        """
        target_yaml_data = yaml.safe_load(target_yaml_content)
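
Why this discipline matters for the final assertion: yaml.safe_load turns both documents into plain dicts, and the path values inside are compared as literal strings, so "set/train.npy" and "set\train.npy" are simply unequal. A small standalone illustration (not code from the test file):

import os
import yaml

# Paths loaded from YAML are plain strings; "/" and "\\" do not match.
actual = yaml.safe_load("path: set/train.npy")
expected = yaml.safe_load(f"path: {os.path.join('set', 'train.npy')}")

# True on Unix; on Windows os.path.join yields "set\\train.npy" and the
# comparison fails, which is why both the generated and the expected
# YAML in the test build their paths with os.path.join.
print(actual == expected)
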