Unverified Commit 523bbb4c authored by Rhett Ying's avatar Rhett Ying Committed by GitHub
Browse files

[GraphBolt] fix preprocess issue for single ntype/etype graph (#7011)

parent 0bfe34d9
...@@ -118,7 +118,18 @@ def preprocess_ondisk_dataset( ...@@ -118,7 +118,18 @@ def preprocess_ondisk_dataset(
# 2. Load the edge data and create a DGLGraph. # 2. Load the edge data and create a DGLGraph.
if "graph" not in input_config: if "graph" not in input_config:
raise RuntimeError("Invalid config: does not contain graph field.") raise RuntimeError("Invalid config: does not contain graph field.")
is_homogeneous = "type" not in input_config["graph"]["nodes"][0] # For any graph that node/edge types are specified, we construct DGLGraph
# with `dgl.heterograph()` even there's only one node/edge type. This is
# because we want to save the node/edge types in the graph. So the logic of
# checking whether the graph is homogeneous is different from the logic in
# `DGLGraph.is_homogeneous()`. Otherwise, we construct DGLGraph with
# `dgl.graph()`.
is_homogeneous = (
len(input_config["graph"]["nodes"]) == 1
and len(input_config["graph"]["edges"]) == 1
and "type" not in input_config["graph"]["nodes"][0]
and "type" not in input_config["graph"]["edges"][0]
)
if is_homogeneous: if is_homogeneous:
# Homogeneous graph. # Homogeneous graph.
num_nodes = input_config["graph"]["nodes"][0]["num"] num_nodes = input_config["graph"]["nodes"][0]["num"]
...@@ -178,20 +189,24 @@ def preprocess_ondisk_dataset( ...@@ -178,20 +189,24 @@ def preprocess_ondisk_dataset(
if not is_homogeneous: if not is_homogeneous:
# For heterogenous graph, a node/edge feature must cover all # For heterogenous graph, a node/edge feature must cover all
# node/edge types. # node/edge types.
for feat_name, feat_data in g.ndata.items(): ntypes = g.ntypes
existing_types = set(feat_data.keys()) assert all(
assert existing_types == set(g.ntypes), ( set(g.nodes[ntypes[0]].data.keys())
f"Node feature {feat_name} does not cover all node types." == set(g.nodes[ntype].data.keys())
+ f"Existing types: {existing_types}." for ntype in ntypes
+ f"Expected types: {g.ntypes}." ), (
) "Node feature does not cover all node types: "
for feat_name, feat_data in g.edata.items(): + f"{set(g.nodes[ntype].data.keys() for ntype in ntypes)}."
existing_types = set(feat_data.keys()) )
assert existing_types == set(g.canonical_etypes), ( etypes = g.canonical_etypes
f"Edge feature {feat_name} does not cover all edge types." assert all(
+ f"Existing types: {existing_types}." set(g.edges[etypes[0]].data.keys())
+ f"Expected types: {g.etypes}." == set(g.edges[etype].data.keys())
) for etype in etypes
), (
"Edge feature does not cover all edge types: "
+ f"{set(g.edges[etype].data.keys() for etype in etypes)}."
)
# 4. Convert the DGLGraph to a FusedCSCSamplingGraph. # 4. Convert the DGLGraph to a FusedCSCSamplingGraph.
fused_csc_sampling_graph = from_dglgraph( fused_csc_sampling_graph = from_dglgraph(
......
...@@ -2742,3 +2742,83 @@ def test_OnDiskDataset_load_tasks_selectively(): ...@@ -2742,3 +2742,83 @@ def test_OnDiskDataset_load_tasks_selectively():
dataset = gb.OnDiskDataset(test_dir).load(tasks=2) dataset = gb.OnDiskDataset(test_dir).load(tasks=2)
dataset = None dataset = None
def test_OnDiskDataset_preprocess_graph_with_single_type():
    """Test for graph with single node/edge type."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
        dataset_name = "graphbolt_test"
        num_nodes = 4000
        num_edges = 20000

        # Generate random edges: 5 outgoing edges per node, with
        # destinations drawn uniformly at random.
        src = np.repeat(np.arange(num_nodes), 5)
        dst = np.random.randint(0, num_nodes, size=(num_edges))
        # Write into edges/edge.csv
        os.makedirs(os.path.join(test_dir, "edges/"), exist_ok=True)
        edge_frame = pd.DataFrame(
            np.stack([src, dst], axis=1), columns=["src", "dst"]
        )
        edge_frame.to_csv(
            os.path.join(test_dir, "edges/edge.csv"),
            index=False,
            header=False,
        )

        # Generate random edge/node features and save them as .npy files.
        os.makedirs(os.path.join(test_dir, "data/"), exist_ok=True)
        np.save(
            os.path.join(test_dir, "data/edge-feat.npy"),
            np.random.rand(num_edges, 5),
        )
        np.save(
            os.path.join(test_dir, "data/node-feat.npy"),
            np.random.rand(num_nodes, 10),
        )

        yaml_content = f"""
            dataset_name: {dataset_name}
            graph: # graph structure and required attributes.
              nodes:
                - num: {num_nodes}
                  type: author
              edges:
                - type: author:collab:author
                  format: csv
                  path: edges/edge.csv
            feature_data:
              - domain: edge
                type: author:collab:author
                name: feat
                format: numpy
                path: data/edge-feat.npy
              - domain: node
                type: author
                name: feat
                format: numpy
                path: data/node-feat.npy
        """
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)

        dataset = gb.OnDiskDataset(test_dir).load()
        assert dataset.dataset_name == dataset_name

        graph = dataset.graph
        assert isinstance(graph, gb.FusedCSCSamplingGraph)
        assert graph.total_num_nodes == num_nodes
        assert graph.total_num_edges == num_edges
        # Features declared in the yaml must survive preprocessing.
        assert graph.node_attributes is not None
        assert "feat" in graph.node_attributes
        assert graph.edge_attributes is not None
        assert "feat" in graph.edge_attributes
        # Single-type graphs should still carry explicit type metadata
        # (this is the behavior the preprocess fix is meant to guarantee).
        assert torch.equal(
            graph.node_type_offset, torch.tensor([0, num_nodes])
        )
        assert torch.equal(graph.type_per_edge, torch.zeros(num_edges))
        assert graph.node_type_to_id == {"author": 0}
        assert graph.edge_type_to_id == {"author:collab:author": 0}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment