import json

import dgl
import numpy as np
import torch as th

from ogb.nodeproppred import DglNodePropPredDataset

# Load OGB-MAG.
dataset = DglNodePropPredDataset(name="ogbn-mag")
hg_orig, labels = dataset[0]

# Rebuild the heterogeneous graph with a reverse ("rev-") relation for every
# original relation so each edge can be traversed in both directions.
edge_dict = {}
for src_type, rel, dst_type in hg_orig.canonical_etypes:
    src, dst = hg_orig.all_edges(etype=(src_type, rel, dst_type))
    edge_dict[(src_type, rel, dst_type)] = (src, dst)
    edge_dict[(dst_type, "rev-" + rel, src_type)] = (dst, src)
hg = dgl.heterograph(edge_dict)

# Only "paper" node features are copied here; the original graph's other
# node data (if any) is intentionally left behind.
hg.nodes["paper"].data["feat"] = hg_orig.nodes["paper"].data["feat"]
print(hg)

# OGB-MAG is stored in heterogeneous format. We need to convert it into
# homogeneous format.
g = dgl.to_homogeneous(hg)
# dgl.NID / dgl.EID hold each node's / edge's ID within its original type;
# keep them under "orig_id" so they survive the later filtering steps.
g.ndata["orig_id"] = g.ndata[dgl.NID]
g.edata["orig_id"] = g.edata[dgl.EID]
print(f"|V|={g.number_of_nodes()}")
print(f"|E|={g.number_of_edges()}")
print(f"|NTYPE|={len(th.unique(g.ndata[dgl.NTYPE]))}")

# Store the metadata of nodes: one row per node containing the node type ID,
# one 0/1 weight column per node type, and the original per-type node ID.
node_types = g.ndata[dgl.NTYPE]
uniq_ntype_ids = th.unique(node_types)
# One weight column per node type (read back from mag_stats.txt later).
num_node_weights = len(uniq_ntype_ids)
columns = [node_types.numpy()]
columns += [(node_types == tid).numpy() for tid in uniq_ntype_ids]
columns.append(g.ndata["orig_id"].numpy())
np.savetxt("mag_nodes.txt", np.stack(columns, 1), fmt="%d", delimiter=" ")

# Store the node features, keyed as "<node type>/<feature name>".
node_feats = {
    ntype + "/" + name: hg.nodes[ntype].data[name]
    for ntype in hg.ntypes
    for name in hg.nodes[ntype].data
}
dgl.data.utils.save_tensors("node_feat.dgl", node_feats)

# Store the metadata of edges.
# ParMETIS cannot handle duplicated edges and self-loops. We should remove them
# in the preprocessing.
src_id, dst_id = g.edges()

# Split off the self-loops, keeping their metadata so the removals can be
# written out and audited later.
loop_mask = src_id == dst_id
keep_mask = ~loop_mask
self_loop_src_id = src_id[loop_mask]
self_loop_dst_id = dst_id[loop_mask]
self_loop_orig_id = g.edata["orig_id"][loop_mask]
self_loop_etype = g.edata[dgl.ETYPE][loop_mask]
orig_id = g.edata["orig_id"][keep_mask]
etype = g.edata[dgl.ETYPE][keep_mask]
src_id = src_id[keep_mask]
dst_id = dst_id[keep_mask]

# Remove duplicated edges: encode each (src, dst) pair as one scalar key and
# keep only the first occurrence of every key reported by np.unique.
keys = (src_id * g.number_of_nodes() + dst_id).numpy()
_, first_occurrence = np.unique(keys, return_index=True)
dup_index = np.setdiff1d(np.arange(len(keys)), first_occurrence)
duplicate_src_id = src_id[dup_index]
duplicate_dst_id = dst_id[dup_index]
duplicate_orig_id = orig_id[dup_index]
duplicate_etype = etype[dup_index]
src_id = src_id[first_occurrence]
dst_id = dst_id[first_occurrence]
orig_id = orig_id[first_occurrence]
etype = etype[first_occurrence]

# One row per kept edge: src, dst, original per-type ID, edge type.
edge_data = th.stack([src_id, dst_id, orig_id, etype], 1)
np.savetxt("mag_edges.txt", edge_data.numpy(), fmt="%d", delimiter=" ")

# Also record everything that was dropped: self-loops first, duplicates after.
removed_edge_data = th.stack(
    [
        th.cat([self_loop_src_id, duplicate_src_id]),
        th.cat([self_loop_dst_id, duplicate_dst_id]),
        th.cat([self_loop_orig_id, duplicate_orig_id]),
        th.cat([self_loop_etype, duplicate_etype]),
    ],
    1,
)
np.savetxt(
    "mag_removed_edges.txt", removed_edge_data.numpy(), fmt="%d", delimiter=" "
)
print(
    "There are {} edges, remove {} self-loops and {} duplicated edges".format(
        g.number_of_edges(), len(self_loop_src_id), len(duplicate_src_id)
    )
)

# Store the edge features, keyed as "<edge type>/<feature name>".
edge_feats = {
    etype + "/" + name: hg.edges[etype].data[name]
    for etype in hg.etypes
    for name in hg.edges[etype].data
}
dgl.data.utils.save_tensors("edge_feat.dgl", edge_feats)

# Store the basic metadata of the graph: node count, edge count after the
# cleanup above, and the number of node-weight columns in mag_nodes.txt.
graph_stats = [g.number_of_nodes(), len(src_id), num_node_weights]
with open("mag_stats.txt", "w") as filehandle:
    filehandle.write("{} {} {}".format(*graph_stats))

# Store the ID ranges of nodes and edges of the entire graph.


def _id_range(type_ids, target_id):
    """Return (ids, [start, end)) for the block where type_ids == target_id.

    Asserts the matching IDs form one contiguous run, which the JSON range
    representation below relies on.
    """
    ids = th.nonzero(type_ids == target_id, as_tuple=True)[0]
    assert np.all((ids == th.arange(ids[0], ids[-1] + 1)).numpy())
    return ids, [int(ids[0]), int(ids[-1] + 1)]


nid_ranges = {}
eid_ranges = {}
for ntype in hg.ntypes:
    nid, nid_ranges[ntype] = _id_range(g.ndata[dgl.NTYPE], hg.get_ntype_id(ntype))
    # Within each type, the original per-type IDs must count 0..N-1 in order.
    per_type_nid = g.ndata["orig_id"][nid]
    assert np.all((per_type_nid == th.arange(len(per_type_nid))).numpy())
for etype in hg.etypes:
    _, eid_ranges[etype] = _id_range(g.edata[dgl.ETYPE], hg.get_etype_id(etype))
with open("mag.json", "w") as outfile:
    json.dump({"nid": nid_ranges, "eid": eid_ranges}, outfile, indent=4)