# gb_test_utils.py
1
2
import os

3
import dgl
4
import dgl.graphbolt as gb
5

6
7
import numpy as np
import pandas as pd
8
9
10
11
import scipy.sparse as sp
import torch


12
def rand_csc_graph(N, density, bidirection_edge=False):
    """Build a random GraphBolt fused CSC sampling graph.

    Parameters
    ----------
    N : int
        Number of nodes; the random adjacency matrix is ``N x N``.
    density : float
        Density argument forwarded to ``scipy.sparse.random``.
    bidirection_edge : bool, optional
        When True, the adjacency matrix is added to its transpose so every
        generated edge also appears in the reverse direction.

    Returns
    -------
    The object returned by ``gb.fused_csc_sampling_graph(indptr, indices)``.
    """
    adj = sp.random(N, N, density)
    if bidirection_edge:
        adj = adj + adj.T
    adj = adj.tocsc()

    # Only the sparsity structure is used; the random values are discarded.
    indptr = torch.LongTensor(adj.indptr)
    indices = torch.LongTensor(adj.indices)

    return gb.fused_csc_sampling_graph(indptr, indices)
24
25
26
27
28
29
30
31
32
33
34


def random_homo_graph(num_nodes, num_edges):
    """Generate random CSC arrays for a homogeneous graph.

    Parameters
    ----------
    num_nodes : int
        Number of nodes.
    num_edges : int
        Number of edges.

    Returns
    -------
    tuple of (torch.Tensor, torch.Tensor)
        ``csc_indptr`` with ``num_nodes + 1`` non-decreasing entries that
        starts at 0 and ends at ``num_edges``, and ``indices`` with
        ``num_edges`` entries drawn from ``[0, num_nodes)``.
    """
    if num_edges == 0:
        # torch.randint needs low < high, so an edgeless graph cannot go
        # through the random path below; build it directly instead.
        return (
            torch.zeros(num_nodes + 1, dtype=torch.int64),
            torch.zeros(0, dtype=torch.int64),
        )

    csc_indptr = torch.randint(0, num_edges, (num_nodes + 1,))
    csc_indptr, _ = torch.sort(csc_indptr)
    # Pin both ends so the pointer array covers exactly `num_edges` entries.
    csc_indptr[0] = 0
    csc_indptr[-1] = num_edges
    indices = torch.randint(0, num_nodes, (num_edges,))
    return csc_indptr, indices


35
def get_type_to_id(num_ntypes, num_etypes):
    """Build node-type and edge-type name-to-ID mappings.

    Parameters
    ----------
    num_ntypes : int
        Number of node types; they are named ``n0``, ``n1``, ...
    num_etypes : int
        Maximum number of edge types to generate.

    Returns
    -------
    tuple of (dict, dict)
        ``ntypes`` maps ``"n{i}"`` to ``i``. ``etypes`` maps
        ``"n{src}:e{id}:n{dst}"`` to ``id``, enumerating pairs with
        ``src <= dst`` in order. At most ``num_etypes`` entries are produced;
        fewer if there are not enough such pairs.
    """
    ntypes = {f"n{i}": i for i in range(num_ntypes)}
    etypes = {}
    for n1 in range(num_ntypes):
        for n2 in range(n1, num_ntypes):
            # Every key is unique, so len(etypes) is the next edge-type ID.
            etype_id = len(etypes)
            if etype_id >= num_etypes:
                # Quota reached: stop both loops at once.
                return ntypes, etypes
            etypes[f"n{n1}:e{etype_id}:n{n2}"] = etype_id
    return ntypes, etypes
46
47


48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def get_ntypes_and_etypes(num_nodes, num_ntypes, num_etypes):
    """Split nodes across node types and enumerate canonical edge types.

    Returns a dict mapping ``"n{i}"`` to its node count (the remainder of an
    uneven split goes to ``"n0"``) and a list of
    ``(src_type, "e{k}", dst_type)`` triples of length ``num_etypes``. The
    triples walk all ``(src, dst)`` node-type pairs in row-major order and
    wrap around when there are more edge types than pairs.
    """
    per_type, leftover = divmod(num_nodes, num_ntypes)
    ntypes = {f"n{i}": per_type for i in range(num_ntypes)}
    if leftover:
        ntypes["n0"] += leftover

    # One full pass over the (src, dst) grid has num_ntypes ** 2 cells.
    cells = num_ntypes * num_ntypes
    etypes = []
    for eid in range(num_etypes):
        src, dst = divmod(eid % cells, num_ntypes)
        etypes.append((f"n{src}", f"e{eid}", f"n{dst}"))
    return ntypes, etypes


64
def random_hetero_graph(num_nodes, num_edges, num_ntypes, num_etypes):
    """Generate a random heterogeneous graph and return its GraphBolt arrays.

    Node and edge types come from ``get_ntypes_and_etypes``. Each edge type
    receives ``num_edges // num_etypes`` random edges; the first edge type
    additionally receives the remainder. Edge types whose source or
    destination node type has zero nodes are skipped.

    Returns
    -------
    tuple
        ``(csc_indptr, indices, node_type_offset, type_per_edge,
        node_type_to_id, edge_type_to_id)`` read from the graph produced by
        ``gb.from_dglgraph(dgl.heterograph(edges, ntypes))``.
    """
    ntypes, etypes = get_ntypes_and_etypes(num_nodes, num_ntypes, num_etypes)
    edges = {}
    for step, etype in enumerate(etypes):
        src_ntype, _, dst_ntype = etype
        # The division stays inside the loop so that num_etypes == 0 (no
        # iterations) never triggers a ZeroDivisionError.
        num_e = num_edges // num_etypes + (
            0 if step != 0 else num_edges % num_etypes
        )
        if ntypes[src_ntype] == 0 or ntypes[dst_ntype] == 0:
            continue
        src = torch.randint(0, ntypes[src_ntype], (num_e,))
        dst = torch.randint(0, ntypes[dst_ntype], (num_e,))

        edges[etype] = (src, dst)

    gb_g = gb.from_dglgraph(dgl.heterograph(edges, ntypes))
    return (
        gb_g.csc_indptr,
        gb_g.indices,
        gb_g.node_type_offset,
        gb_g.type_per_edge,
        gb_g.node_type_to_id,
        gb_g.edge_type_to_id,
    )
88
89
90


def random_homo_graphbolt_graph(
    test_dir, dataset_name, num_nodes, num_edges, num_classes, edge_fmt="csv"
):
    """Generate random graphbolt version homograph.

    Writes random raw data for a homogeneous dataset under ``test_dir``
    (``edges/``, ``data/`` and ``set/`` sub-directories) and returns the
    matching metadata as YAML text. The YAML is returned, not written.

    Parameters
    ----------
    test_dir : str
        Root directory that receives the generated files.
    dataset_name : str
        Value written to the ``dataset_name`` field of the YAML.
    num_nodes : int
        Number of nodes; must be divisible by 4 because the node ID range is
        cut into four equal slices to build the train/validation/test pairs.
    num_edges : int
        Number of edges. Source endpoints are spread evenly over the nodes;
        with ``num_edges == 5 * num_nodes`` every node gets exactly five
        consecutive edges.
    num_classes : int
        Feature width. With ``num_classes == 1`` the node feature array is
        1-D; edge features are always 2-D.
    edge_fmt : {"csv", "numpy"}, optional
        On-disk format of the edge list.

    Returns
    -------
    str
        YAML metadata whose paths are relative to ``test_dir``.

    Raises
    ------
    AssertionError
        If ``edge_fmt`` is unsupported or ``num_nodes`` is not divisible
        by 4. Raised before anything is written to disk.
    """
    # Validate up front so a bad argument never leaves a half-written dataset
    # behind. AssertionError is raised explicitly to keep the exception type
    # callers already see while still being enforced under ``python -O``.
    if edge_fmt not in ("numpy", "csv"):
        raise AssertionError("Only numpy and csv are supported for edges.")
    if num_nodes % 4 != 0:
        raise AssertionError("num_nodes must be divisible by 4")

    # Generate random edges.
    # floor(i * num_nodes / num_edges) equals np.repeat(arange(num_nodes), 5)
    # when num_edges == 5 * num_nodes, and stays within [0, num_nodes) for
    # any other edge count. max(..., 1) only guards the empty case.
    nodes = (
        np.arange(num_edges, dtype=np.int64) * num_nodes // max(num_edges, 1)
    )
    neighbors = np.random.randint(
        0, num_nodes, size=(num_edges), dtype=np.int64
    )
    edges = np.stack([nodes, neighbors], axis=1)
    os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
    if edge_fmt == "csv":
        # Write into edges/edge.csv
        edge_path = os.path.join("edges", "edge.csv")
        pd.DataFrame(edges, columns=["src", "dst"]).to_csv(
            os.path.join(test_dir, edge_path),
            index=False,
            header=False,
        )
    else:
        # Write into edges/edge.npy; the array is stored as (2, num_edges).
        edge_path = os.path.join("edges", "edge.npy")
        np.save(os.path.join(test_dir, edge_path), edges.T)

    # Generate random graph edge-feats.
    edge_feats = np.random.rand(num_edges, num_classes)
    os.makedirs(os.path.join(test_dir, "data"), exist_ok=True)
    edge_feat_path = os.path.join("data", "edge-feat.npy")
    np.save(os.path.join(test_dir, edge_feat_path), edge_feats)

    # Generate random node-feats.
    if num_classes == 1:
        node_feats = np.random.rand(num_nodes)
    else:
        node_feats = np.random.rand(num_nodes, num_classes)
    node_feat_path = os.path.join("data", "node-feat.npy")
    np.save(os.path.join(test_dir, node_feat_path), node_feats)

    # Generate train/test/valid set. Split k pairs the k-th quarter of the
    # node ID range with the (k + 1)-th quarter.
    each_set_size = num_nodes // 4
    os.makedirs(os.path.join(test_dir, "set"), exist_ok=True)
    set_paths = {}
    for k, split in enumerate(("train", "validation", "test")):
        pairs = (
            np.arange(k * each_set_size, (k + 1) * each_set_size),
            np.arange((k + 1) * each_set_size, (k + 2) * each_set_size),
        )
        split_data = np.vstack(pairs).T.astype(edges.dtype)
        set_paths[split] = os.path.join("set", f"{split}.npy")
        np.save(os.path.join(test_dir, set_paths[split]), split_data)
    train_path = set_paths["train"]
    validation_path = set_paths["validation"]
    test_path = set_paths["test"]

    yaml_content = f"""
        dataset_name: {dataset_name}
        graph: # Graph structure and required attributes.
            nodes:
                - num: {num_nodes}
            edges:
                - format: {edge_fmt}
                  path: {edge_path}
            feature_data:
                - domain: node
                  type: null
                  name: feat
                  format: numpy
                  in_memory: true
                  path: {node_feat_path}
                - domain: edge
                  type: null
                  name: feat
                  format: numpy
                  in_memory: true
                  path: {edge_feat_path}
        feature_data:
            - domain: node
              type: null
              name: feat
              format: numpy
              in_memory: true
              path: {node_feat_path}
            - domain: edge
              type: null
              name: feat
              format: numpy
              path: {edge_feat_path}
        tasks:
          - name: link_prediction
            num_classes: {num_classes}
            train_set:
              - type: null
                data:
                  - name: seeds
                    format: numpy
                    in_memory: true
                    path: {train_path}
            validation_set:
              - type: null
                data:
                  - name: seeds
                    format: numpy
                    in_memory: true
                    path: {validation_path}
            test_set:
              - type: null
                data:
                  - name: seeds
                    format: numpy
                    in_memory: true
                    path: {test_path}
    """
    return yaml_content
221
222


223
def generate_raw_data_for_hetero_dataset(
    test_dir, dataset_name, num_nodes, num_edges, num_classes, edge_fmt="csv"
):
    """Write random raw data and ``metadata.yaml`` for a hetero dataset.

    Parameters
    ----------
    test_dir : str
        Root directory that receives ``edges/``, ``data/``, ``set/`` and
        ``metadata.yaml``.
    dataset_name : str
        Value written to the ``dataset_name`` field of the YAML.
    num_nodes : dict
        Maps node type name to node count. The YAML template reads the
        ``"user"`` and ``"item"`` entries, and the seed sets are built from
        ``"user"`` nodes.
    num_edges : dict
        Maps ``(src_type, relation, dst_type)`` to edge count. The YAML
        template reads the files of the ``"follow"`` and ``"click"``
        relations.
    num_classes : int
        Width of every generated node and edge feature array.
    edge_fmt : {"csv", "numpy"}, optional
        On-disk format of the edge lists.

    Raises
    ------
    AssertionError
        If ``edge_fmt`` is unsupported. Raised before anything is written.
    """
    # Checked once, before any file is created. AssertionError is raised
    # explicitly so the check survives ``python -O`` with the same type.
    if edge_fmt not in ("numpy", "csv"):
        raise AssertionError("Only numpy and csv are supported for edges.")

    for sub_dir in ("edges", "data", "set"):
        os.makedirs(os.path.join(test_dir, sub_dir), exist_ok=True)

    # Generate edges.
    edges_path = {}
    for (src_ntype, etype_str, dst_ntype), num_edge in num_edges.items():
        src = torch.randint(0, num_nodes[src_ntype], (num_edge,))
        dst = torch.randint(0, num_nodes[dst_ntype], (num_edge,))
        # Shape (num_edge, 2): one (src, dst) row per edge.
        pairs = np.stack([src.numpy(), dst.numpy()], axis=1)
        if edge_fmt == "csv":
            # Write into edges/<relation>.csv
            edge_path = os.path.join("edges", f"{etype_str}.csv")
            pd.DataFrame(pairs, columns=["src", "dst"]).to_csv(
                os.path.join(test_dir, edge_path),
                index=False,
                header=False,
            )
        else:
            # Write into edges/<relation>.npy, stored as (2, num_edge).
            edge_path = os.path.join("edges", f"{etype_str}.npy")
            np.save(os.path.join(test_dir, edge_path), pairs.T)
        edges_path[etype_str] = edge_path

    # Generate node features.
    node_feats_path = {}
    for ntype, num_node in num_nodes.items():
        node_feat_path = os.path.join("data", f"{ntype}-feat.npy")
        node_feats = np.random.rand(num_node, num_classes)
        np.save(os.path.join(test_dir, node_feat_path), node_feats)
        node_feats_path[ntype] = node_feat_path

    # Generate edge features.
    edge_feats_path = {}
    for (_, etype_str, _), num_edge in num_edges.items():
        edge_feat_path = os.path.join("data", f"{etype_str}-feat.npy")
        edge_feats = np.random.rand(num_edge, num_classes)
        np.save(os.path.join(test_dir, edge_feat_path), edge_feats)
        edge_feats_path[etype_str] = edge_feat_path

    # Generate train/test/valid set: a shuffled 60/20/20 split of user IDs,
    # with the test set taking whatever is left after rounding down.
    num_users = num_nodes["user"]
    user_ids = np.arange(num_users, dtype=np.int64)
    np.random.shuffle(user_ids)
    num_train = int(num_users * 0.6)
    num_validation = int(num_users * 0.2)
    train_path = os.path.join("set", "train.npy")
    np.save(os.path.join(test_dir, train_path), user_ids[:num_train])
    validation_path = os.path.join("set", "validation.npy")
    np.save(
        os.path.join(test_dir, validation_path),
        user_ids[num_train : num_train + num_validation],
    )
    test_path = os.path.join("set", "test.npy")
    np.save(
        os.path.join(test_dir, test_path),
        user_ids[num_train + num_validation :],
    )

    yaml_content = f"""
        dataset_name: {dataset_name}
        graph: # Graph structure and required attributes.
          nodes:
            - type: user
              num: {num_nodes["user"]}
            - type: item
              num: {num_nodes["item"]}
          edges:
            - type: "user:follow:user"
              format: {edge_fmt}
              path: {edges_path["follow"]}
            - type: "user:click:item"
              format: {edge_fmt}
              path: {edges_path["click"]}
          feature_data:
            - domain: node
              type: user
              name: feat
              format: numpy
              in_memory: true
              path: {node_feats_path["user"]}
            - domain: node
              type: item
              name: feat
              format: numpy
              in_memory: true
              path: {node_feats_path["item"]}
            - domain: edge
              type: "user:follow:user"
              name: feat
              format: numpy
              in_memory: true
              path: {edge_feats_path["follow"]}
            - domain: edge
              type: "user:click:item"
              name: feat
              format: numpy
              in_memory: true
              path: {edge_feats_path["click"]}
        feature_data:
          - domain: node
            type: user
            name: feat
            format: numpy
            in_memory: true
            path: {node_feats_path["user"]}
          - domain: node
            type: item
            name: feat
            format: numpy
            in_memory: true
            path: {node_feats_path["item"]}
        tasks:
          - name: node_classification
            num_classes: {num_classes}
            train_set:
              - type: user
                data:
                  - name: seeds
                    format: numpy
                    in_memory: true
                    path: {train_path}
            validation_set:
              - type: user
                data:
                  - name: seeds
                    format: numpy
                    in_memory: true
                    path: {validation_path}
            test_set:
              - type: user
                data:
                  - name: seeds
                    format: numpy
                    in_memory: true
                    path: {test_path}
    """

    yaml_file = os.path.join(test_dir, "metadata.yaml")
    with open(yaml_file, "w") as f:
        f.write(yaml_content)