import os
os.environ['OMP_NUM_THREADS'] = '1'
import dgl
import sys
import numpy as np
import time
import socket
from scipy import sparse as spsp
from numpy.testing import assert_array_equal
from multiprocessing import Process, Manager, Condition, Value
import multiprocessing as mp
from dgl.heterograph_index import create_unitgraph_from_coo
from dgl.data.utils import load_graphs, save_graphs
from dgl.distributed import DistGraphServer, DistGraph
from dgl.distributed import partition_graph, load_partition, load_partition_book, node_split, edge_split
from numpy.testing import assert_almost_equal
import backend as F
import math
import unittest
import pickle
from utils import reset_envs, generate_ip_config
import pytest

if os.name != 'nt':
    import fcntl
    import struct

def create_random_graph(n):
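    """Create a random directed DGL graph with n nodes and ~0.1% edge density."""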
    arr = (spsp.random(n, n, density=0.001, format='coo', random_state=100) != 0).astype(np.int64)
    return dgl.from_scipy(arr)

def run_server(graph_name, server_id, server_count, num_clients, shared_mem, keep_alive=False):
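    """Server-process entry point: load the partition config, verify data types, and serve it."""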
    g = DistGraphServer(server_id, "kv_ip_config.txt", server_count, num_clients,
                        '/tmp/dist_graph/{}.json'.format(graph_name),
                        disable_shared_mem=not shared_mem,
                        graph_format=['csc', 'coo'], keep_alive=keep_alive)
    print('start server', server_id)
    # verify dtype of underlying graph
    cg = g.client_g
    for k, dtype in dgl.distributed.dist_graph.FIELD_DICT.items():
        if k in cg.ndata:
            assert F.dtype(
                cg.ndata[k]) == dtype, "Data type of {} in ndata should be {}.".format(k, dtype)
        if k in cg.edata:
            assert F.dtype(
                cg.edata[k]) == dtype, "Data type of {} in edata should be {}.".format(k, dtype)
    g.start()

def emb_init(shape, dtype):
    return F.zeros(shape, dtype, F.cpu())

def rand_init(shape, dtype):
    return F.tensor(np.random.normal(size=shape), F.float32)

def check_dist_graph_empty(g, num_clients, num_nodes, num_edges):
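    """Check basic DistGraph APIs (node/edge counts, DistTensor creation, read/write) on a graph without features."""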
    # Test API
    assert g.number_of_nodes() == num_nodes
    assert g.number_of_edges() == num_edges

    # Test init node data
    new_shape = (g.number_of_nodes(), 2)
    g.ndata['test1'] = dgl.distributed.DistTensor(new_shape, F.int32)
    nids = F.arange(0, int(g.number_of_nodes() / 2))
    feats = g.ndata['test1'][nids]
    assert np.all(F.asnumpy(feats) == 0)

    # Create a tensor, destroy it, and create it again.
    test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test3', init_func=rand_init)
    del test3
    test3 = dgl.distributed.DistTensor((g.number_of_nodes(), 3), F.float32, 'test3')
    del test3

    # Test write data
    new_feats = F.ones((len(nids), 2), F.int32, F.cpu())
    g.ndata['test1'][nids] = new_feats
    feats = g.ndata['test1'][nids]
    assert np.all(F.asnumpy(feats) == 1)

    # Test metadata operations.
    assert g.node_attr_schemes()['test1'].dtype == F.int32

    print('end')

def run_client_empty(graph_name, part_id, server_count, num_clients, num_nodes, num_edges):
    os.environ['DGL_NUM_SERVER'] = str(server_count)
    dgl.distributed.initialize("kv_ip_config.txt")
    gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
                                                part_id, None)
    g = DistGraph(graph_name, gpb=gpb)
    check_dist_graph_empty(g, num_clients, num_nodes, num_edges)

def check_server_client_empty(shared_mem, num_servers, num_clients):
    prepare_dist(num_servers)
    g = create_random_graph(10000)

    # Partition the graph
    num_parts = 1
    graph_name = 'dist_graph_test_1'
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

    # let's just test on one partition for now.
    # We cannot run multiple servers and clients on the same machine.
    serv_ps = []
    ctx = mp.get_context('spawn')
    for serv_id in range(num_servers):
        p = ctx.Process(target=run_server, args=(graph_name, serv_id, num_servers,
                                                 num_clients, shared_mem))
        serv_ps.append(p)
        p.start()

    cli_ps = []
    for cli_id in range(num_clients):
        print('start client', cli_id)
        p = ctx.Process(target=run_client_empty, args=(graph_name, 0, num_servers, num_clients,
                                                       g.number_of_nodes(), g.number_of_edges()))
        p.start()
        cli_ps.append(p)

    for p in cli_ps:
        p.join()

    for p in serv_ps:
        p.join()

    print('clients have terminated')

def run_client(graph_name, part_id, server_count, num_clients, num_nodes, num_edges, group_id):
    os.environ['DGL_NUM_SERVER'] = str(server_count)
    os.environ['DGL_GROUP_ID'] = str(group_id)
    dgl.distributed.initialize("kv_ip_config.txt")
    gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
                                                part_id, None)
    g = DistGraph(graph_name, gpb=gpb)
    check_dist_graph(g, num_clients, num_nodes, num_edges)

def run_emb_client(graph_name, part_id, server_count, num_clients, num_nodes, num_edges, group_id):
    os.environ['DGL_NUM_SERVER'] = str(server_count)
    os.environ['DGL_GROUP_ID'] = str(group_id)
    dgl.distributed.initialize("kv_ip_config.txt")
    gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
                                                part_id, None)
    g = DistGraph(graph_name, gpb=gpb)
    check_dist_emb(g, num_clients, num_nodes, num_edges)

def run_client_hierarchy(graph_name, part_id, server_count, node_mask, edge_mask, return_dict):
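    """Client-process entry point: split nodes/edges by trainer id and report the result through return_dict."""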
    os.environ['DGL_NUM_SERVER'] = str(server_count)
    dgl.distributed.initialize("kv_ip_config.txt")
    gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
                                                part_id, None)
    g = DistGraph(graph_name, gpb=gpb)
    node_mask = F.tensor(node_mask)
    edge_mask = F.tensor(edge_mask)
    nodes = node_split(node_mask, g.get_partition_book(), node_trainer_ids=g.ndata['trainer_id'])
    edges = edge_split(edge_mask, g.get_partition_book(), edge_trainer_ids=g.edata['trainer_id'])
    rank = g.rank()
    return_dict[rank] = (nodes, edges)

def check_dist_emb(g, num_clients, num_nodes, num_edges):
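    """Check DistEmbedding lookups and sparse optimizer updates on the distributed graph."""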
    from dgl.distributed.optim import SparseAdagrad
    from dgl.distributed import DistEmbedding
    # Test sparse emb
    try:
        emb = DistEmbedding(g.number_of_nodes(), 1, 'emb1', emb_init)
        nids = F.arange(0, int(g.number_of_nodes()))
        lr = 0.001
        optimizer = SparseAdagrad([emb], lr=lr)
        with F.record_grad():
            feats = emb(nids)
            assert np.all(F.asnumpy(feats) == np.zeros((len(nids), 1)))
            loss = F.sum(feats + 1, 0)
        loss.backward()
        optimizer.step()
        feats = emb(nids)
        if num_clients == 1:
            assert_almost_equal(F.asnumpy(feats), np.ones((len(nids), 1)) * -lr)
        rest = np.setdiff1d(np.arange(g.number_of_nodes()), F.asnumpy(nids))
        feats1 = emb(rest)
        assert np.all(F.asnumpy(feats1) == np.zeros((len(rest), 1)))

        policy = dgl.distributed.PartitionPolicy('node', g.get_partition_book())
        grad_sum = dgl.distributed.DistTensor((g.number_of_nodes(), 1), F.float32,
                                              'emb1_sum', policy)
        if num_clients == 1:
            assert np.all(F.asnumpy(grad_sum[nids]) == np.ones((len(nids), 1)) * num_clients)
        assert np.all(F.asnumpy(grad_sum[rest]) == np.zeros((len(rest), 1)))

        emb = DistEmbedding(g.number_of_nodes(), 1, 'emb2', emb_init)
        with F.no_grad():
            feats1 = emb(nids)
        assert np.all(F.asnumpy(feats1) == 0)

        optimizer = SparseAdagrad([emb], lr=lr)
        with F.record_grad():
            feats1 = emb(nids)
            feats2 = emb(nids)
            feats = F.cat([feats1, feats2], 0)
            assert np.all(F.asnumpy(feats) == np.zeros((len(nids) * 2, 1)))
            loss = F.sum(feats + 1, 0)
        loss.backward()
        optimizer.step()
        with F.no_grad():
            feats = emb(nids)
        if num_clients == 1:
            assert_almost_equal(F.asnumpy(feats), np.ones((len(nids), 1)) * 1 * -lr)
        rest = np.setdiff1d(np.arange(g.number_of_nodes()), F.asnumpy(nids))
        feats1 = emb(rest)
        assert np.all(F.asnumpy(feats1) == np.zeros((len(rest), 1)))
    except NotImplementedError as e:
        pass
    except Exception as e:
        print(e)
        sys.exit(-1)

def check_dist_graph(g, num_clients, num_nodes, num_edges):
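    """Check DistGraph APIs: feature access, edge_subgraph, DistTensor lifecycle, data writes, and node_split."""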
    # Test API
    assert g.number_of_nodes() == num_nodes
    assert g.number_of_edges() == num_edges

    # Test reading node data
    nids = F.arange(0, int(g.number_of_nodes() / 2))
    feats1 = g.ndata['features'][nids]
    feats = F.squeeze(feats1, 1)
    assert np.all(F.asnumpy(feats == nids))

    # Test reading edge data
    eids = F.arange(0, int(g.number_of_edges() / 2))
    feats1 = g.edata['features'][eids]
    feats = F.squeeze(feats1, 1)
    assert np.all(F.asnumpy(feats == eids))

    # Test edge_subgraph
    sg = g.edge_subgraph(eids)
    assert sg.num_edges() == len(eids)
    assert F.array_equal(sg.edata[dgl.EID], eids)

    # Test init node data
    new_shape = (g.number_of_nodes(), 2)
    test1 = dgl.distributed.DistTensor(new_shape, F.int32)
    g.ndata['test1'] = test1
    feats = g.ndata['test1'][nids]
    assert np.all(F.asnumpy(feats) == 0)
    assert test1.count_nonzero() == 0

    # Create a reference to a tensor that already exists.
    test2 = dgl.distributed.DistTensor(new_shape, F.float32, 'test2', init_func=rand_init)
    test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test2')
    assert np.all(F.asnumpy(test2[nids]) == F.asnumpy(test3[nids]))

    # Create a tensor, destroy it, and create it again.
    test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test3', init_func=rand_init)
    del test3
    test3 = dgl.distributed.DistTensor((g.number_of_nodes(), 3), F.float32, 'test3')
    del test3

    # Test anonymous distributed tensors.
    test3 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
    data = test3[0:10]
    test4 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
    del test3
    test5 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
    assert np.sum(F.asnumpy(test5[0:10] != data)) > 0

    # Test a persistent tensor.
    test4 = dgl.distributed.DistTensor(new_shape, F.float32, 'test4', init_func=rand_init,
                                       persistent=True)
    del test4
    # 'test4' is persistent, so it still exists on the servers after the local reference
    # is deleted; re-creating it with a mismatched shape is expected to fail.
    try:
        test4 = dgl.distributed.DistTensor((g.number_of_nodes(), 3), F.float32, 'test4')
        recreated = True
    except Exception:
        recreated = False
    assert not recreated, "Re-creating persistent tensor 'test4' with a different shape should fail."

    # Test write data
    new_feats = F.ones((len(nids), 2), F.int32, F.cpu())
    g.ndata['test1'][nids] = new_feats
    feats = g.ndata['test1'][nids]
    assert np.all(F.asnumpy(feats) == 1)

    # Test metadata operations.
    assert len(g.ndata['features']) == g.number_of_nodes()
    assert g.ndata['features'].shape == (g.number_of_nodes(), 1)
    assert g.ndata['features'].dtype == F.int64
    assert g.node_attr_schemes()['features'].dtype == F.int64
    assert g.node_attr_schemes()['test1'].dtype == F.int32
    assert g.node_attr_schemes()['features'].shape == (1,)

    selected_nodes = np.random.randint(0, 100, size=g.number_of_nodes()) > 30
    # Test node split
    nodes = node_split(selected_nodes, g.get_partition_book())
    nodes = F.asnumpy(nodes)
    # We only have one partition, so the local nodes are all the nodes in the graph.
    local_nids = np.arange(g.number_of_nodes())
    for n in nodes:
        assert n in local_nids

    print('end')

def check_dist_emb_server_client(shared_mem, num_servers, num_clients, num_groups=1):
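    """Partition a random graph, launch server and client processes, and run the DistEmbedding checks."""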
    prepare_dist(num_servers)
    g = create_random_graph(10000)

    # Partition the graph
    num_parts = 1
    graph_name = f'check_dist_emb_{shared_mem}_{num_servers}_{num_clients}_{num_groups}'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

    # let's just test on one partition for now.
    # We cannot run multiple servers and clients on the same machine.
    serv_ps = []
    ctx = mp.get_context('spawn')
    keep_alive = num_groups > 1
    for serv_id in range(num_servers):
        p = ctx.Process(target=run_server, args=(graph_name, serv_id, num_servers,
                                                 num_clients, shared_mem, keep_alive))
        serv_ps.append(p)
        p.start()

    cli_ps = []
    for cli_id in range(num_clients):
        for group_id in range(num_groups):
            print('start client[{}] for group[{}]'.format(cli_id, group_id))
            p = ctx.Process(target=run_emb_client, args=(graph_name, 0, num_servers, num_clients,
                                                        g.number_of_nodes(),
                                                        g.number_of_edges(),
                                                        group_id))
            p.start()
            time.sleep(1) # avoid race condition when instantiating DistGraph
            cli_ps.append(p)

    for p in cli_ps:
        p.join()
        assert p.exitcode == 0

    if keep_alive:
        for p in serv_ps:
            assert p.is_alive()
        # force shutdown server
        dgl.distributed.shutdown_servers("kv_ip_config.txt", num_servers)
    for p in serv_ps:
        p.join()

    print('clients have terminated')

def check_server_client(shared_mem, num_servers, num_clients, num_groups=1):
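    """Partition a random graph, launch server and client processes, and run the DistGraph checks."""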
    prepare_dist(num_servers)
    g = create_random_graph(10000)

    # Partition the graph
    num_parts = 1
    graph_name = f'check_server_client_{shared_mem}_{num_servers}_{num_clients}_{num_groups}'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

    # let's just test on one partition for now.
    # We cannot run multiple servers and clients on the same machine.
    serv_ps = []
    ctx = mp.get_context('spawn')
    keep_alive = num_groups > 1
    for serv_id in range(num_servers):
        p = ctx.Process(target=run_server, args=(graph_name, serv_id, num_servers,
                                                 num_clients, shared_mem, keep_alive))
        serv_ps.append(p)
        p.start()

    # launch different client groups simultaneously
    cli_ps = []
    for cli_id in range(num_clients):
        for group_id in range(num_groups):
            print('start client[{}] for group[{}]'.format(cli_id, group_id))
            p = ctx.Process(target=run_client, args=(graph_name, 0, num_servers, num_clients, g.number_of_nodes(),
                                                    g.number_of_edges(), group_id))
            p.start()
            time.sleep(1) # avoid race condition when instantiating DistGraph
            cli_ps.append(p)
    for p in cli_ps:
        p.join()

    if keep_alive:
        for p in serv_ps:
            assert p.is_alive()
        # force shutdown server
        dgl.distributed.shutdown_servers("kv_ip_config.txt", num_servers)
    for p in serv_ps:
        p.join()

    print('clients have terminated')

def check_server_client_hierarchy(shared_mem, num_servers, num_clients):
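    """Check node_split/edge_split with trainer ids across multiple clients on one partition."""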
    prepare_dist(num_servers)
    g = create_random_graph(10000)

    # Partition the graph
    num_parts = 1
    graph_name = 'dist_graph_test_2'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph', num_trainers_per_machine=num_clients)

    # let's just test on one partition for now.
    # We cannot run multiple servers and clients on the same machine.
    serv_ps = []
    ctx = mp.get_context('spawn')
    for serv_id in range(num_servers):
        p = ctx.Process(target=run_server, args=(graph_name, serv_id, num_servers,
                                                 num_clients, shared_mem))
        serv_ps.append(p)
        p.start()

    cli_ps = []
    manager = mp.Manager()
    return_dict = manager.dict()
    node_mask = np.zeros((g.number_of_nodes(),), np.int32)
    edge_mask = np.zeros((g.number_of_edges(),), np.int32)
    nodes = np.random.choice(g.number_of_nodes(), g.number_of_nodes() // 10, replace=False)
    edges = np.random.choice(g.number_of_edges(), g.number_of_edges() // 10, replace=False)
    node_mask[nodes] = 1
    edge_mask[edges] = 1
    nodes = np.sort(nodes)
    edges = np.sort(edges)
    for cli_id in range(num_clients):
        print('start client', cli_id)
        p = ctx.Process(target=run_client_hierarchy, args=(graph_name, 0, num_servers,
                                                           node_mask, edge_mask, return_dict))
        p.start()
        cli_ps.append(p)

    for p in cli_ps:
        p.join()
    for p in serv_ps:
        p.join()

    nodes1 = []
    edges1 = []
    for n, e in return_dict.values():
        nodes1.append(n)
        edges1.append(e)
    nodes1, _ = F.sort_1d(F.cat(nodes1, 0))
    edges1, _ = F.sort_1d(F.cat(edges1, 0))
    assert np.all(F.asnumpy(nodes1) == nodes)
    assert np.all(F.asnumpy(edges1) == edges)

    print('clients have terminated')


def run_client_hetero(graph_name, part_id, server_count, num_clients, num_nodes, num_edges):
    os.environ['DGL_NUM_SERVER'] = str(server_count)
    dgl.distributed.initialize("kv_ip_config.txt")
    gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
                                                part_id, None)
    g = DistGraph(graph_name, gpb=gpb)
    check_dist_graph_hetero(g, num_clients, num_nodes, num_edges)

def create_random_hetero():
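    """Create a random heterogeneous graph with three node types and three edge types."""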
    num_nodes = {'n1': 10000, 'n2': 10010, 'n3': 10020}
    etypes = [('n1', 'r1', 'n2'),
              ('n1', 'r2', 'n3'),
              ('n2', 'r3', 'n3')]
    edges = {}
    for etype in etypes:
        src_ntype, _, dst_ntype = etype
        arr = spsp.random(num_nodes[src_ntype], num_nodes[dst_ntype], density=0.001, format='coo',
                          random_state=100)
        edges[etype] = (arr.row, arr.col)
    g = dgl.heterograph(edges, num_nodes)
    g.nodes['n1'].data['feat'] = F.unsqueeze(F.arange(0, g.number_of_nodes('n1')), 1)
    g.edges['r1'].data['feat'] = F.unsqueeze(F.arange(0, g.number_of_edges('r1')), 1)
    return g

def check_dist_graph_hetero(g, num_clients, num_nodes, num_edges):
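    """Check DistGraph APIs on a heterogeneous graph: per-type counts, feature access, and DistTensor lifecycle."""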
    # Test API
    for ntype in num_nodes:
        assert ntype in g.ntypes
        assert num_nodes[ntype] == g.number_of_nodes(ntype)
    for etype in num_edges:
        assert etype in g.etypes
        assert num_edges[etype] == g.number_of_edges(etype)
    etypes = [('n1', 'r1', 'n2'),
              ('n1', 'r2', 'n3'),
              ('n2', 'r3', 'n3')]
    for i, etype in enumerate(g.canonical_etypes):
        assert etype[0] == etypes[i][0]
        assert etype[1] == etypes[i][1]
        assert etype[2] == etypes[i][2]
    assert g.number_of_nodes() == sum([num_nodes[ntype] for ntype in num_nodes])
    assert g.number_of_edges() == sum([num_edges[etype] for etype in num_edges])

    # Test reading node data
    nids = F.arange(0, int(g.number_of_nodes('n1') / 2))
    feats1 = g.nodes['n1'].data['feat'][nids]
    feats = F.squeeze(feats1, 1)
    assert np.all(F.asnumpy(feats == nids))

    # Test reading edge data
    eids = F.arange(0, int(g.number_of_edges('r1') / 2))
    feats1 = g.edges['r1'].data['feat'][eids]
    feats = F.squeeze(feats1, 1)
    assert np.all(F.asnumpy(feats == eids))

    # Test edge_subgraph
    sg = g.edge_subgraph({'r1': eids})
    assert sg.num_edges() == len(eids)
    assert F.array_equal(sg.edata[dgl.EID], eids)
    sg = g.edge_subgraph({('n1', 'r1', 'n2'): eids})
    assert sg.num_edges() == len(eids)
    assert F.array_equal(sg.edata[dgl.EID], eids)

    # Test init node data
    new_shape = (g.number_of_nodes('n1'), 2)
    g.nodes['n1'].data['test1'] = dgl.distributed.DistTensor(new_shape, F.int32)
    feats = g.nodes['n1'].data['test1'][nids]
    assert np.all(F.asnumpy(feats) == 0)

    # Create a tensor, destroy it, and create it again.
    test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test3', init_func=rand_init)
    del test3
    test3 = dgl.distributed.DistTensor((g.number_of_nodes('n1'), 3), F.float32, 'test3')
    del test3

    # Test anonymous distributed tensors.
    test3 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
    data = test3[0:10]
    test4 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
    del test3
    test5 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init)
    assert np.sum(F.asnumpy(test5[0:10] != data)) > 0

    # Test a persistent tensor.
    test4 = dgl.distributed.DistTensor(new_shape, F.float32, 'test4', init_func=rand_init,
                                       persistent=True)
    del test4
    # 'test4' is persistent, so it still exists on the servers after the local reference
    # is deleted; re-creating it with a mismatched shape is expected to fail.
    try:
        test4 = dgl.distributed.DistTensor((g.number_of_nodes('n1'), 3), F.float32, 'test4')
        recreated = True
    except Exception:
        recreated = False
    assert not recreated, "Re-creating persistent tensor 'test4' with a different shape should fail."

    # Test write data
    new_feats = F.ones((len(nids), 2), F.int32, F.cpu())
    g.nodes['n1'].data['test1'][nids] = new_feats
    feats = g.nodes['n1'].data['test1'][nids]
    assert np.all(F.asnumpy(feats) == 1)

    # Test metadata operations.
    assert len(g.nodes['n1'].data['feat']) == g.number_of_nodes('n1')
    assert g.nodes['n1'].data['feat'].shape == (g.number_of_nodes('n1'), 1)
    assert g.nodes['n1'].data['feat'].dtype == F.int64

    selected_nodes = np.random.randint(0, 100, size=g.number_of_nodes('n1')) > 30
    # Test node split
    nodes = node_split(selected_nodes, g.get_partition_book(), ntype='n1')
    nodes = F.asnumpy(nodes)
    # We only have one partition, so the local nodes are all the nodes in the graph.
    local_nids = np.arange(g.number_of_nodes('n1'))
    for n in nodes:
        assert n in local_nids

    print('end')

def check_server_client_hetero(shared_mem, num_servers, num_clients):
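    """Partition a random heterogeneous graph, launch servers and clients, and run the heterogeneous checks."""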
    prepare_dist(num_servers)
    g = create_random_hetero()

    # Partition the graph
    num_parts = 1
    graph_name = 'dist_graph_test_3'
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

    # let's just test on one partition for now.
    # We cannot run multiple servers and clients on the same machine.
    serv_ps = []
    ctx = mp.get_context('spawn')
    for serv_id in range(num_servers):
        p = ctx.Process(target=run_server, args=(graph_name, serv_id, num_servers,
                                                 num_clients, shared_mem))
        serv_ps.append(p)
        p.start()

    cli_ps = []
    num_nodes = {ntype: g.number_of_nodes(ntype) for ntype in g.ntypes}
    num_edges = {etype: g.number_of_edges(etype) for etype in g.etypes}
    for cli_id in range(num_clients):
        print('start client', cli_id)
        p = ctx.Process(target=run_client_hetero, args=(graph_name, 0, num_servers, num_clients, num_nodes,
                                                        num_edges))
        p.start()
        cli_ps.append(p)

    for p in cli_ps:
        p.join()

    for p in serv_ps:
        p.join()

    print('clients have terminated')

@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="TF doesn't support some of operations in DistGraph")
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Turn off Mxnet support")
def test_server_client():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    check_server_client_hierarchy(False, 1, 4)
    check_server_client_empty(True, 1, 1)
    check_server_client_hetero(True, 1, 1)
    check_server_client_hetero(False, 1, 1)
    check_server_client(True, 1, 1)
    check_server_client(False, 1, 1)
    # [TODO][Rhett] Tests for multiple groups may fail sometimes and
    # root cause is unknown. Let's disable them for now.
    #check_server_client(True, 2, 2)
    #check_server_client(True, 1, 1, 2)
    #check_server_client(False, 1, 1, 2)
    #check_server_client(True, 2, 2, 2)

@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="TF doesn't support distributed DistEmbedding")
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Mxnet doesn't support distributed DistEmbedding")
def test_dist_emb_server_client():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    check_dist_emb_server_client(True, 1, 1)
    check_dist_emb_server_client(False, 1, 1)
    # [TODO][Rhett] Tests for multiple groups may fail sometimes and
    # root cause is unknown. Let's disable them for now.
    #check_dist_emb_server_client(True, 2, 2)
    #check_dist_emb_server_client(True, 1, 1, 2)
    #check_dist_emb_server_client(False, 1, 1, 2)
    #check_dist_emb_server_client(True, 2, 2, 2)

@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="TF doesn't support some of operations in DistGraph")
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Turn off Mxnet support")
def test_standalone():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'standalone'

    g = create_random_graph(10000)
    # Partition the graph
    num_parts = 1
    graph_name = 'dist_graph_test_3'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

    dgl.distributed.initialize("kv_ip_config.txt")
    dist_g = DistGraph(graph_name, part_config='/tmp/dist_graph/{}.json'.format(graph_name))
    check_dist_graph(dist_g, 1, g.number_of_nodes(), g.number_of_edges())
    dgl.distributed.exit_client() # This is needed since multiple tests run in one process.

@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="TF doesn't support distributed DistEmbedding")
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Mxnet doesn't support distributed DistEmbedding")
def test_standalone_node_emb():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'standalone'

    g = create_random_graph(10000)
    # Partition the graph
    num_parts = 1
    graph_name = 'dist_graph_test_3'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

    dgl.distributed.initialize("kv_ip_config.txt")
    dist_g = DistGraph(graph_name, part_config='/tmp/dist_graph/{}.json'.format(graph_name))
    check_dist_emb(dist_g, 1, g.number_of_nodes(), g.number_of_edges())
    dgl.distributed.exit_client() # This is needed since multiple tests run in one process.

@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@pytest.mark.parametrize("hetero", [True, False])
def test_split(hetero):
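    """Check that node_split/edge_split with force_even=False return only nodes/edges local to each partition."""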
    if hetero:
        g = create_random_hetero()
        ntype = 'n1'
        etype = 'r1'
    else:
        g = create_random_graph(10000)
        ntype = '_N'
        etype = '_E'
    num_parts = 4
    num_hops = 2
    partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph', num_hops=num_hops, part_method='metis')

    node_mask = np.random.randint(0, 100, size=g.number_of_nodes(ntype)) > 30
    edge_mask = np.random.randint(0, 100, size=g.number_of_edges(etype)) > 30
    selected_nodes = np.nonzero(node_mask)[0]
    selected_edges = np.nonzero(edge_mask)[0]

    # The code now collects the roles of all client processes and uses the information
    # to determine how to split the workloads. Here we simulate the multi-client
    # use case.
    def set_roles(num_clients):
        dgl.distributed.role.CUR_ROLE = 'default'
        dgl.distributed.role.GLOBAL_RANK = {i:i for i in range(num_clients)}
        dgl.distributed.role.PER_ROLE_RANK['default'] = {i:i for i in range(num_clients)}

    for i in range(num_parts):
        set_roles(num_parts)
        part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition('/tmp/dist_graph/dist_graph_test.json', i)
        local_nids = F.nonzero_1d(part_g.ndata['inner_node'])
        local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids)
        if hetero:
            ntype_ids, nids = gpb.map_to_per_ntype(local_nids)
            local_nids = F.asnumpy(nids)[F.asnumpy(ntype_ids) == 0]
        else:
            local_nids = F.asnumpy(local_nids)
        nodes1 = np.intersect1d(selected_nodes, local_nids)
        nodes2 = node_split(node_mask, gpb, ntype=ntype, rank=i, force_even=False)
        assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes2)))
        for n in F.asnumpy(nodes2):
            assert n in local_nids

        set_roles(num_parts * 2)
        nodes3 = node_split(node_mask, gpb, ntype=ntype, rank=i * 2, force_even=False)
        nodes4 = node_split(node_mask, gpb, ntype=ntype, rank=i * 2 + 1, force_even=False)
        nodes5 = F.cat([nodes3, nodes4], 0)
        assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes5)))

        set_roles(num_parts)
        local_eids = F.nonzero_1d(part_g.edata['inner_edge'])
        local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids)
        if hetero:
            etype_ids, eids = gpb.map_to_per_etype(local_eids)
            local_eids = F.asnumpy(eids)[F.asnumpy(etype_ids) == 0]
        else:
            local_eids = F.asnumpy(local_eids)
        edges1 = np.intersect1d(selected_edges, local_eids)
        edges2 = edge_split(edge_mask, gpb, etype=etype, rank=i, force_even=False)
        assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges2)))
        for e in F.asnumpy(edges2):
            assert e in local_eids

        set_roles(num_parts * 2)
        edges3 = edge_split(edge_mask, gpb, etype=etype, rank=i * 2, force_even=False)
        edges4 = edge_split(edge_mask, gpb, etype=etype, rank=i * 2 + 1, force_even=False)
        edges5 = F.cat([edges3, edges4], 0)
        assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges5)))

@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_split_even():
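    """Check that node_split/edge_split with force_even=True cover all selected nodes/edges across partitions."""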
    g = create_random_graph(10000)
    num_parts = 4
    num_hops = 2
    partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph', num_hops=num_hops, part_method='metis')

    node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30
    edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30
    selected_nodes = np.nonzero(node_mask)[0]
    selected_edges = np.nonzero(edge_mask)[0]
    all_nodes1 = []
    all_nodes2 = []
    all_edges1 = []
    all_edges2 = []

    # The code now collects the roles of all client processes and uses the information
    # to determine how to split the workloads. Here we simulate the multi-client
    # use case.
    def set_roles(num_clients):
        dgl.distributed.role.CUR_ROLE = 'default'
        dgl.distributed.role.GLOBAL_RANK = {i:i for i in range(num_clients)}
        dgl.distributed.role.PER_ROLE_RANK['default'] = {i:i for i in range(num_clients)}

    for i in range(num_parts):
        set_roles(num_parts)
        part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition('/tmp/dist_graph/dist_graph_test.json', i)
        local_nids = F.nonzero_1d(part_g.ndata['inner_node'])
        local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids)
        nodes = node_split(node_mask, gpb, rank=i, force_even=True)
        all_nodes1.append(nodes)
        subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(local_nids))
        print('part {} get {} nodes and {} are in the partition'.format(i, len(nodes), len(subset)))

        set_roles(num_parts * 2)
        nodes1 = node_split(node_mask, gpb, rank=i * 2, force_even=True)
        nodes2 = node_split(node_mask, gpb, rank=i * 2 + 1, force_even=True)
        nodes3, _ = F.sort_1d(F.cat([nodes1, nodes2], 0))
        all_nodes2.append(nodes3)
        subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(nodes3))
        print('intersection has', len(subset))

        set_roles(num_parts)
        local_eids = F.nonzero_1d(part_g.edata['inner_edge'])
        local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids)
        edges = edge_split(edge_mask, gpb, rank=i, force_even=True)
        all_edges1.append(edges)
        subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(local_eids))
        print('part {} get {} edges and {} are in the partition'.format(i, len(edges), len(subset)))

        set_roles(num_parts * 2)
        edges1 = edge_split(edge_mask, gpb, rank=i * 2, force_even=True)
        edges2 = edge_split(edge_mask, gpb, rank=i * 2 + 1, force_even=True)
        edges3, _ = F.sort_1d(F.cat([edges1, edges2], 0))
        all_edges2.append(edges3)
        subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(edges3))
        print('intersection has', len(subset))
    all_nodes1 = F.cat(all_nodes1, 0)
    all_edges1 = F.cat(all_edges1, 0)
    all_nodes2 = F.cat(all_nodes2, 0)
    all_edges2 = F.cat(all_edges2, 0)
    all_nodes = np.nonzero(node_mask)[0]
    all_edges = np.nonzero(edge_mask)[0]
    assert np.all(all_nodes == F.asnumpy(all_nodes1))
    assert np.all(all_edges == F.asnumpy(all_edges1))
    assert np.all(all_nodes == F.asnumpy(all_nodes2))
    assert np.all(all_edges == F.asnumpy(all_edges2))

def prepare_dist(num_servers=1):
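    """Generate the IP config file 'kv_ip_config.txt' used by the servers and clients."""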
    generate_ip_config("kv_ip_config.txt", 1, num_servers=num_servers)

if __name__ == '__main__':
    os.makedirs('/tmp/dist_graph', exist_ok=True)
    test_dist_emb_server_client()
    test_server_client()
    test_split(True)
    test_split(False)
    test_split_even()
    test_standalone()
    test_standalone_node_emb()