# test_sampling.py
import dgl
import backend as F
import numpy as np
import unittest
5
from collections import defaultdict
6
import pytest
7

Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
8
def check_random_walk(g, metapath, traces, ntypes, prob=None, trace_eids=None):
    """Assert that sampled random-walk traces are consistent with ``g``.

    Parameters
    ----------
    g : graph the traces were sampled from.
    metapath : list of edge types; step ``j`` of every walk must follow
        ``metapath[j]``.
    traces : 2-D tensor of node IDs, indexed ``[walk, step]``.
    ntypes : tensor of node type IDs, one per trace column.
    prob : optional name of an edge feature.  For every edge type that
        carries this feature, each traversed edge must have a non-zero value.
    trace_eids : optional tensor of edge IDs indexed ``[walk, step]``; when
        given, each edge must connect the two consecutive trace nodes.

    Raises
    ------
    AssertionError
        If any of the checks above fails.
    """
    traces = F.asnumpy(traces)
    ntypes = F.asnumpy(ntypes)
    num_steps = traces.shape[1] - 1

    # Node types along the trace must match the endpoints of each metapath step.
    for j in range(num_steps):
        src_type, _, dst_type = g.to_canonical_etype(metapath[j])
        assert ntypes[j] == g.get_ntype_id(src_type)
        assert ntypes[j + 1] == g.get_ntype_id(dst_type)

    for i in range(traces.shape[0]):
        for j in range(num_steps):
            etype = metapath[j]
            assert g.has_edges_between(
                traces[i, j], traces[i, j + 1], etype=etype)
            if prob is not None and prob in g.edges[etype].data:
                # Look the feature up under the caller-supplied name rather
                # than a hard-coded key, so any feature name is checked.
                p = F.asnumpy(g.edges[etype].data[prob])
                eids = g.edge_ids(traces[i, j], traces[i, j + 1], etype=etype)
                assert p[eids] != 0
            if trace_eids is not None:
                u, v = g.find_edges(trace_eids[i, j], etype=etype)
                assert (u == traces[i, j]) and (v == traces[i, j + 1])
26

27
28
29
30
31
32
33
@pytest.mark.parametrize('use_uva', [True, False])
def test_non_uniform_random_walk(use_uva):
    """Check random walks that use an edge feature (``prob='p'``) as weight.

    With ``use_uva`` the graphs are pinned in place before sampling; that
    variant is skipped on CPU and on non-PyTorch backends.  Otherwise the
    graphs are moved to the default context when it is a GPU.
    """
    if use_uva:
        if F.ctx() == F.cpu():
            pytest.skip('UVA biased random walk requires a GPU.')
        if dgl.backend.backend_name != 'pytorch':
            pytest.skip('UVA biased random walk is only supported with PyTorch.')
    g2 = dgl.heterograph({
            ('user', 'follow', 'user'): ([0, 1, 1, 2, 3], [1, 2, 3, 0, 0])
        })
    g4 = dgl.heterograph({
            ('user', 'follow', 'user'): ([0, 1, 1, 2, 3], [1, 2, 3, 0, 0]),
            ('user', 'view', 'item'): ([0, 0, 1, 2, 3, 3], [0, 1, 1, 2, 2, 1]),
            ('item', 'viewed-by', 'user'): ([0, 1, 1, 2, 2, 1], [0, 0, 1, 2, 3, 3])
        })

    # Edge weights are created on CPU; the second 'follow' edge has weight 0
    # and must therefore never be traversed.  'p2' is a 2-D variant that
    # random_walk is expected to reject.
    g2.edata['p'] = F.copy_to(F.tensor([3, 0, 3, 3, 3], dtype=F.float32), F.cpu())
    g2.edata['p2'] = F.copy_to(F.tensor([[3], [0], [3], [3], [3]], dtype=F.float32), F.cpu())
    g4.edges['follow'].data['p'] = F.copy_to(F.tensor([3, 0, 3, 3, 3], dtype=F.float32), F.cpu())
    g4.edges['viewed-by'].data['p'] = F.copy_to(F.tensor([1, 1, 1, 1, 1, 1], dtype=F.float32), F.cpu())

    if use_uva:
        for g in (g2, g4):
            g.create_formats_()
            g.pin_memory_()
    elif F._default_context_str == 'gpu':
        g2 = g2.to(F.ctx())
        g4 = g4.to(F.ctx())

    try:
        traces, eids, ntypes = dgl.sampling.random_walk(
            g2, F.tensor([0, 1, 2, 3, 0, 1, 2, 3], dtype=g2.idtype),
            length=4, prob='p', return_eids=True)
        check_random_walk(g2, ['follow'] * 4, traces, ntypes, 'p', trace_eids=eids)

        with pytest.raises(dgl.DGLError):
            dgl.sampling.random_walk(
                g2, F.tensor([0, 1, 2, 3, 0, 1, 2, 3], dtype=g2.idtype),
                length=4, prob='p2')

        metapath = ['follow', 'view', 'viewed-by'] * 2
        traces, eids, ntypes = dgl.sampling.random_walk(
            g4, F.tensor([0, 1, 2, 3, 0, 1, 2, 3], dtype=g4.idtype),
            metapath=metapath, prob='p', return_eids=True)
        check_random_walk(g4, metapath, traces, ntypes, 'p', trace_eids=eids)
        traces, eids, ntypes = dgl.sampling.random_walk(
            g4, F.tensor([0, 1, 2, 3, 0, 1, 2, 3], dtype=g4.idtype),
            metapath=metapath, prob='p', restart_prob=0., return_eids=True)
        check_random_walk(g4, metapath, traces, ntypes, 'p', trace_eids=eids)
        traces, eids, ntypes = dgl.sampling.random_walk(
            g4, F.tensor([0, 1, 2, 3, 0, 1, 2, 3], dtype=g4.idtype),
            metapath=metapath, prob='p',
            restart_prob=F.zeros((6,), F.float32, F.ctx()), return_eids=True)
        check_random_walk(g4, metapath, traces, ntypes, 'p', trace_eids=eids)
        # The extra seventh step has restart probability 1, so the last
        # trace column is expected to hold only the -1 padding value.
        traces, eids, ntypes = dgl.sampling.random_walk(
            g4, F.tensor([0, 1, 2, 3, 0, 1, 2, 3], dtype=g4.idtype),
            metapath=metapath + ['follow'], prob='p',
            restart_prob=F.tensor([0, 0, 0, 0, 0, 0, 1], F.float32), return_eids=True)
        check_random_walk(g4, metapath, traces[:, :7], ntypes[:7], 'p', trace_eids=eids)
        assert (F.asnumpy(traces[:, 7]) == -1).all()
    finally:    # make sure to unpin the graphs even if some test fails
        for g in (g2, g4):
            # Only pinned graphs need unpinning (same guard as the other
            # UVA tests in this file).
            if g.is_pinned():
                g.unpin_memory_()

@pytest.mark.parametrize('use_uva', [True, False])
def test_uniform_random_walk(use_uva):
    """Check unweighted random walks on four small user/item graphs.

    With ``use_uva`` the graphs are pinned in place (skipped on CPU);
    otherwise they are moved to the default context when it is a GPU.
    """
    if use_uva and F.ctx() == F.cpu():
        pytest.skip('UVA random walk requires a GPU.')

    g1 = dgl.heterograph({('user', 'follow', 'user'): ([0, 1, 2], [1, 2, 0])})
    g2 = dgl.heterograph({('user', 'follow', 'user'): ([0, 1, 1, 2, 3], [1, 2, 3, 0, 0])})
    g3 = dgl.heterograph({
        ('user', 'follow', 'user'): ([0, 1, 2], [1, 2, 0]),
        ('user', 'view', 'item'): ([0, 1, 2], [0, 1, 2]),
        ('item', 'viewed-by', 'user'): ([0, 1, 2], [0, 1, 2]),
    })
    g4 = dgl.heterograph({
        ('user', 'follow', 'user'): ([0, 1, 1, 2, 3], [1, 2, 3, 0, 0]),
        ('user', 'view', 'item'): ([0, 0, 1, 2, 3, 3], [0, 1, 1, 2, 2, 1]),
        ('item', 'viewed-by', 'user'): ([0, 1, 1, 2, 2, 1], [0, 0, 1, 2, 3, 3]),
    })

    if use_uva:
        for graph in (g1, g2, g3, g4):
            graph.create_formats_()
            graph.pin_memory_()
    elif F._default_context_str == 'gpu':
        g1, g2, g3, g4 = [graph.to(F.ctx()) for graph in (g1, g2, g3, g4)]

    def _seeds(graph, node_ids):
        # Seed tensor using the graph's own index dtype.
        return F.tensor(node_ids, dtype=graph.idtype)

    three_cycle = [0, 1, 2, 0, 1, 2]
    four_nodes = [0, 1, 2, 3, 0, 1, 2, 3]
    follow_x4 = ['follow'] * 4
    metapath = ['follow', 'view', 'viewed-by'] * 2

    try:
        # --- g1: homogeneous 3-cycle ---
        traces, eids, ntypes = dgl.sampling.random_walk(
            g1, _seeds(g1, three_cycle), length=4, return_eids=True)
        check_random_walk(g1, follow_x4, traces, ntypes, trace_eids=eids)

        if F._default_context_str == 'cpu':
            # Seed node 10 does not exist in g1.
            with pytest.raises(dgl.DGLError):
                dgl.sampling.random_walk(
                    g1, _seeds(g1, [0, 1, 2, 10]), length=4, return_eids=True)

        traces, eids, ntypes = dgl.sampling.random_walk(
            g1, _seeds(g1, three_cycle), length=4, restart_prob=0., return_eids=True)
        check_random_walk(g1, follow_x4, traces, ntypes, trace_eids=eids)

        traces, ntypes = dgl.sampling.random_walk(
            g1, _seeds(g1, three_cycle), length=4,
            restart_prob=F.zeros((4,), F.float32))
        check_random_walk(g1, follow_x4, traces, ntypes)

        # Restart probability 1 on the fifth step: the last column is
        # expected to contain only the -1 padding value.
        traces, ntypes = dgl.sampling.random_walk(
            g1, _seeds(g1, three_cycle), length=5,
            restart_prob=F.tensor([0, 0, 0, 0, 1], dtype=F.float32))
        check_random_walk(
            g1, follow_x4,
            F.slice_axis(traces, 1, 0, 5), F.slice_axis(ntypes, 0, 0, 5))
        assert (F.asnumpy(traces)[:, 5] == -1).all()

        # --- g2: homogeneous graph with branching ---
        traces, eids, ntypes = dgl.sampling.random_walk(
            g2, _seeds(g2, four_nodes), length=4, return_eids=True)
        check_random_walk(g2, follow_x4, traces, ntypes, trace_eids=eids)

        # --- g3 / g4: metapath walks on heterogeneous graphs ---
        traces, eids, ntypes = dgl.sampling.random_walk(
            g3, _seeds(g3, three_cycle), metapath=metapath, return_eids=True)
        check_random_walk(g3, metapath, traces, ntypes, trace_eids=eids)

        traces, eids, ntypes = dgl.sampling.random_walk(
            g4, _seeds(g4, four_nodes), metapath=metapath, return_eids=True)
        check_random_walk(g4, metapath, traces, ntypes, trace_eids=eids)

        traces, eids, ntypes = dgl.sampling.random_walk(
            g4, _seeds(g4, three_cycle), metapath=metapath, return_eids=True)
        check_random_walk(g4, metapath, traces, ntypes, trace_eids=eids)
    finally:
        # Unpin even when an assertion above failed.
        for graph in (g1, g2, g3, g4):
            if graph.is_pinned():
                graph.unpin_memory_()

Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU random walk not implemented")
def test_node2vec():
    """Check node2vec random walks, without and with an edge weight feature."""
    cycle_graph = dgl.heterograph({
        ('user', 'follow', 'user'): ([0, 1, 2], [1, 2, 0]),
    })
    weighted_graph = dgl.heterograph({
        ('user', 'follow', 'user'): ([0, 1, 1, 2, 3], [1, 2, 3, 0, 0]),
    })
    weighted_graph.edata['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32)

    # node2vec_random_walk returns no node types, so an all-zero type vector
    # (one entry per trace column) is handed to the checker instead.
    all_zero_types = F.zeros((5,), dtype=F.int64)
    walk_metapath = ['follow'] * 4

    traces, eids = dgl.sampling.node2vec_random_walk(
        cycle_graph, [0, 1, 2, 0, 1, 2], 1, 1, 4, return_eids=True)
    check_random_walk(cycle_graph, walk_metapath, traces, all_zero_types, trace_eids=eids)

    traces, eids = dgl.sampling.node2vec_random_walk(
        weighted_graph, [0, 1, 2, 3, 0, 1, 2, 3], 1, 1, 4, prob='p', return_eids=True)
    check_random_walk(
        weighted_graph, walk_metapath, traces, all_zero_types, 'p', trace_eids=eids)

183
184
185
186
187
188
189
190
191
192
193
194
195
196
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU pack traces not implemented")
def test_pack_traces():
    """Check ``dgl.sampling.pack_traces`` on two -1-padded traces."""
    padded_traces = np.array(
        [[0, 1, -1, -1, -1, -1, -1],
         [0, 1, 1, 3, 0, 0, 0]], dtype='int64')
    step_types = np.array([0, 0, 1, 0, 0, 1, 0], dtype='int64')

    packed = dgl.sampling.pack_traces(
        F.zerocopy_from_numpy(padded_traces),
        F.zerocopy_from_numpy(step_types))

    # Expected values for the first four outputs, in return order.
    expected = (
        [0, 1, 0, 1, 1, 3, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0],
        [2, 7],
        [0, 2],
    )
    for position, values in enumerate(expected):
        assert F.array_equal(packed[position], F.tensor(values, dtype=F.int64))

197
@pytest.mark.parametrize('use_uva', [True, False])
def test_pinsage_sampling(use_uva):
    """Check PinSAGE / random-walk neighbor samplers on three small graphs.

    With ``use_uva`` each graph is pinned in place (skipped on CPU);
    otherwise it is moved to the default context when that is a GPU.
    """
    if use_uva and F.ctx() == F.cpu():
        pytest.skip('UVA sampling requires a GPU.')

    def _test_sampler(g, sampler, ntype):
        # Sample around seeds 0 and 2 and check the returned graph.
        seeds = F.copy_to(F.tensor([0, 2], dtype=g.idtype), F.ctx())
        neighbor_g = sampler(seeds)
        assert neighbor_g.ntypes == [ntype]
        src, dst = neighbor_g.all_edges(form='uv', order='eid')
        pairs = list(zip(F.asnumpy(src).tolist(), F.asnumpy(dst).tolist()))
        assert (1, 0) in pairs or (0, 0) in pairs
        assert (2, 2) in pairs or (3, 2) in pairs

    def _run_on_device(graph, checks):
        # Pin or move the graph, run ``checks(graph)``, then always unpin.
        if use_uva:
            graph.create_formats_()
            graph.pin_memory_()
        elif F._default_context_str == 'gpu':
            graph = graph.to(F.ctx())
        try:
            checks(graph)
        finally:
            if graph.is_pinned():
                graph.unpin_memory_()

    def _bipartite_checks(graph):
        sampler = dgl.sampling.PinSAGESampler(graph, 'item', 'user', 4, 0.5, 3, 2)
        _test_sampler(graph, sampler, 'item')
        sampler = dgl.sampling.RandomWalkNeighborSampler(
            graph, 4, 0.5, 3, 2, ['bought-by', 'bought'])
        _test_sampler(graph, sampler, 'item')
        sampler = dgl.sampling.RandomWalkNeighborSampler(
            graph, 4, 0.5, 3, 2,
            [('item', 'bought-by', 'user'), ('user', 'bought', 'item')])
        _test_sampler(graph, sampler, 'item')

    def _homogeneous_checks(graph):
        sampler = dgl.sampling.RandomWalkNeighborSampler(graph, 4, 0.5, 3, 2)
        _test_sampler(graph, sampler, graph.ntypes[0])

    def _three_type_checks(graph):
        sampler = dgl.sampling.RandomWalkNeighborSampler(
            graph, 4, 0.5, 3, 2, ['AB', 'BC', 'CA'])
        _test_sampler(graph, sampler, 'A')

    # item <-> user bipartite graph
    _run_on_device(
        dgl.heterograph({
            ('item', 'bought-by', 'user'): ([0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 2, 3, 2, 3]),
            ('user', 'bought', 'item'): ([0, 1, 0, 1, 2, 3, 2, 3], [0, 0, 1, 1, 2, 2, 3, 3])}),
        _bipartite_checks)

    # homogeneous graph
    _run_on_device(
        dgl.graph(([0, 0, 1, 1, 2, 2, 3, 3],
                   [0, 1, 0, 1, 2, 3, 2, 3])),
        _homogeneous_checks)

    # A -> B -> C -> A cycle of node types
    _run_on_device(
        dgl.heterograph({
            ('A', 'AB', 'B'): ([0, 2], [1, 3]),
            ('B', 'BC', 'C'): ([1, 3], [2, 1]),
            ('C', 'CA', 'A'): ([2, 1], [0, 2])}),
        _three_type_checks)
259

260
261
262
263
def _gen_neighbor_sampling_test_graph(hypersparse, reverse):
    """Build the homogeneous and heterogeneous graphs used by the
    neighbor-sampling tests.

    Parameters
    ----------
    hypersparse : bool
        When true, every node type is declared with ``1 << 50`` nodes.
    reverse : bool
        When true, every relation is created with source and destination
        (node types and ID lists) swapped.

    Returns
    -------
    (g, hg) : the 'follow'-only graph and the four-relation graph, both on
        ``F.ctx()``, with 'prob' / 'mask' edge features attached.
    """
    if hypersparse:
        # should crash if allocated a CSR
        huge = 1 << 50
        user_count = huge
        num_nodes_dict = {'user': huge, 'game': huge, 'coin': huge}
    else:
        user_count = 4
        num_nodes_dict = None

    def _relation(src_type, etype, dst_type, src_ids, dst_ids):
        # Arguments are given in the non-reversed orientation.
        if reverse:
            return (dst_type, etype, src_type), (dst_ids, src_ids)
        return (src_type, etype, dst_type), (src_ids, dst_ids)

    def _follow():
        return _relation('user', 'follow', 'user',
                         [1, 2, 3, 0, 2, 3, 0], [0, 0, 0, 1, 1, 1, 2])

    g = dgl.heterograph(dict([_follow()]), {'user': user_count})
    g = g.to(F.ctx())
    g.edata['prob'] = F.tensor([.5, .5, 0., .5, .5, 0., 1.], dtype=F.float32)
    g.edata['mask'] = F.tensor([True, True, False, True, True, False, True])

    hg = dgl.heterograph(dict([
        _follow(),
        _relation('user', 'play', 'game', [0, 0, 1, 3], [0, 1, 2, 2]),
        _relation('game', 'liked-by', 'user',
                  [2, 2, 2, 1, 1, 0], [0, 1, 2, 0, 3, 0]),
        _relation('user', 'flips', 'coin', [0, 1, 2, 3], [0, 0, 0, 0]),
    ]), num_nodes_dict)
    hg = hg.to(F.ctx())

    follow_data = hg.edges['follow'].data
    follow_data['prob'] = F.tensor([.5, .5, 0., .5, .5, 0., 1.], dtype=F.float32)
    follow_data['mask'] = F.tensor([True, True, False, True, True, False, True])
    hg.edges['play'].data['prob'] = F.tensor([.8, .5, .5, .5], dtype=F.float32)
    # Leave out the mask of play and liked-by since all of them are True anyway.
    hg.edges['liked-by'].data['prob'] = F.tensor([.3, .5, .2, .5, .1, .1], dtype=F.float32)

    return g, hg

def _gen_neighbor_topk_test_graph(hypersparse, reverse):
    """Build the homogeneous and heterogeneous graphs used by the
    ``select_topk`` tests.

    Parameters
    ----------
    hypersparse : bool
        Accepted for signature parity with the sampling-graph generator.
        NOTE(review): it has no effect on the graphs built here (the
        original computed a node count from it but never used it) --
        confirm whether node counts were meant to be passed.
    reverse : bool
        When true, every relation is created with source and destination
        (node types and ID lists) swapped.

    Returns
    -------
    (g, hg) : the 'follow'-only graph and the four-relation graph, each
        with a 'weight' edge feature.
    """
    if reverse:
        g = dgl.heterograph({
            ('user', 'follow', 'user'): ([0, 0, 0, 1, 1, 1, 2], [1, 2, 3, 0, 2, 3, 0])
        })
        hg = dgl.heterograph({
            ('user', 'follow', 'user'): ([0, 0, 0, 1, 1, 1, 2],
                                         [1, 2, 3, 0, 2, 3, 0]),
            ('game', 'play', 'user'): ([0, 1, 2, 2], [0, 0, 1, 3]),
            ('user', 'liked-by', 'game'): ([0, 1, 2, 0, 3, 0], [2, 2, 2, 1, 1, 0]),
            ('coin', 'flips', 'user'): ([0, 0, 0, 0], [0, 1, 2, 3])
        })
    else:
        g = dgl.heterograph({
            ('user', 'follow', 'user'): ([1, 2, 3, 0, 2, 3, 0], [0, 0, 0, 1, 1, 1, 2])
        })
        hg = dgl.heterograph({
            ('user', 'follow', 'user'): ([1, 2, 3, 0, 2, 3, 0],
                                         [0, 0, 0, 1, 1, 1, 2]),
            ('user', 'play', 'game'): ([0, 0, 1, 3], [0, 1, 2, 2]),
            ('game', 'liked-by', 'user'): ([2, 2, 2, 1, 1, 0], [0, 1, 2, 0, 3, 0]),
            ('user', 'flips', 'coin'): ([0, 1, 2, 3], [0, 0, 0, 0])
        })

    # Edge order is the same in both orientations, so the weights are shared.
    g.edata['weight'] = F.tensor([.5, .3, 0., -5., 22., 0., 1.], dtype=F.float32)
    hg.edges['follow'].data['weight'] = F.tensor([.5, .3, 0., -5., 22., 0., 1.], dtype=F.float32)
    hg.edges['play'].data['weight'] = F.tensor([.8, .5, .4, .5], dtype=F.float32)
    hg.edges['liked-by'].data['weight'] = F.tensor([.3, .5, .2, .5, .1, .1], dtype=F.float32)
    hg.edges['flips'].data['weight'] = F.tensor([10, 2, 13, -1], dtype=F.float32)
    return g, hg

344
def _test_sample_neighbors(hypersparse, prob):
    """Exercise ``dgl.sampling.sample_neighbors`` along in-edges.

    Parameters
    ----------
    hypersparse : bool
        Forwarded to ``_gen_neighbor_sampling_test_graph``.
    prob : str or None
        Name of the edge feature passed as ``prob`` ('prob' holds float
        weights, any other name is treated as a boolean mask), or ``None``
        to sample without it.
    """
    g, hg = _gen_neighbor_sampling_test_graph(hypersparse, False)

    def _check_homogeneous(p, replace, seeds, num_distinct, excluded):
        # Shared body of the homogeneous checks.
        # ``num_distinct``: edges expected for fanout 2 without replacement.
        # ``excluded``: zero-weight / masked-out edges that must never appear.

        # fanout -1: exactly the in-edges of the seeds that are selectable.
        subg = dgl.sampling.sample_neighbors(g, seeds, -1, prob=p, replace=replace)
        assert subg.number_of_nodes() == g.number_of_nodes()
        u, v = subg.edges()
        u_ans, v_ans, e_ans = g.in_edges(seeds, form='all')
        if p is not None:
            emask = F.gather_row(g.edata[p], e_ans)
            if p == 'prob':
                emask = (emask != 0)
            u_ans = F.boolean_mask(u_ans, emask)
            v_ans = F.boolean_mask(v_ans, emask)
        uv = set(zip(F.asnumpy(u), F.asnumpy(v)))
        uv_ans = set(zip(F.asnumpy(u_ans), F.asnumpy(v_ans)))
        assert uv == uv_ans

        # fanout 2, repeated because sampling is random.
        for _ in range(10):
            subg = dgl.sampling.sample_neighbors(g, seeds, 2, prob=p, replace=replace)
            assert subg.number_of_nodes() == g.number_of_nodes()
            num_edges = 2 * len(seeds) if replace else num_distinct
            assert subg.number_of_edges() == num_edges
            u, v = subg.edges()
            assert set(F.asnumpy(F.unique(v))) == set(seeds)
            assert F.array_equal(
                F.astype(g.has_edges_between(u, v), F.int64),
                F.ones((num_edges,), dtype=F.int64))
            assert F.array_equal(g.edge_ids(u, v), subg.edata[dgl.EID])
            edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
            if not replace:
                # check no duplication
                assert len(edge_set) == num_edges
            if p is not None:
                for edge in excluded:
                    assert edge not in edge_set

    for replace in (True, False):   # w/ then w/o replacement
        _check_homogeneous(prob, replace, [0, 1], 4, [(3, 0), (3, 1)])

    # fanout > #neighbors for node 2
    for replace in (True, False):
        _check_homogeneous(prob, replace, [0, 2], 3, [(3, 0)])

    def _test3(p, replace):
        subg = dgl.sampling.sample_neighbors(
            hg, {'user': [0, 1], 'game': 0}, -1, prob=p, replace=replace)
        assert len(subg.ntypes) == 3
        assert len(subg.etypes) == 4
        # Parenthesised on purpose: ``assert a == b if c else d`` would
        # assert the constant ``d`` in the else branch and check nothing.
        assert subg['follow'].number_of_edges() == (6 if p is None else 4)
        assert subg['play'].number_of_edges() == 1
        assert subg['liked-by'].number_of_edges() == 4
        assert subg['flips'].number_of_edges() == 0

        for _ in range(10):
            subg = dgl.sampling.sample_neighbors(
                hg, {'user': [0, 1], 'game': 0}, 2, prob=p, replace=replace)
            assert len(subg.ntypes) == 3
            assert len(subg.etypes) == 4
            assert subg['follow'].number_of_edges() == 4
            assert subg['play'].number_of_edges() == (2 if replace else 1)
            assert subg['liked-by'].number_of_edges() == (4 if replace else 3)
            assert subg['flips'].number_of_edges() == 0

    _test3(prob, True)   # w/ replacement
    _test3(prob, False)  # w/o replacement

    # test different fanouts for different relations
    for _ in range(10):
        subg = dgl.sampling.sample_neighbors(
            hg,
            {'user': [0, 1], 'game': 0, 'coin': 0},
            {'follow': 1, 'play': 2, 'liked-by': 0, 'flips': -1},
            replace=True)
        assert len(subg.ntypes) == 3
        assert len(subg.etypes) == 4
        assert subg['follow'].number_of_edges() == 2
        assert subg['play'].number_of_edges() == 2
        assert subg['liked-by'].number_of_edges() == 0
        assert subg['flips'].number_of_edges() == 4
447
448
449
450
451

def _test_sample_neighbors_outedge(hypersparse):
    """Exercise ``dgl.sampling.sample_neighbors`` with ``edge_dir='out'``.

    Uses the reversed test graphs, so the expected edges mirror those of
    the in-edge test.  Each check runs uniformly (``prob=None``) and with
    the 'prob' edge feature, with and without replacement.

    Parameters
    ----------
    hypersparse : bool
        Forwarded to ``_gen_neighbor_sampling_test_graph``.
    """
    g, hg = _gen_neighbor_sampling_test_graph(hypersparse, True)

    def _check_homogeneous(p, replace, seeds, num_distinct, excluded):
        # Shared body of the homogeneous checks.
        # ``num_distinct``: edges expected for fanout 2 without replacement.
        # ``excluded``: zero-weight edges that must never be sampled.

        # fanout -1: exactly the out-edges of the seeds that are selectable.
        subg = dgl.sampling.sample_neighbors(
            g, seeds, -1, prob=p, replace=replace, edge_dir='out')
        assert subg.number_of_nodes() == g.number_of_nodes()
        u, v = subg.edges()
        u_ans, v_ans, e_ans = g.out_edges(seeds, form='all')
        if p is not None:
            emask = F.gather_row(g.edata[p], e_ans)
            if p == 'prob':
                emask = (emask != 0)
            u_ans = F.boolean_mask(u_ans, emask)
            v_ans = F.boolean_mask(v_ans, emask)
        uv = set(zip(F.asnumpy(u), F.asnumpy(v)))
        uv_ans = set(zip(F.asnumpy(u_ans), F.asnumpy(v_ans)))
        assert uv == uv_ans

        # fanout 2, repeated because sampling is random.
        for _ in range(10):
            subg = dgl.sampling.sample_neighbors(
                g, seeds, 2, prob=p, replace=replace, edge_dir='out')
            assert subg.number_of_nodes() == g.number_of_nodes()
            num_edges = 2 * len(seeds) if replace else num_distinct
            assert subg.number_of_edges() == num_edges
            u, v = subg.edges()
            assert set(F.asnumpy(F.unique(u))) == set(seeds)
            assert F.array_equal(
                F.astype(g.has_edges_between(u, v), F.int64),
                F.ones((num_edges,), dtype=F.int64))
            assert F.array_equal(g.edge_ids(u, v), subg.edata[dgl.EID])
            edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
            if not replace:
                # check no duplication
                assert len(edge_set) == num_edges
            if p is not None:
                for edge in excluded:
                    assert edge not in edge_set

    # Order of runs: uniform w/ and w/o replacement, then 'prob' w/ and w/o.
    for p in (None, 'prob'):
        for replace in (True, False):
            _check_homogeneous(p, replace, [0, 1], 4, [(0, 3), (1, 3)])

    # fanout > #neighbors for node 2
    for p in (None, 'prob'):
        for replace in (True, False):
            _check_homogeneous(p, replace, [0, 2], 3, [(0, 3)])

    def _test3(p, replace):
        subg = dgl.sampling.sample_neighbors(
            hg, {'user': [0, 1], 'game': 0}, -1, prob=p, replace=replace, edge_dir='out')
        assert len(subg.ntypes) == 3
        assert len(subg.etypes) == 4
        # Parenthesised on purpose: ``assert a == b if c else d`` would
        # assert the constant ``d`` in the else branch and check nothing.
        assert subg['follow'].number_of_edges() == (6 if p is None else 4)
        assert subg['play'].number_of_edges() == 1
        assert subg['liked-by'].number_of_edges() == 4
        assert subg['flips'].number_of_edges() == 0

        for _ in range(10):
            subg = dgl.sampling.sample_neighbors(
                hg, {'user': [0, 1], 'game': 0}, 2, prob=p, replace=replace, edge_dir='out')
            assert len(subg.ntypes) == 3
            assert len(subg.etypes) == 4
            assert subg['follow'].number_of_edges() == 4
            assert subg['play'].number_of_edges() == (2 if replace else 1)
            assert subg['liked-by'].number_of_edges() == (4 if replace else 3)
            assert subg['flips'].number_of_edges() == 0

    _test3(None, True)   # w/ replacement, uniform
    _test3(None, False)  # w/o replacement, uniform
    _test3('prob', True)   # w/ replacement
    _test3('prob', False)  # w/o replacement

def _test_sample_neighbors_topk(hypersparse):
    """Exercise ``dgl.sampling.select_topk`` along in-edges, ranking by the
    'weight' edge feature.

    Parameters
    ----------
    hypersparse : bool
        Forwarded to ``_gen_neighbor_topk_test_graph``.
    """
    g, hg = _gen_neighbor_topk_test_graph(hypersparse, False)

    def _pairs(src, dst):
        # Set of (src, dst) node-ID pairs for two edge endpoint tensors.
        return set(zip(F.asnumpy(src), F.asnumpy(dst)))

    def _check_homogeneous(seeds, expected_top2):
        # k = -1
        picked = dgl.sampling.select_topk(g, -1, 'weight', seeds)
        assert picked.number_of_nodes() == g.number_of_nodes()
        src, dst = picked.edges()
        # NOTE(review): the reference edges come from the sampled graph
        # itself rather than from ``g``, so this only verifies that every
        # selected edge points into a seed -- confirm this is intended.
        src_ref, dst_ref = picked.in_edges(seeds)
        assert _pairs(src, dst) == _pairs(src_ref, dst_ref)

        # k = 2
        picked = dgl.sampling.select_topk(g, 2, 'weight', seeds)
        assert picked.number_of_nodes() == g.number_of_nodes()
        assert picked.number_of_edges() == len(expected_top2)
        src, dst = picked.edges()
        assert F.array_equal(g.edge_ids(src, dst), picked.edata[dgl.EID])
        assert _pairs(src, dst) == expected_top2

    _check_homogeneous([0, 1], {(2, 0), (1, 0), (2, 1), (3, 1)})
    # k > #neighbors for node 2
    _check_homogeneous([0, 2], {(2, 0), (1, 0), (0, 2)})

    def _check_heterogeneous():
        picked = dgl.sampling.select_topk(hg, 2, 'weight', {'user' : [0,1], 'game' : 0})
        assert len(picked.ntypes) == 3
        assert len(picked.etypes) == 4
        expected_by_etype = (
            ('follow', {(2, 0), (1, 0), (2, 1), (3, 1)}),
            ('play', {(0, 0)}),
            ('liked-by', {(2, 0), (2, 1), (1, 0)}),
        )
        for etype, expected in expected_by_etype:
            src, dst = picked[etype].edges()
            assert F.array_equal(
                hg[etype].edge_ids(src, dst), picked[etype].edata[dgl.EID])
            assert _pairs(src, dst) == expected
        assert picked['flips'].number_of_edges() == 0

    _check_heterogeneous()

    # test different k for different relations
    picked = dgl.sampling.select_topk(
        hg, {'follow': 1, 'play': 2, 'liked-by': 0, 'flips': -1}, 'weight',
        {'user' : [0,1], 'game' : 0, 'coin': 0})
    assert len(picked.ntypes) == 3
    assert len(picked.etypes) == 4
    expected_counts = (('follow', 2), ('play', 1), ('liked-by', 0), ('flips', 4))
    for etype, count in expected_counts:
        assert picked[etype].number_of_edges() == count
611
612
613
614
615

def _test_sample_neighbors_topk_outedge(hypersparse):
    """Exercise ``dgl.sampling.select_topk`` with ``edge_dir='out'``.

    ``hypersparse`` is forwarded unchanged to
    ``_gen_neighbor_topk_test_graph``, which supplies the homogeneous graph
    ``g`` and the heterogeneous graph ``hg`` used by the checks below.
    """
    g, hg = _gen_neighbor_topk_test_graph(hypersparse, True)

    def _edge_set(u, v):
        # Collapse an edge list into a set of (src, dst) pairs so that the
        # comparison does not depend on edge order.
        return set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))

    def _check_keeps_all_out_edges(seeds):
        # k == -1: every out-edge of the seed nodes is expected to be kept.
        subg = dgl.sampling.select_topk(g, -1, 'weight', seeds, edge_dir='out')
        assert subg.number_of_nodes() == g.number_of_nodes()
        # The reference answer must come from the input graph ``g``; taking it
        # from ``subg`` itself would let the comparison pass even when edges
        # are missing from the result.
        assert _edge_set(*subg.edges()) == _edge_set(*g.out_edges(seeds))

    def _check_top2(seeds, expected_edges):
        # k == 2: at most two out-edges per seed, chosen by 'weight'.
        subg = dgl.sampling.select_topk(g, 2, 'weight', seeds, edge_dir='out')
        assert subg.number_of_nodes() == g.number_of_nodes()
        assert subg.number_of_edges() == len(expected_edges)
        u, v = subg.edges()
        # The stored edge IDs must point back at the same edges in ``g``.
        assert F.array_equal(g.edge_ids(u, v), subg.edata[dgl.EID])
        assert _edge_set(u, v) == expected_edges

    def _test1():
        _check_keeps_all_out_edges([0, 1])
        _check_top2([0, 1], {(0,2),(0,1),(1,2),(1,3)})
    _test1()

    def _test2():  # k > #neighbors
        _check_keeps_all_out_edges([0, 2])
        _check_top2([0, 2], {(0,2),(0,1),(2,0)})
    _test2()

    def _test3():
        subg = dgl.sampling.select_topk(hg, 2, 'weight', {'user' : [0,1], 'game' : 0}, edge_dir='out')
        assert len(subg.ntypes) == 3
        assert len(subg.etypes) == 4
        expected_per_etype = {
            'follow': {(0,2),(0,1),(1,2),(1,3)},
            'play': {(0,0)},
            'liked-by': {(0,2),(1,2),(0,1)},
        }
        for etype, expected_edges in expected_per_etype.items():
            u, v = subg[etype].edges()
            assert F.array_equal(hg[etype].edge_ids(u, v), subg[etype].edata[dgl.EID])
            assert _edge_set(u, v) == expected_edges
        # No seed node was given for the source type of 'flips'.
        assert subg['flips'].number_of_edges() == 0
    _test3()

670
671
672
673
674
675
def test_sample_neighbors_noprob():
    """Run the shared neighbor-sampling checks with ``None`` as second argument."""
    # The variant with ``True`` as first argument is currently disabled:
    #_test_sample_neighbors(True)
    _test_sample_neighbors(False, None)

def test_sample_neighbors_prob():
    """Run the shared neighbor-sampling checks with ``'prob'`` as second argument."""
    # The variant with ``True`` as first argument is currently disabled:
    #_test_sample_neighbors(True)
    _test_sample_neighbors(False, 'prob')
677
678
679

def test_sample_neighbors_outedge():
    """Run the shared out-edge neighbor-sampling checks."""
    # The variant with ``True`` as argument is currently disabled:
    #_test_sample_neighbors_outedge(True)
    _test_sample_neighbors_outedge(False)
681

682
683
684
685
686
@unittest.skipIf(F.backend_name == 'mxnet', reason='MXNet has problem converting bool arrays')
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors with mask not implemented")
def test_sample_neighbors_mask():
    """Run the shared neighbor-sampling checks with ``'mask'`` as second argument."""
    _test_sample_neighbors(False, 'mask')

687
688
689
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_topk():
    """Run the shared top-k selection checks."""
    # The variant with ``True`` as argument is currently disabled:
    #_test_sample_neighbors_topk(True)
    _test_sample_neighbors_topk(False)
691
692
693
694

@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_topk_outedge():
    """Run the shared out-edge top-k selection checks."""
    # The variant with ``True`` as argument is currently disabled:
    #_test_sample_neighbors_topk_outedge(True)
    _test_sample_neighbors_topk_outedge(False)
696

697
def test_sample_neighbors_with_0deg():
    """Sampling neighbors on a graph without edges must yield no edges.

    Covers both edge directions, each with and without replacement.
    """
    g = dgl.graph(([], []), num_nodes=5).to(F.ctx())
    for edge_dir in ('in', 'out'):
        for with_replacement in (False, True):
            seed_nodes = F.tensor([1, 2], dtype=F.int64)
            sg = dgl.sampling.sample_neighbors(
                g, seed_nodes, 2, edge_dir=edge_dir, replace=with_replacement)
            assert sg.number_of_edges() == 0
707

708
709
710
711
712
713
714
715
716
717
718
719
def create_test_graph(num_nodes, num_edges_per_node, bipartite=False):
    """Build a random graph where every node has the same out-degree.

    Each node ``i`` in ``range(num_nodes)`` is the source of
    ``num_edges_per_node`` edges whose destinations are drawn without
    replacement from ``range(num_nodes)``.

    When ``bipartite`` is true the edges are stored as relation
    ``("u", "e", "v")`` of a heterograph; otherwise a plain ``dgl.graph``
    is returned.
    """
    src_chunks = [np.array([node] * num_edges_per_node) for node in range(num_nodes)]
    src = np.concatenate(src_chunks)
    dst_chunks = [
        np.random.choice(num_nodes, num_edges_per_node, replace=False)
        for _ in range(num_nodes)
    ]
    dst = np.concatenate(dst_chunks)
    if not bipartite:
        return dgl.graph((src, dst))
    return dgl.heterograph({("u", "e", "v") : (src, dst)})

720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
def create_etype_test_graph(num_nodes, num_edges_per_node, rare_cnt):
    """Build a random heterograph with five relations plus edge weights.

    Relations (sources are drawn without replacement from
    ``range(num_nodes)`` for every destination):

    * ``e_major``: ``num_edges_per_node`` in-edges for each of ``num_nodes``
      destinations; ``e_major_rev`` is the same edge list reversed.
    * ``e_minor``: 2 in-edges for each destination; ``e_minor_rev`` is the
      same edge list reversed.
    * ``most_zero``: ``num_edges_per_node`` in-edges for only the first
      ``rare_cnt`` destinations.

    Every relation gets an edge feature ``'p'`` (uniform random values with
    everything above 0.2 zeroed out) and a boolean ``'mask'`` that is true
    exactly where ``'p'`` is non-zero.
    """
    def _random_sources(num_targets, degree):
        # ``degree`` distinct random source ids for each target.
        return np.concatenate([
            np.random.choice(num_nodes, degree, replace=False)
            for _ in range(num_targets)
        ])

    def _repeated_targets(num_targets, degree):
        # Target id t repeated ``degree`` times, for t = 0 .. num_targets - 1.
        return np.concatenate(
            [np.array([target] * degree) for target in range(num_targets)])

    src = _random_sources(num_nodes, num_edges_per_node)
    dst = _repeated_targets(num_nodes, num_edges_per_node)

    minor_src = _random_sources(num_nodes, 2)
    minor_dst = _repeated_targets(num_nodes, 2)

    most_zero_src = _random_sources(rare_cnt, num_edges_per_node)
    most_zero_dst = _repeated_targets(rare_cnt, num_edges_per_node)

    relations = {
        ("v", "e_major", "u") : (src, dst),
        ("u", "e_major_rev", "v") : (dst, src),
        ("v2", "e_minor", "u") : (minor_src, minor_dst),
        ("v2", "most_zero", "u") : (most_zero_src, most_zero_dst),
        ("u", "e_minor_rev", "v2") : (minor_dst, minor_src),
    }
    g = dgl.heterograph(relations)

    for etype in g.etypes:
        edge_data = g.edges[etype].data
        prob = np.random.rand(g.num_edges(etype))
        # Zero out everything above 0.2 so that most edges are unsampleable.
        prob[prob > 0.2] = 0
        edge_data['p'] = F.zerocopy_from_numpy(prob)
        edge_data['mask'] = F.zerocopy_from_numpy(prob != 0)

    return g

753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_biased_homogeneous():
    """Check tag-biased neighbor sampling on a homogeneous graph.

    Nodes carry a random tag in ``range(4)`` with biases ``[0, 0.1, 10, 10]``.
    Sampled neighbors must never have tag 0, and tag 1 must be clearly rarer
    than tags 2 and 3. Sampling without replacement must not repeat an edge.
    """
    g = create_test_graph(100, 30)

    def check_num(nodes, tag):
        nodes, tag = F.asnumpy(nodes), F.asnumpy(tag)
        sampled_tags = tag[nodes]
        cnt = [sum(sampled_tags == t) for t in range(4)]
        # No tag 0
        assert cnt[0] == 0

        # very rare tag 1
        assert cnt[2] > 2 * cnt[1]
        assert cnt[3] > 2 * cnt[1]

    def assert_edges_unique(subg):
        u, v = subg.edges()
        pairs = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
        assert len(pairs) == subg.number_of_edges()

    def run_rounds(g_sorted, endpoint, **direction_kwargs):
        # ``endpoint`` picks which side of the sampled edges holds the
        # neighbors: 0 for sources, 1 for destinations.
        # without replacement
        for _ in range(5):
            subg = dgl.sampling.sample_neighbors_biased(
                g_sorted, g.nodes(), 5, bias, replace=False, **direction_kwargs)
            check_num(subg.edges()[endpoint], tag)
            assert_edges_unique(subg)
        # with replacement
        for _ in range(5):
            subg = dgl.sampling.sample_neighbors_biased(
                g_sorted, g.nodes(), 5, bias, replace=True, **direction_kwargs)
            check_num(subg.edges()[endpoint], tag)

    tag = F.tensor(np.random.choice(4, 100))
    bias = F.tensor([0, 0.1, 10, 10], dtype=F.float32)

    # inedge (no explicit edge_dir is passed)
    run_rounds(dgl.sort_csc_by_tag(g, tag), 0)

    # outedge
    run_rounds(dgl.sort_csr_by_tag(g, tag), 1, edge_dir='out')

@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_sample_neighbors_biased_bipartite():
    """Check tag-biased neighbor sampling on a bipartite graph.

    Same expectations as the homogeneous variant, with biases
    ``[0, 0.01, 10, 10]``: tag 0 never appears among sampled neighbors, tag 1
    is clearly rarer than tags 2 and 3, and sampling without replacement
    never repeats an edge.
    """
    g = create_test_graph(100, 30, True)
    num_dst = g.number_of_dst_nodes()
    bias = F.tensor([0, 0.01, 10, 10], dtype=F.float32)

    def check_num(nodes, tag):
        nodes, tag = F.asnumpy(nodes), F.asnumpy(tag)
        sampled_tags = tag[nodes]
        cnt = [sum(sampled_tags == t) for t in range(4)]
        # No tag 0
        assert cnt[0] == 0

        # very rare tag 1
        assert cnt[2] > 2 * cnt[1]
        assert cnt[3] > 2 * cnt[1]

    def assert_edges_unique(subg):
        u, v = subg.edges()
        pairs = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
        assert len(pairs) == subg.number_of_edges()

    def run_rounds(g_sorted, seeds, tag, endpoint, **direction_kwargs):
        # ``endpoint`` picks which side of the sampled edges holds the
        # neighbors: 0 for sources, 1 for destinations.
        # without replacement
        for _ in range(5):
            subg = dgl.sampling.sample_neighbors_biased(
                g_sorted, seeds, 5, bias, replace=False, **direction_kwargs)
            check_num(subg.edges()[endpoint], tag)
            assert_edges_unique(subg)
        # with replacement
        for _ in range(5):
            subg = dgl.sampling.sample_neighbors_biased(
                g_sorted, seeds, 5, bias, replace=True, **direction_kwargs)
            check_num(subg.edges()[endpoint], tag)

    # inedge (no explicit edge_dir is passed); one tag per source-side node
    tag = F.tensor(np.random.choice(4, 100))
    run_rounds(dgl.sort_csc_by_tag(g, tag), g.dstnodes(), tag, 0)

    # outedge; one tag per destination-side node
    tag = F.tensor(np.random.choice(4, num_dst))
    run_rounds(dgl.sort_csr_by_tag(g, tag), g.srcnodes(), tag, 1, edge_dir='out')

842
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
@unittest.skipIf(F.backend_name == 'mxnet', reason='MXNet has problem converting bool arrays')
@pytest.mark.parametrize('format_', ['coo', 'csr', 'csc'])
@pytest.mark.parametrize('direction', ['in', 'out'])
@pytest.mark.parametrize('replace', [False, True])
def test_sample_neighbors_etype_homogeneous(format_, direction, replace):
    """Check per-edge-type fanouts of ``dgl.sampling.sample_etype_neighbors``.

    A random heterograph is flattened with ``dgl.to_homogeneous`` and sampled
    with one fanout per edge type, three ways: uniformly, with the ``'p'``
    edge weights, and with the boolean ``'mask'`` edge feature. The weighted
    and masked results are validated against the subgraph that keeps only
    edges whose mask is true.

    Parameters
    ----------
    format_ : str
        Sparse format the homogeneous graph is restricted to before sampling.
    direction : str
        ``'in'`` or ``'out'``; passed as ``edge_dir`` to the sampler.
    replace : bool
        Whether to sample with replacement.
    """
    num_nodes = 100
    rare_cnt = 4
    g = create_etype_test_graph(num_nodes, 30, rare_cnt)
    h_g = dgl.to_homogeneous(g, edata=['p', 'mask'])
    h_g_etype = F.asnumpy(h_g.edata[dgl.ETYPE])
    # Prefix sums of the per-type edge counts: [0, n_0, n_0 + n_1, ...].
    h_g_offset = np.cumsum(np.insert(np.bincount(h_g_etype), 0, 0)).tolist()
    # Reference graph for weighted / masked sampling: only unmasked edges.
    h_sg = h_g.edge_subgraph(h_g.edata['mask'], relabel_nodes=False)

    seed_ntype = g.get_ntype_id("u")
    seeds = F.nonzero_1d(h_g.ndata[dgl.NTYPE] == seed_ntype)
    fanouts = F.tensor([6, 5, 4, 3, 2], dtype=F.int64)

    def check_num(h_g, all_src, all_dst, subg, replace, fanouts, direction):
        """Validate the sampled ``subg`` against reference graph ``h_g``.

        ``all_src`` / ``all_dst`` are the endpoints of every edge of ``h_g``.
        """
        src, dst = subg.edges()
        all_etype_array = F.asnumpy(h_g.edata[dgl.ETYPE])
        etype_array = F.asnumpy(subg.edata[dgl.ETYPE])
        src, dst = F.asnumpy(src), F.asnumpy(dst)
        fanouts = F.asnumpy(fanouts)
        all_src, all_dst = F.asnumpy(all_src), F.asnumpy(all_dst)

        # 'in' sampling seeds on destinations and draws sources; 'out' is the
        # mirror image. Naming the two roles lets one code path serve both.
        if direction == 'in':
            seed, nbr, all_seed, all_nbr = dst, src, all_dst, all_src
        else:
            seed, nbr, all_seed, all_nbr = src, dst, all_src, all_dst

        if replace:
            # With replacement, every seed that has at least one edge of a
            # type in the reference graph must receive exactly ``fanout``
            # sampled edges of that type; every other node receives none.
            for etype, fanout in enumerate(fanouts):
                degree = np.bincount(seed[etype_array == etype])
                ans = np.zeros_like(degree)
                if len(degree) > 0:
                    ans[all_seed[all_etype_array == etype]] = fanout
                assert np.all(degree == ans)
        else:
            # Without replacement, each (seed, etype) pair holds either
            # exactly ``fanout`` distinct neighbors or all available ones.
            for node in set(seed):
                picked_nbr = nbr[seed == node]
                picked_et = etype_array[seed == node]
                avail_nbr = all_nbr[all_seed == node]
                avail_et = all_etype_array[all_seed == node]
                for etype in set(picked_et):
                    nbr_etype = set(picked_nbr[picked_et == etype])
                    all_nbr_etype = set(avail_nbr[avail_et == etype])
                    assert (len(nbr_etype) == fanouts[etype]) or (nbr_etype == all_nbr_etype)

    all_src, all_dst = h_g.edges()
    all_sub_src, all_sub_dst = h_sg.edges()
    h_g = h_g.formats(format_)
    # For these (direction, format) pairs the graph is opened up to all
    # formats again before sampling.
    if (direction, format_) in [('in', 'csr'), ('out', 'csc')]:
        h_g = h_g.formats(['csc', 'csr', 'coo'])

    # Per-etype weight / mask lists do not change between rounds.
    prob_per_etype = [g.edges[etype].data['p'] for etype in g.etypes]
    mask_per_etype = [g.edges[etype].data['mask'] for etype in g.etypes]

    for _ in range(5):
        subg = dgl.sampling.sample_etype_neighbors(
            h_g, seeds, h_g_offset, fanouts, replace=replace,
            edge_dir=direction)
        check_num(h_g, all_src, all_dst, subg, replace, fanouts, direction)

        subg = dgl.sampling.sample_etype_neighbors(
            h_g, seeds, h_g_offset, fanouts, replace=replace,
            edge_dir=direction, prob=prob_per_etype)
        check_num(h_sg, all_sub_src, all_sub_dst, subg, replace, fanouts, direction)

        subg = dgl.sampling.sample_etype_neighbors(
            h_g, seeds, h_g_offset, fanouts, replace=replace,
            edge_dir=direction, prob=mask_per_etype)
        check_num(h_sg, all_sub_src, all_sub_dst, subg, replace, fanouts, direction)

949
950

@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
@unittest.skipIf(F.backend_name == 'mxnet', reason='MXNet has problem converting bool arrays')
@pytest.mark.parametrize('format_', ['csr', 'csc'])
@pytest.mark.parametrize('direction', ['in', 'out'])
def test_sample_neighbors_etype_sorted_homogeneous(format_, direction):
    """Smoke-test ``sample_etype_neighbors`` with ``etype_sorted=True``.

    The homogeneous graph is first sorted by edge type (CSC for ``'in'``,
    CSR for ``'out'``). The test only requires the sampling call to complete;
    the returned graph is not inspected.
    """
    g = create_etype_test_graph(100, 30, 4)
    h_g = dgl.to_homogeneous(g)
    u_type_id = g.get_ntype_id("u")
    seeds = F.nonzero_1d(h_g.ndata[dgl.NTYPE] == u_type_id)
    fanouts = F.tensor([6, 5, -1, 3, 2], dtype=F.int64)

    h_g = h_g.formats(format_)
    # For these (direction, format) pairs the graph is opened up to all
    # formats again before sorting and sampling.
    mismatched = (
        (direction == 'in' and format_ == 'csr')
        or (direction == 'out' and format_ == 'csc')
    )
    if mismatched:
        h_g = h_g.formats(['csc', 'csr', 'coo'])

    sort_by_tag = dgl.sort_csc_by_tag if direction == 'in' else dgl.sort_csr_by_tag
    h_g = sort_by_tag(h_g, h_g.edata[dgl.ETYPE], tag_type='edge')

    # Prefix sums of the per-type edge counts: [0, n_0, n_0 + n_1, ...].
    edges_per_etype = np.bincount(F.asnumpy(h_g.edata[dgl.ETYPE]))
    h_g_offset = np.cumsum(np.insert(edges_per_etype, 0, 0)).tolist()
    dgl.sampling.sample_etype_neighbors(
        h_g, seeds, h_g_offset, fanouts, edge_dir=direction, etype_sorted=True)
974

975
976
977
978
979
980
981
982
983
984
985
986
987
@pytest.mark.parametrize('dtype', ['int32', 'int64'])
def test_sample_neighbors_exclude_edges_heteroG(dtype):
    """Edges passed as ``exclude_edges`` must not be sampled (heterograph).

    For each of three relations a random contiguous slice of edge IDs is
    excluded and a random slice of destination nodes is used as seeds. After
    sampling, no excluded (source, destination) pair may be connected in the
    result for its relation.
    """
    (U, V, EID) = (0, 1, 2)

    def _random_edges():
        # Unique random sources in [0, 300), random destinations in [0, 25).
        u = F.zerocopy_from_numpy(np.unique(np.random.randint(300, size=100, dtype=dtype)))
        v = F.zerocopy_from_numpy(np.random.randint(25, size=u.shape, dtype=dtype))
        return u, v

    def _random_window():
        # Slice bounds: begin in [1, 24), end in [25, 49).
        begin = np.random.randint(low=1, high=24, dtype=dtype)
        end = np.random.randint(low=25, high=49, dtype=dtype)
        return begin, end

    relations = [
        ('drug', 'interacts', 'drug'),
        ('drug', 'interacts', 'gene'),
        ('drug', 'treats', 'disease'),
    ]
    g = dgl.heterograph({
        relations[0]: _random_edges(),
        relations[1]: _random_edges(),
        relations[2]: _random_edges()
    }).to(F.ctx())

    sampled_amount = None
    seed_nodes = {}
    excluded_edges = {}
    excluded_endpoints = {}
    for etype in relations:
        seed_begin, seed_end = _random_window()
        excl_begin, excl_end = _random_window()
        if sampled_amount is None:
            # Drawn once, right after the first relation's windows.
            sampled_amount = np.random.randint(low=1, high=10, dtype=dtype)
        edges = g.all_edges(form='all', etype=etype)
        excluded_edges[etype] = edges[EID][excl_begin:excl_end]
        excluded_endpoints[etype] = (
            edges[U][excl_begin:excl_end], edges[V][excl_begin:excl_end])
        # Seeds are keyed by the relation's destination node type.
        seed_nodes[etype[2]] = edges[V][seed_begin:seed_end]

    sg = dgl.sampling.sample_neighbors(
        g, seed_nodes, sampled_amount, exclude_edges=excluded_edges)

    for etype in relations:
        excl_u, excl_v = excluded_endpoints[etype]
        assert not np.any(F.asnumpy(sg.has_edges_between(excl_u, excl_v, etype=etype)))

@pytest.mark.parametrize('dtype', ['int32', 'int64'])
def test_sample_neighbors_exclude_edges_homoG(dtype):
    """Edges passed as ``exclude_edges`` must not be sampled (homogeneous).

    A random contiguous slice of edge IDs is excluded and a random slice of
    destination nodes is used as seeds. After sampling, no excluded
    (source, destination) pair may be connected in the result.
    """
    (U, V, EID) = (0, 1, 2)

    # Unique random sources in [0, 300), random destinations in [0, 25).
    u_nodes = F.zerocopy_from_numpy(np.unique(np.random.randint(300,size=100, dtype=dtype)))
    v_nodes = F.zerocopy_from_numpy(np.random.randint(25, size=u_nodes.shape, dtype=dtype))
    g = dgl.graph((u_nodes, v_nodes)).to(F.ctx())

    # Slice bounds for the seed nodes, then for the excluded edges.
    seed_begin = np.random.randint(low=1,high=24, dtype=dtype)
    seed_end = np.random.randint(low=25,high=49, dtype=dtype)
    excl_begin = np.random.randint(low=1,high=24, dtype=dtype)
    excl_end = np.random.randint(low=25,high=49, dtype=dtype)
    sampled_amount = np.random.randint(low=1,high=10, dtype=dtype)

    g_edges = g.all_edges(form='all')
    seed_window = slice(seed_begin, seed_end)
    excl_window = slice(excl_begin, excl_end)
    excluded_edges = g_edges[EID][excl_window]
    sampled_node = g_edges[V][seed_window]
    excluded_nodes_U = g_edges[U][excl_window]
    excluded_nodes_V = g_edges[V][excl_window]

    sg = dgl.sampling.sample_neighbors(
        g, sampled_node, sampled_amount, exclude_edges=excluded_edges)

    still_connected = F.asnumpy(sg.has_edges_between(excluded_nodes_U, excluded_nodes_V))
    assert not np.any(still_connected)

1065
1066
@pytest.mark.parametrize('dtype', ['int32', 'int64'])
def test_global_uniform_negative_sampling(dtype):
    """Check ``dgl.sampling.global_uniform_negative_sampling`` on several graphs.

    NOTE(review): ``dtype`` is not referenced anywhere in the body, so both
    parametrizations currently run the same code -- confirm whether the
    graphs were meant to be built with this index type.
    """
    negative_sampling = dgl.sampling.global_uniform_negative_sampling

    # Edgeless graph: the requested 2000 pairs must all be returned.
    g = dgl.graph(([], []), num_nodes=1000).to(F.ctx())
    neg_src, neg_dst = negative_sampling(g, 2000, False, True)
    assert len(neg_src) == 2000
    assert len(neg_dst) == 2000

    # Random graph: sampled pairs must never be existing edges.
    g = dgl.graph((np.random.randint(0, 20, (300,)), np.random.randint(0, 20, (300,)))).to(F.ctx())
    neg_src, neg_dst = negative_sampling(g, 20, False, True)
    assert not F.asnumpy(g.has_edges_between(neg_src, neg_dst)).any()

    # Same graph with the fourth argument False: pairs must also be distinct.
    neg_src, neg_dst = negative_sampling(g, 20, False, False)
    assert not F.asnumpy(g.has_edges_between(neg_src, neg_dst)).any()
    neg_src = F.asnumpy(neg_src)
    neg_dst = F.asnumpy(neg_dst)
    distinct_pairs = set(zip(neg_src.tolist(), neg_dst.tolist()))
    assert len(distinct_pairs) == len(neg_src)

    # Two-node graph with the single edge 0 -> 1 and the third argument True.
    g = dgl.graph(([0], [1])).to(F.ctx())
    neg_src, neg_dst = negative_sampling(g, 20, True, False, redundancy=10)
    neg_src = F.asnumpy(neg_src)
    neg_dst = F.asnumpy(neg_dst)
    # should have either no element or (1, 0)
    assert len(neg_src) < 2
    assert len(neg_dst) < 2
    if len(neg_src) == 1:
        assert neg_src[0] == 1
        assert neg_dst[0] == 0

    # Heterograph: sample for relation 'AB' only.
    g = dgl.heterograph({
        ('A', 'AB', 'B'): (np.random.randint(0, 20, (300,)), np.random.randint(0, 40, (300,))),
        ('B', 'BA', 'A'): (np.random.randint(0, 40, (200,)), np.random.randint(0, 20, (200,)))}).to(F.ctx())
    neg_src, neg_dst = negative_sampling(g, 20, False, etype='AB')
    assert not F.asnumpy(g.has_edges_between(neg_src, neg_dst, etype='AB')).any()

1100

1101
if __name__ == '__main__':
    # Run the tests directly, without pytest. Parametrized tests are called
    # with explicit arguments; the nested loops visit the combinations with
    # the last parameter varying fastest.
    test_sample_neighbors_noprob()
    test_sample_neighbors_prob()
    test_sample_neighbors_mask()

    for format_ in ['coo', 'csr', 'csc']:
        for direction in ['in', 'out']:
            for replace in [False, True]:
                test_sample_neighbors_etype_homogeneous(format_, direction, replace)

    for format_ in ['csr', 'csc']:
        for direction in ['in', 'out']:
            test_sample_neighbors_etype_sorted_homogeneous(format_, direction)

    # Random-walk based samplers.
    test_non_uniform_random_walk(False)
    test_uniform_random_walk(False)
    test_pack_traces()
    test_pinsage_sampling(False)

    # Remaining neighbor-sampling variants.
    test_sample_neighbors_outedge()
    test_sample_neighbors_topk()
    test_sample_neighbors_topk_outedge()
    test_sample_neighbors_with_0deg()
    test_sample_neighbors_biased_homogeneous()
    test_sample_neighbors_biased_bipartite()
    test_sample_neighbors_exclude_edges_heteroG('int32')
    test_sample_neighbors_exclude_edges_homoG('int32')

    for dtype in ['int32', 'int64']:
        test_global_uniform_negative_sampling(dtype)