test_sampler.py 13.6 KB
Newer Older
1
import backend as F
Da Zheng's avatar
Da Zheng committed
2
3
4
5
import numpy as np
import scipy as sp
import dgl
from dgl import utils
VoVAllen's avatar
VoVAllen committed
6
import unittest
7
from numpy.testing import assert_array_equal
Da Zheng's avatar
Da Zheng committed
8

9
10
np.random.seed(42)

Da Zheng's avatar
Da Zheng committed
11
12
13
14
def generate_rand_graph(n):
    arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64)
    return dgl.DGLGraph(arr, readonly=True)

15
16
17
def test_create_full():
    g = generate_rand_graph(100)
    full_nf = dgl.contrib.sampling.sampler.create_full_nodeflow(g, 5)
Da Zheng's avatar
Da Zheng committed
18
    assert full_nf.number_of_nodes() == g.number_of_nodes() * 6
19
20
    assert full_nf.number_of_edges() == g.number_of_edges() * 5

Da Zheng's avatar
Da Zheng committed
21
22
23
def test_1neighbor_sampler_all():
    g = generate_rand_graph(100)
    # In this case, NeighborSampling simply gets the neighborhood of a single vertex.
24
    for i, subg in enumerate(dgl.contrib.sampling.NeighborSampler(
Da Zheng's avatar
Da Zheng committed
25
            g, 1, g.number_of_nodes(), neighbor_type='in', num_workers=4)):
26
        seed_ids = subg.layer_parent_nid(-1)
Da Zheng's avatar
Da Zheng committed
27
        assert len(seed_ids) == 1
28
        src, dst, eid = g.in_edges(seed_ids, form='all')
Da Zheng's avatar
Da Zheng committed
29
30
        assert subg.number_of_nodes() == len(src) + 1
        assert subg.number_of_edges() == len(src)
Da Zheng's avatar
Da Zheng committed
31

Da Zheng's avatar
Da Zheng committed
32
33
34
        assert seed_ids == subg.layer_parent_nid(-1)
        child_src, child_dst, child_eid = subg.in_edges(subg.layer_nid(-1), form='all')
        assert F.array_equal(child_src, subg.layer_nid(0))
Da Zheng's avatar
Da Zheng committed
35

Da Zheng's avatar
Da Zheng committed
36
37
        src1 = subg.map_to_parent_nid(child_src)
        assert F.array_equal(src1, src)
Da Zheng's avatar
Da Zheng committed
38
39

def is_sorted(arr):
40
    return np.sum(np.sort(arr) == arr, 0) == len(arr)
Da Zheng's avatar
Da Zheng committed
41
42

def verify_subgraph(g, subg, seed_id):
Da Zheng's avatar
Da Zheng committed
43
44
45
46
    seed_id = F.asnumpy(seed_id)
    seeds = F.asnumpy(subg.map_to_parent_nid(subg.layer_nid(-1)))
    assert seed_id in seeds
    child_seed = F.asnumpy(subg.layer_nid(-1))[seeds == seed_id]
47
    src, dst, eid = g.in_edges(seed_id, form='all')
Da Zheng's avatar
Da Zheng committed
48
49
    child_src, child_dst, child_eid = subg.in_edges(child_seed, form='all')

50
    child_src = F.asnumpy(child_src)
Da Zheng's avatar
Da Zheng committed
51
52
53
54
55
    # We don't allow duplicate elements in the neighbor list.
    assert(len(np.unique(child_src)) == len(child_src))
    # The neighbor list also needs to be sorted.
    assert(is_sorted(child_src))

Da Zheng's avatar
Da Zheng committed
56
    # a neighbor in the subgraph must also exist in parent graph.
57
    src = F.asnumpy(src)
Da Zheng's avatar
Da Zheng committed
58
    for i in subg.map_to_parent_nid(child_src):
59
        assert F.asnumpy(i) in src
Da Zheng's avatar
Da Zheng committed
60
61
62
63

def test_1neighbor_sampler():
    g = generate_rand_graph(100)
    # In this case, NeighborSampling simply gets the neighborhood of a single vertex.
64
65
66
    for subg in dgl.contrib.sampling.NeighborSampler(g, 1, 5, neighbor_type='in',
                                                     num_workers=4):
        seed_ids = subg.layer_parent_nid(-1)
Da Zheng's avatar
Da Zheng committed
67
68
69
70
71
        assert len(seed_ids) == 1
        assert subg.number_of_nodes() <= 6
        assert subg.number_of_edges() <= 5
        verify_subgraph(g, subg, seed_ids)

72
73
74
def test_prefetch_neighbor_sampler():
    g = generate_rand_graph(100)
    # In this case, NeighborSampling simply gets the neighborhood of a single vertex.
75
76
77
    for subg in dgl.contrib.sampling.NeighborSampler(g, 1, 5, neighbor_type='in',
                                                     num_workers=4, prefetch=True):
        seed_ids = subg.layer_parent_nid(-1)
78
79
80
81
82
        assert len(seed_ids) == 1
        assert subg.number_of_nodes() <= 6
        assert subg.number_of_edges() <= 5
        verify_subgraph(g, subg, seed_ids)

Da Zheng's avatar
Da Zheng committed
83
84
85
def test_10neighbor_sampler_all():
    g = generate_rand_graph(100)
    # In this case, NeighborSampling simply gets the neighborhood of a single vertex.
Da Zheng's avatar
Da Zheng committed
86
87
    for subg in dgl.contrib.sampling.NeighborSampler(g, 10, g.number_of_nodes(),
                                                     neighbor_type='in', num_workers=4):
88
        seed_ids = subg.layer_parent_nid(-1)
Da Zheng's avatar
Da Zheng committed
89
        assert F.array_equal(seed_ids, subg.map_to_parent_nid(subg.layer_nid(-1)))
Da Zheng's avatar
Da Zheng committed
90

Da Zheng's avatar
Da Zheng committed
91
92
93
94
        src, dst, eid = g.in_edges(seed_ids, form='all')
        child_src, child_dst, child_eid = subg.in_edges(subg.layer_nid(-1), form='all')
        src1 = subg.map_to_parent_nid(child_src)
        assert F.array_equal(src1, src)
Da Zheng's avatar
Da Zheng committed
95
96
97

def check_10neighbor_sampler(g, seeds):
    # In this case, NeighborSampling simply gets the neighborhood of a single vertex.
98
99
100
    for subg in dgl.contrib.sampling.NeighborSampler(g, 10, 5, neighbor_type='in',
                                                     num_workers=4, seed_nodes=seeds):
        seed_ids = subg.layer_parent_nid(-1)
Da Zheng's avatar
Da Zheng committed
101
102
103
104
105
106
107
108
109
110
111
        assert subg.number_of_nodes() <= 6 * len(seed_ids)
        assert subg.number_of_edges() <= 5 * len(seed_ids)
        for seed_id in seed_ids:
            verify_subgraph(g, subg, seed_id)

def test_10neighbor_sampler():
    g = generate_rand_graph(100)
    check_10neighbor_sampler(g, None)
    check_10neighbor_sampler(g, seeds=np.unique(np.random.randint(0, g.number_of_nodes(),
                                                                  size=int(g.number_of_nodes() / 10))))

112
def _test_layer_sampler(prefetch=False):
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
    g = generate_rand_graph(100)
    nid = g.nodes()
    src, dst, eid = g.all_edges(form='all', order='eid')
    n_batches = 5
    batch_size = 50
    seed_batches = [np.sort(np.random.choice(F.asnumpy(nid), batch_size, replace=False))
                    for i in range(n_batches)]
    seed_nodes = np.hstack(seed_batches)
    layer_sizes = [50] * 3
    LayerSampler = getattr(dgl.contrib.sampling, 'LayerSampler')
    sampler = LayerSampler(g, batch_size, layer_sizes, 'in',
                           seed_nodes=seed_nodes, num_workers=4, prefetch=prefetch)
    for sub_g in sampler:
        assert all(sub_g.layer_size(i) < size for i, size in enumerate(layer_sizes))
        sub_nid = F.arange(0, sub_g.number_of_nodes())
        assert all(np.all(np.isin(F.asnumpy(sub_g.layer_nid(i)), F.asnumpy(sub_nid)))
                   for i in range(sub_g.num_layers))
        assert np.all(np.isin(F.asnumpy(sub_g.map_to_parent_nid(sub_nid)),
                              F.asnumpy(nid)))
        sub_eid = F.arange(0, sub_g.number_of_edges())
        assert np.all(np.isin(F.asnumpy(sub_g.map_to_parent_eid(sub_eid)),
                              F.asnumpy(eid)))
        assert any(np.all(np.sort(F.asnumpy(sub_g.layer_parent_nid(-1))) == seed_batch)
                   for seed_batch in seed_batches)

        sub_src, sub_dst = sub_g.all_edges(order='eid')
        for i in range(sub_g.num_blocks):
            block_eid = sub_g.block_eid(i)
VoVAllen's avatar
VoVAllen committed
141
142
            block_src = sub_g.map_to_parent_nid(F.gather_row(sub_src, block_eid))
            block_dst = sub_g.map_to_parent_nid(F.gather_row(sub_dst, block_eid))
143
144

            block_parent_eid = sub_g.block_parent_eid(i)
VoVAllen's avatar
VoVAllen committed
145
146
            block_parent_src = F.gather_row(src, block_parent_eid)
            block_parent_dst = F.gather_row(dst, block_parent_eid)
147
148
149
150
151
152
153
154
155
156

            assert np.all(F.asnumpy(block_src == block_parent_src))

        n_layers = sub_g.num_layers
        sub_n = sub_g.number_of_nodes()
        assert sum(F.shape(sub_g.layer_nid(i))[0] for i in range(n_layers)) == sub_n
        n_blocks = sub_g.num_blocks
        sub_m = sub_g.number_of_edges()
        assert sum(F.shape(sub_g.block_eid(i))[0] for i in range(n_blocks)) == sub_m

Da Zheng's avatar
Da Zheng committed
157
158
159
160
def test_layer_sampler():
    _test_layer_sampler()
    _test_layer_sampler(prefetch=True)

161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def test_nonuniform_neighbor_sampler():
    # Construct a graph with
    # (1) A path (0, 1, ..., 99) with weight 1
    # (2) A bunch of random edges with weight 0.
    edges = []
    for i in range(99):
        edges.append((i, i + 1))
    for i in range(1000):
        edge = (np.random.randint(100), np.random.randint(100))
        if edge not in edges:
            edges.append(edge)
    src, dst = zip(*edges)
    g = dgl.DGLGraph()
    g.add_nodes(100)
    g.add_edges(src, dst)
    g.readonly()

    g.edata['w'] = F.cat([
        F.ones((99,), F.float64, F.cpu()),
        F.zeros((len(edges) - 99,), F.float64, F.cpu())], 0)

    # Test 1-neighbor NodeFlow with 99 as target node.
    # The generated NodeFlow should only contain node i on layer i.
    sampler = dgl.contrib.sampling.NeighborSampler(
        g, 1, 1, 99, 'in', transition_prob='w', seed_nodes=[99])
    nf = next(iter(sampler))

    assert nf.num_layers == 100
    for i in range(nf.num_layers):
        assert nf.layer_size(i) == 1
191
        assert F.asnumpy(nf.layer_parent_nid(i)[0]) == i
192
193
194
195
196
197
198
199
200

    # Test the reverse direction
    sampler = dgl.contrib.sampling.NeighborSampler(
        g, 1, 1, 99, 'out', transition_prob='w', seed_nodes=[0])
    nf = next(iter(sampler))

    assert nf.num_layers == 100
    for i in range(nf.num_layers):
        assert nf.layer_size(i) == 1
201
        assert F.asnumpy(nf.layer_parent_nid(i)[0]) == 99 - i
202

203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def test_setseed():
    g = generate_rand_graph(100)

    nids = []

    dgl.random.seed(42)
    for subg in dgl.contrib.sampling.NeighborSampler(
            g, 5, 3, num_hops=2, neighbor_type='in', num_workers=1):
        nids.append(
            tuple(tuple(F.asnumpy(subg.layer_parent_nid(i))) for i in range(3)))

    # reinitialize
    dgl.random.seed(42)
    for i, subg in enumerate(dgl.contrib.sampling.NeighborSampler(
            g, 5, 3, num_hops=2, neighbor_type='in', num_workers=1)):
        item = tuple(tuple(F.asnumpy(subg.layer_parent_nid(i))) for i in range(3))
        assert item == nids[i]

    for i, subg in enumerate(dgl.contrib.sampling.NeighborSampler(
            g, 5, 3, num_hops=2, neighbor_type='in', num_workers=4)):
        pass

225
226
227
228
229
230
231
232
233
234
235
236
237
def check_head_tail(g):
    lsrc, ldst, leid = g.all_edges(form='all', order='eid')

    lsrc = np.unique(F.asnumpy(lsrc))
    head_nid = np.unique(F.asnumpy(g.head_nid))
    assert len(head_nid) == len(g.head_nid)
    np.testing.assert_equal(lsrc, head_nid)

    ldst = np.unique(F.asnumpy(ldst))
    tail_nid = np.unique(F.asnumpy(g.tail_nid))
    assert len(tail_nid) == len(g.tail_nid)
    np.testing.assert_equal(tail_nid, ldst)

Da Zheng's avatar
Da Zheng committed
238
def check_negative_sampler(mode, exclude_positive, neg_size):
239
    g = generate_rand_graph(100)
240
    etype = np.random.randint(0, 10, size=g.number_of_edges(), dtype=np.int64)
VoVAllen's avatar
VoVAllen committed
241
    g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())
242
243
244
245
246
247
248
249

    pos_gsrc, pos_gdst, pos_geid = g.all_edges(form='all', order='eid')
    pos_map = {}
    for i in range(len(pos_geid)):
        pos_d = int(F.asnumpy(pos_gdst[i]))
        pos_e = int(F.asnumpy(pos_geid[i]))
        pos_map[(pos_d, pos_e)] = int(F.asnumpy(pos_gsrc[i]))

250
    EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
251
    # Test the homogeneous graph.
252
    for pos_edges, neg_edges in EdgeSampler(g, 50,
253
254
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
Da Zheng's avatar
Da Zheng committed
255
256
                                            exclude_positive=exclude_positive,
                                            return_false_neg=True):
257
        pos_lsrc, pos_ldst, pos_leid = pos_edges.all_edges(form='all', order='eid')
VoVAllen's avatar
VoVAllen committed
258
259
260
        assert_array_equal(F.asnumpy(F.gather_row(pos_edges.parent_eid, pos_leid)),
                           F.asnumpy(g.edge_ids(F.gather_row(pos_edges.parent_nid, pos_lsrc),
                                                F.gather_row(pos_edges.parent_nid, pos_ldst))))
261
262

        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
263

VoVAllen's avatar
VoVAllen committed
264
265
266
        neg_src = F.gather_row(neg_edges.parent_nid, neg_lsrc)
        neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst)
        neg_eid = F.gather_row(neg_edges.parent_eid, neg_leid)
267
        for i in range(len(neg_eid)):
VoVAllen's avatar
VoVAllen committed
268
269
            neg_d = int(F.asnumpy(neg_dst)[i])
            neg_e = int(F.asnumpy(neg_eid)[i])
270
            assert (neg_d, neg_e) in pos_map
271
272
273
            if exclude_positive:
                assert int(F.asnumpy(neg_src[i])) != pos_map[(neg_d, neg_e)]

274
        check_head_tail(neg_edges)
VoVAllen's avatar
VoVAllen committed
275
276
        pos_tails = F.gather_row(pos_edges.parent_nid, pos_edges.tail_nid)
        neg_tails = F.gather_row(neg_edges.parent_nid, neg_edges.tail_nid)
277
278
279
280
        pos_tails = np.sort(F.asnumpy(pos_tails))
        neg_tails = np.sort(F.asnumpy(neg_tails))
        np.testing.assert_equal(pos_tails, neg_tails)

Da Zheng's avatar
Da Zheng committed
281
        exist = neg_edges.edata['false_neg']
282
283
284
285
286
287
288
289
290
291
        if exclude_positive:
            assert np.sum(F.asnumpy(exist) == 0) == len(exist)
        else:
            assert F.array_equal(g.has_edges_between(neg_src, neg_dst), exist)

    # Test the knowledge graph.
    for _, neg_edges in EdgeSampler(g, 50,
                                    negative_mode=mode,
                                    neg_sample_size=neg_size,
                                    exclude_positive=exclude_positive,
Da Zheng's avatar
Da Zheng committed
292
293
                                    relations=g.edata['etype'],
                                    return_false_neg=True):
294
        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
VoVAllen's avatar
VoVAllen committed
295
296
297
        neg_src = F.gather_row(neg_edges.parent_nid, neg_lsrc)
        neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst)
        neg_eid = F.gather_row(neg_edges.parent_eid, neg_leid)
Da Zheng's avatar
Da Zheng committed
298
        exists = neg_edges.edata['false_neg']
299
300
301
302
303
304
305
306
307
        neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
        for i in range(len(neg_eid)):
            u, v = F.asnumpy(neg_src[i]), F.asnumpy(neg_dst[i])
            if g.has_edge_between(u, v):
                eid = g.edge_id(u, v)
                etype = g.edata['etype'][eid]
                exist = neg_edges.edata['etype'][i] == etype
                assert F.asnumpy(exists[i]) == F.asnumpy(exist)

VoVAllen's avatar
VoVAllen committed
308
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Core dump")
309
def test_negative_sampler():
Da Zheng's avatar
Da Zheng committed
310
311
312
    check_negative_sampler('PBG-head', False, 10)
    check_negative_sampler('head', True, 10)
    check_negative_sampler('head', False, 10)
Da Zheng's avatar
Da Zheng committed
313
314
    #disable this check for now. It might take too long time.
    #check_negative_sampler('head', False, 100)
315
316


Da Zheng's avatar
Da Zheng committed
317
if __name__ == '__main__':
318
    test_create_full()
Da Zheng's avatar
Da Zheng committed
319
320
321
322
    test_1neighbor_sampler_all()
    test_10neighbor_sampler_all()
    test_1neighbor_sampler()
    test_10neighbor_sampler()
Da Zheng's avatar
Da Zheng committed
323
    test_layer_sampler()
324
    test_nonuniform_neighbor_sampler()
325
    test_setseed()
326
    test_negative_sampler()