"""
.. _sampling:

NodeFlow and Sampling
=======================================

**Author**: Ziyue Huang, Da Zheng, Quan Gan, Jinjing Zhou, Zheng Zhang
"""
################################################################################################
#
# GCN
# ~~~
#
# In an :math:`L`-layer graph convolutional network (GCN), given a graph
# :math:`G=(V, E)`, represented as an adjacency matrix :math:`A`, with
# node features :math:`H^{(0)} = X \in \mathbb{R}^{|V| \times d}`, the
# hidden feature :math:`h_v^{(l+1)}` of a node :math:`v` in the
# :math:`(l+1)`-th layer depends on the features :math:`h_u^{(l)}` of all
# its neighbors in the previous layer:
#
# .. math::
#
#
#    z_v^{(l+1)} = \sum_{u \in \mathcal{N}(v)} \tilde{A}_{uv} h_u^{(l)} \qquad h_v^{(l+1)} = \sigma ( z_v^{(l+1)} W^{(l)})
#
# where :math:`\mathcal{N}(v)` is the neighborhood of :math:`v`,
# :math:`\tilde{A}` can be any normalized version of :math:`A`, such as the
# row-normalized :math:`D^{-1} A` (Kipf & Welling use the symmetric
# normalization :math:`D^{-1/2} A D^{-1/2}`), :math:`\sigma(\cdot)` is an
# activation function, and :math:`W^{(l)}` is the trainable weight matrix
# of the :math:`l`-th layer.
#
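# As a toy illustration (not part of the model trained later), one GCN layer
# amounts to two dense matrix products,
# :math:`H^{(l+1)} = \sigma(\tilde{A} H^{(l)} W^{(l)})`. The sketch below
# uses plain NumPy with a row-normalized :math:`\tilde{A} = D^{-1} A` and
# random weights.

import numpy as np

A_toy = np.array([[0., 1., 1.],
                  [1., 0., 1.],
                  [1., 1., 0.]])                    # adjacency of a 3-node toy graph
A_norm = A_toy / A_toy.sum(axis=1, keepdims=True)   # \tilde{A} = D^{-1} A
H0 = np.random.randn(3, 4)                          # input features X, d = 4
W0 = np.random.randn(4, 2)                          # layer-0 weights
H1 = np.maximum(A_norm.dot(H0).dot(W0), 0.)         # z = \tilde{A} H W, then ReLU
print(H1.shape)                                     # (3, 2)

################################################################################################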
# In the node classification task we minimize the following loss:
#
# .. math::
#
#
#    \frac{1}{\vert \mathcal{V}_\mathcal{L} \vert} \sum_{v \in \mathcal{V}_\mathcal{L}} f(y_v, z_v^{(L)})
#
# where :math:`y_v` is the label of :math:`v`, and :math:`f(\cdot, \cdot)`
# is a loss function, e.g., cross entropy loss.
#
# While training GCN on the full graph, each node aggregates the hidden
# features of its neighbors to compute its hidden feature in the next
# layer.
#
# In this tutorial, we will run GCN on the Reddit dataset constructed by `Hamilton et
# al. <https://arxiv.org/abs/1706.02216>`__, where each node is a post and
# two posts are connected by an edge if the same user commented on both.
# The task is to predict the category a post belongs to. This graph has
# 233K nodes, 114.6M edges, and 41 categories. Let's first load the Reddit graph.
#
import numpy as np
import dgl
import dgl.function as fn
from dgl import DGLGraph
from dgl.data import RedditDataset
import mxnet as mx
from mxnet import gluon

# load dataset
data = RedditDataset(self_loop=True)
train_nid = mx.nd.array(np.nonzero(data.train_mask)[0]).astype(np.int64)
features = mx.nd.array(data.features)
in_feats = features.shape[1]
labels = mx.nd.array(data.labels)
n_classes = data.num_labels

# construct DGLGraph and prepare related data
g = DGLGraph(data.graph, readonly=True)
g.ndata['features'] = features

################################################################################################
# Here we define the node UDF which has a fully-connected layer:
#

class NodeUpdate(gluon.Block):
    def __init__(self, in_feats, out_feats, activation=None):
        super(NodeUpdate, self).__init__()
        self.dense = gluon.nn.Dense(out_feats, in_units=in_feats)
        self.activation = activation

    def forward(self, node):
        h = node.data['h']
        h = self.dense(h)
        if self.activation:
            h = self.activation(h)
        return {'activation': h}

################################################################################################
# In DGL, we implement GCN on the full graph with ``update_all`` in ``DGLGraph``.
# The following code performs two-layer GCN on the Reddit graph.
#

# number of GCN layers
L = 2
# number of hidden units of a fully connected layer
n_hidden = 64

layers = [NodeUpdate(g.ndata['features'].shape[1], n_hidden, mx.nd.relu),
          NodeUpdate(n_hidden, n_hidden, mx.nd.relu)]
for layer in layers:
    layer.initialize()

h = g.ndata['features']
for i in range(L):
    g.ndata['h'] = h
    g.update_all(message_func=fn.copy_src(src='h', out='m'),
                 reduce_func=fn.sum(msg='m', out='h'),
                 apply_node_func=lambda node: {'h': layers[i](node)['activation']})
    h = g.ndata.pop('h')

##############################################################################
# NodeFlow
# ~~~~~~~~~~~~~~~~~
#
# As the graph scales up to billions of nodes or edges, training on the
# full graph would no longer be efficient or even feasible.
#
# Mini-batch training allows us to control the computation and memory
# usage within some budget. The training loss for each iteration is
#
# .. math::
#
#    \frac{1}{\vert \tilde{\mathcal{V}}_\mathcal{L} \vert} \sum_{v \in \tilde{\mathcal{V}}_\mathcal{L}} f(y_v, z_v^{(L)})
#
# where :math:`\tilde{\mathcal{V}}_\mathcal{L}` is a subset sampled from
# the total labeled nodes :math:`\mathcal{V}_\mathcal{L}` uniformly at
# random.
#
# Stemming from the labeled nodes :math:`\tilde{\mathcal{V}}_\mathcal{L}`
# in a mini-batch and tracing back to the input forms a computational
# dependency graph (a directed acyclic graph or DAG in short), which
# captures the computation flow of :math:`Z^{(L)}`.
#
# In the example below, a mini-batch to compute the hidden features of
# node D in layer 2 requires hidden features of A, B, E, G in layer 1,
# which in turn requires hidden features of C, D, F in layer 0.
#
# |image0|
#
# For that purpose, we define ``NodeFlow`` to represent this computation
# flow.
#
# ``NodeFlow`` is a type of layered graph, where nodes are organized in
# :math:`L + 1` sequential *layers*, and edges only exist between adjacent
# layers, forming *blocks*. We construct ``NodeFlow`` backwards, starting
# from the last layer with all the nodes whose hidden features are
# requested. The set of nodes the next layer depends on forms the previous
# layer. An edge connects a node in the previous layer to another in the
# next layer iff the latter depends on the former. We repeat this process
# until all :math:`L + 1` layers are constructed. The features of the nodes
# in each layer, and those of the edges in each block, are stored as
# separate tensors.
#
# ``NodeFlow`` provides ``block_compute`` for per-block computation, which
# triggers computation and data propagation from the lower layer to the
# next upper layer.
#
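#
# As a quick, illustrative sketch (it uses the neighbor sampler introduced
# in the next section, and assumes the ``dgl.contrib.sampling`` API of this
# DGL version), we can draw a single ``NodeFlow`` and inspect its layers:

nf_example = next(iter(dgl.contrib.sampling.NeighborSampler(
    g, 4, 4, neighbor_type='in', num_hops=L, seed_nodes=train_nid)))
print('layers: {}, blocks: {}'.format(nf_example.num_layers, nf_example.num_blocks))
for i in range(nf_example.num_layers):
    # layer_parent_nid(i) maps nodes in layer i back to IDs in the parent graph
    print('layer {}: {} nodes'.format(i, len(nf_example.layer_parent_nid(i))))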

##############################################################################
# Neighbor Sampling
# ~~~~~~~~~~~~~~~~~
#
# Real-world graphs often have nodes with large degrees, so even a
# moderately deep (e.g., 3-layer) GCN may depend on the input features of
# almost the entire graph, even when we only need the outputs of a few
# nodes. This makes full-neighborhood computation cost-ineffective.
#
# Sampling methods mitigate this problem by effectively reducing the
# receptive field of each node.
#
# Instead of using all the :math:`L`-hop neighbors of a node :math:`v`,
# `Hamilton et al. <https://arxiv.org/abs/1706.02216>`__ propose *neighbor
# sampling*, which randomly samples a few neighbors
# :math:`\hat{\mathcal{N}}^{(l)}(v)` to estimate the aggregation
# :math:`z_v^{(l+1)}` of its total neighbors :math:`\mathcal{N}(v)` in
# :math:`l`-th GCN layer, by an unbiased estimator
# :math:`\hat{z}_v^{(l+1)}`
#
# .. math::
#
#
#    \hat{z}_v^{(l+1)} = \frac{\vert \mathcal{N}(v) \vert }{\vert \hat{\mathcal{N}}^{(l)}(v) \vert} \sum_{u \in \hat{\mathcal{N}}^{(l)}(v)} \tilde{A}_{uv} \hat{h}_u^{(l)} \qquad
#    \hat{h}_v^{(l+1)} = \sigma ( \hat{z}_v^{(l+1)} W^{(l)} )
#
# Let :math:`D^{(l)}` be the number of neighbors sampled for each node at
# the :math:`l`-th layer; then *neighbor sampling* bounds the receptive
# field size of each node by :math:`\prod_{l=0}^{L-1} D^{(l)}`.
#
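#
# As a small NumPy sketch (illustrative only, not DGL code): the estimator
# averages over a random subset of neighbors and rescales by
# :math:`\vert \mathcal{N}(v) \vert / \vert \hat{\mathcal{N}}^{(l)}(v) \vert`,
# which makes it unbiased.

import numpy as np

h = np.random.randn(10, 4)            # layer-l features of the 10 neighbors of v
a = np.full(10, 1. / 10)              # \tilde{A}_{uv} under D^{-1} A normalization
z_full = (a[:, None] * h).sum(axis=0)                    # exact aggregation
idx = np.random.choice(10, size=2, replace=False)        # sample 2 neighbors
z_hat = (10. / 2) * (a[idx, None] * h[idx]).sum(axis=0)  # unbiased estimate
print(z_full, z_hat)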

##############################################################################
# We then implement *neighbor sampling* with ``NodeFlow``:
#

class GCNSampling(gluon.Block):
    def __init__(self,
                 in_feats,
                 n_hidden,
                 n_classes,
                 n_layers,
                 activation,
                 dropout,
                 **kwargs):
        super(GCNSampling, self).__init__(**kwargs)
        self.dropout = dropout
        self.n_layers = n_layers
        with self.name_scope():
            self.layers = gluon.nn.Sequential()
            # input layer
            self.layers.add(NodeUpdate(in_feats, n_hidden, activation))
            # hidden layers
            for i in range(1, n_layers-1):
                self.layers.add(NodeUpdate(n_hidden, n_hidden, activation))
            # output layer
            self.layers.add(NodeUpdate(n_hidden, n_classes))

    def forward(self, nf):
        nf.layers[0].data['activation'] = nf.layers[0].data['features']
        for i, layer in enumerate(self.layers):
            h = nf.layers[i].data.pop('activation')
            if self.dropout:
                h = mx.nd.Dropout(h, p=self.dropout)
            nf.layers[i].data['h'] = h
            # block_compute() computes the features of layer i+1 given
            # layer i, with the given message, reduce, and apply functions.
            # Here, we aggregate the neighbor features from the previous
            # layer and transform them with the `layer` function.
            nf.block_compute(i,
                             fn.copy_src(src='h', out='m'),
                             lambda node : {'h': node.mailbox['m'].mean(axis=1)},
                             layer)
        h = nf.layers[-1].data.pop('activation')
        return h

##############################################################################
# DGL provides ``NeighborSampler`` to construct the ``NodeFlow`` for a
# mini-batch according to the computation logic of neighbor sampling.
# ``NeighborSampler``
# returns an iterator that generates a ``NodeFlow`` at each step. It exposes
# many options for customizing the sampler's behavior, such as the number
# of neighbors to sample and the number of hops. Please see `its API
# documentation <https://doc.dgl.ai/api/python/sampler.html>`__ for more
# details.
#

# dropout probability
dropout = 0.2
# batch size
batch_size = 10000
# number of neighbors to sample
num_neighbors = 8
# number of epochs
num_epochs = 1

# initialize the model and cross entropy loss
model = GCNSampling(in_feats, n_hidden, n_classes, L,
                    mx.nd.relu, dropout, prefix='GCN')
model.initialize()
loss_fcn = gluon.loss.SoftmaxCELoss()

# use adam optimizer
trainer = gluon.Trainer(model.collect_params(), 'adam',
                        {'learning_rate': 0.03, 'wd': 0})

for epoch in range(num_epochs):
    for nf in dgl.contrib.sampling.NeighborSampler(g, batch_size,
                                                   num_neighbors,
                                                   neighbor_type='in',
                                                   shuffle=True,
                                                   num_hops=L,
                                                   seed_nodes=train_nid):
        # When `NodeFlow` is generated from `NeighborSampler`, it only contains
        # the topology structure, on which there is no data attached.
        # Users need to call `copy_from_parent` to copy specific data,
        # such as input node features, from the original graph.
        nf.copy_from_parent()
        with mx.autograd.record():
            # forward
            pred = model(nf)
            batch_nids = nf.layer_parent_nid(-1).astype('int64')
            batch_labels = labels[batch_nids]
            # cross entropy loss
            loss = loss_fcn(pred, batch_labels)
            loss = loss.sum() / len(batch_nids)
        # backward
        loss.backward()
        # optimization
        trainer.step(batch_size=1)
        print("Epoch[{}]: loss {}".format(epoch, loss.asscalar()))

##############################################################################
# Control Variate
# ~~~~~~~~~~~~~~~
#
# The unbiased estimator :math:`\hat{Z}^{(\cdot)}` used in *neighbor
# sampling* may suffer from high variance, so it still requires a
# relatively large number of neighbors, e.g., :math:`D^{(0)}=25` and
# :math:`D^{(1)}=10` in `Hamilton et
# al. <https://arxiv.org/abs/1706.02216>`__. With a *control variate*, a
# standard variance reduction technique widely used in Monte Carlo
# methods, sampling as few as 2 neighbors per node suffices.
#
# The *control variate* method works as follows: to estimate the
# expectation :math:`\mathbb{E} [X] = \theta` of a random variable
# :math:`X`, we find another random variable :math:`Y` that is highly
# correlated with :math:`X` and whose expectation :math:`\mathbb{E} [Y]`
# is easy to compute. The *control variate* estimator :math:`\tilde{X}` is
#
# .. math::
#
#    \tilde{X} = X - Y + \mathbb{E} [Y] \qquad \mathbb{VAR} [\tilde{X}] = \mathbb{VAR} [X] + \mathbb{VAR} [Y] - 2 \cdot \mathbb{COV} [X, Y]
#
# If :math:`\mathbb{VAR} [Y] - 2\mathbb{COV} [X, Y] < 0`, then
# :math:`\mathbb{VAR} [\tilde{X}] < \mathbb{VAR} [X]`.
#
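# As a tiny Monte Carlo demonstration (illustrative NumPy, unrelated to the
# GCN code), we estimate :math:`\mathbb{E}[e^U]` for
# :math:`U \sim \mathrm{Uniform}(0, 1)` with :math:`Y = U`
# (:math:`\mathbb{E}[Y] = 1/2`) as the control variate:

import numpy as np

u = np.random.rand(100000)
x = np.exp(u)                     # X = e^U, E[X] = e - 1
x_tilde = x - u + 0.5             # control variate estimator X - Y + E[Y]
print(x.mean(), x_tilde.mean())   # both are close to e - 1 (~1.718)
print(x.var(), x_tilde.var())     # the variance of the latter is much smaller

##############################################################################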
# `Chen et al. <https://arxiv.org/abs/1710.10568>`__ proposed a *control
# variate*-based estimator for GCN training that uses the history
# :math:`\bar{H}^{(l)}` of the nodes that are not sampled. The modified
# estimator :math:`\hat{z}_v^{(l+1)}` is
#
# .. math::
#
#
#    \hat{z}_v^{(l+1)} = \frac{\vert \mathcal{N}(v) \vert }{\vert \hat{\mathcal{N}}^{(l)}(v) \vert} \sum_{u \in \hat{\mathcal{N}}^{(l)}(v)} \tilde{A}_{uv} ( \hat{h}_u^{(l)} - \bar{h}_u^{(l)} ) + \sum_{u \in \mathcal{N}(v)} \tilde{A}_{uv} \bar{h}_u^{(l)} \\
#    \hat{h}_v^{(l+1)} = \sigma ( \hat{z}_v^{(l+1)} W^{(l)} )
#
# This method can also be *conceptually* implemented in DGL as shown
# below,
#

have_large_memory = False
# The control-variate sampling code below needs to run on a large-memory
# machine for the Reddit graph.
if have_large_memory:
    g.ndata['h_0'] = features
    for i in range(L):
        g.ndata['h_{}'.format(i+1)] = mx.nd.zeros((features.shape[0], n_hidden))
    # With control-variate sampling, we only need to sample 2 neighbors to train GCN.
    for nf in dgl.contrib.sampling.NeighborSampler(g, batch_size, expand_factor=2,
                                                   neighbor_type='in', num_hops=L,
                                                   seed_nodes=train_nid):
        for i in range(nf.num_blocks):
            # aggregate history on the original graph
            g.pull(nf.layer_parent_nid(i+1),
                   fn.copy_src(src='h_{}'.format(i), out='m'),
                   lambda node: {'agg_h_{}'.format(i): node.mailbox['m'].mean(axis=1)})
        nf.copy_from_parent()
        h = nf.layers[0].data['features']
        for i in range(nf.num_blocks):
            prev_h = nf.layers[i].data['h_{}'.format(i)]
            # compute delta_h, the difference of the current activation and the history
            nf.layers[i].data['delta_h'] = h - prev_h
            # refresh the old history
            nf.layers[i].data['h_{}'.format(i)] = h.detach()
            # aggregate the delta_h
            nf.block_compute(i,
                             fn.copy_src(src='delta_h', out='m'),
                             lambda node: {'delta_h': node.mailbox['m'].mean(axis=1)})
            delta_h = nf.layers[i + 1].data['delta_h']
            agg_h = nf.layers[i + 1].data['agg_h_{}'.format(i)]
            # control variate estimator
            nf.layers[i + 1].data['h'] = delta_h + agg_h
            nf.apply_layer(i + 1, lambda node : {'h' : layers[i](node)['activation']})
            h = nf.layers[i + 1].data['h']
        # update history
        nf.copy_to_parent()

##############################################################################
# You can find the full examples here: `MXNet
# code <https://github.com/dmlc/dgl/blob/master/examples/mxnet/sampling/>`__
# and `PyTorch
# code <https://github.com/dmlc/dgl/tree/master/examples/pytorch/sampling>`__.
#
# The figure below shows the performance of graph convolutional networks
# and GraphSAGE with neighbor sampling and control variate sampling on the
# Reddit dataset. Our GraphSAGE with control variate sampling achieves over
# 96% test accuracy even when sampling only one neighbor. |image1|
#
# More APIs
# ~~~~~~~~~
#
# In fact, ``block_compute`` is just one of the APIs that come with
# ``NodeFlow``, which provides the flexibility to explore new ideas. The
# computation flow underlying the DAG can also be executed in one sweep by
# calling ``prop_flow``.
#
# ``prop_flow`` accepts a list of UDFs. The code below defines a node update
# UDF for each layer.
#

apply_node_funcs = [
    lambda node : {'h' : layers[0](node)['activation']},
    lambda node : {'h' : layers[1](node)['activation']},
]
for nf in dgl.contrib.sampling.NeighborSampler(g, batch_size, num_neighbors,
                                               neighbor_type='in', num_hops=L,
                                               seed_nodes=train_nid):
    nf.copy_from_parent()
    nf.layers[0].data['h'] = nf.layers[0].data['features']
    nf.prop_flow(fn.copy_src(src='h', out='m'),
                 fn.sum(msg='m', out='h'), apply_node_funcs)

##############################################################################
# Internally, ``prop_flow`` triggers the computation by fusing together
# all the block computations, from the input to the top. The main
# advantages of this API are 1) simplicity, and 2) leaving room for more
# system-level optimization in the future.
#
# .. |image0| image:: https://s3.us-east-2.amazonaws.com/dgl.ai/tutorial/sampling/NodeFlow.png
# .. |image1| image:: https://s3.us-east-2.amazonaws.com/dgl.ai/tutorial/sampling/sampling_result.png
#