// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This is an example illustrating the use of the deep learning tools from the
    dlib C++ Library.  I'm assuming you have already read the dnn_mnist_ex.cpp
    example.  So in this example program I'm going to go over a number of more
    advanced parts of the API, including:
        - Using multiple GPUs
        - Training on large datasets that don't fit in memory 
        - Defining large networks
        - Accessing and configuring layers in a network
*/

// The DNN module uses template-based network declarations that lead to very
// long type names.  Visual Studio will produce warning C4503 in such cases.
#ifdef _MSC_VER
#   pragma warning( disable: 4503 )
#endif

#include <dlib/dnn.h>
#include <iostream>
#include <dlib/data_io.h>

using namespace std;
using namespace dlib;

// ----------------------------------------------------------------------------------------

// Let's start by showing how you can conveniently define large and complex
// networks.  The most important tools for doing this are C++'s alias templates.
// These let us define new layer types that are combinations of a bunch of other
// layers.  These will form the building blocks for more complex networks.

// So let's begin by defining the building block of a residual network (see
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// and Sun).  We are going to decompose the residual block into a few alias
// statements.  First, we define the core block.

// Here we have parameterized the "block" layer on a BN layer (nominally some
// kind of batch normalization), the number of filter outputs N, and the stride
// the block operates at.
template <
    int N, 
    template <typename> class BN, 
    int stride, 
    typename SUBNET
    > 
using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;

// Next, we need to define the skip layer mechanism used in the residual network
// paper.  They create their blocks by adding the input tensor to the output of
// each block.  So we define an alias statement that takes a block and wraps it
// with this skip/add structure.

// Note the tag layer.  This layer doesn't do any computation.  It exists solely
// so other layers can refer to it.  In this case, the add_prev1 layer looks for
// the tag1 layer and will take the tag1 output and add it to the input of the
// add_prev1 layer.  This combination allows us to implement skip and residual
// style networks.  We have also set the block stride to 1 in this statement.
// The significance of that is explained next.
template <
    template <int,template<typename>class,int,typename> class block, 
    int N, 
    template<typename>class BN, 
    typename SUBNET
    >
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;

// Some residual blocks do downsampling.  They do this by using a stride of 2
// instead of 1.  However, when downsampling we need to also take care to
// downsample the part of the network that adds the original input to the output
// or the sizes won't make sense (the network will still run, but the results
// aren't as good).  So here we define a downsampling version of residual.  In
// it, we make use of the skip1 layer.  This layer simply outputs whatever is
// output by the tag1 layer.  Therefore, the skip1 layer (there are also skip2,
// skip3, etc. in dlib) allows you to create branching network structures.

// residual_down creates a network structure like this:
/*
         input from SUBNET
             /     \
            /       \
         block     downsample(using avg_pool)
            \       /
             \     /
           add tensors (using add_prev2 which adds the output of tag2 with avg_pool's output)
                |
              output
*/
template <
    template <int,template<typename>class,int,typename> class block, 
    int N, 
    template<typename>class BN, 
    typename SUBNET
    >
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;



// Now we can define 4 different residual blocks we will use in this example.
// The first two are non-downsampling residual blocks while the last two
// downsample.  Also, res and res_down use batch normalization while ares and
// ares_down have had the batch normalization replaced with simple affine
// layers.  We will use the affine version of the layers when testing our
// networks.
template <typename SUBNET> using res       = relu<residual<block,8,bn_con,SUBNET>>;
template <typename SUBNET> using ares      = relu<residual<block,8,affine,SUBNET>>;
template <typename SUBNET> using res_down  = relu<residual_down<block,8,bn_con,SUBNET>>;
template <typename SUBNET> using ares_down = relu<residual_down<block,8,affine,SUBNET>>;



// Now that we have these convenient aliases, we can define a residual network
// without a lot of typing.  Note the use of a repeat layer.  This special layer
// type allows us to type repeat<9,res,SUBNET> instead of
// res<res<res<res<res<res<res<res<res<SUBNET>>>>>>>>>.  It will also prevent
// the compiler from complaining about super deep template nesting when creating
// large networks.
const unsigned long number_of_classes = 10;
using net_type = loss_multiclass_log<fc<number_of_classes,
                            avg_pool_everything<
                            res<res<res<res_down<
                            repeat<9,res, // repeat this layer 9 times
                            res_down<
                            res<
                            input<matrix<unsigned char>>
                            >>>>>>>>>>;


// And finally, let's define a residual network building block that uses
// parametric ReLU units instead of regular ReLU.
template <typename SUBNET> 
using pres  = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;

// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    if (argc != 2)
    {
        cout << "This example needs the MNIST dataset to run!" << endl;
        cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl;
        cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl;
        cout << "put them in a folder.  Then give that folder as input to this program." << endl;
        return 1;
    }

    std::vector<matrix<unsigned char>> training_images;
    std::vector<unsigned long> training_labels;
    std::vector<matrix<unsigned char>> testing_images;
    std::vector<unsigned long> testing_labels;
    load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels);


    // dlib uses cuDNN under the covers.  One of the features of cuDNN is the
    // option to use slower methods that use less RAM or faster methods that use
    // a lot of RAM.  If you find that you run out of RAM on your graphics card
    // then you can call this function and we will request the slower but more
    // RAM frugal cuDNN algorithms.
    set_dnn_prefer_smallest_algorithms();


    // Create a network as defined above.  This network will produce 10 outputs
    // because that's how we defined net_type.  However, fc layers can have the
    // number of outputs they produce changed at runtime.  
    net_type net;
    // So if you wanted to use the same network but override the number of
    // outputs at runtime you can do so like this:
    net_type net2(num_fc_outputs(15));

    // Now, let's imagine we wanted to replace some of the relu layers with
    // prelu layers.  We might do it like this:
    using net_type2 = loss_multiclass_log<fc<number_of_classes,
                                avg_pool_everything<
                                pres<res<res<res_down< // 2 prelu layers here
                                tag4<repeat<9,pres,    // 9 groups, each containing 2 prelu layers  
                                res_down<
                                res<
                                input<matrix<unsigned char>>
                                >>>>>>>>>>>;

    // prelu layers have a floating point parameter.  If you want to set it to
    // something other than its default value you can do so like this:
    net_type2 pnet(prelu_(0.2),  
                   prelu_(0.25),
                   repeat_group(prelu_(0.3),prelu_(0.4)) // Initialize all the prelu instances in the repeat 
                                                         // layer.  repeat_group() is needed to group the 
                                                         // things that are part of repeat's block.
                   );
    // As you can see, a network will greedily assign things given to its
    // constructor to the layers inside itself.  The assignment is done in the
    // order the layers are defined, but it will skip layers where the
    // assignment doesn't make sense.  
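
    // To make that concrete, here is a small sketch (the variable name and the
    // 0.05 value are just for illustration and aren't used anywhere else in
    // this program).  Passing a single prelu_ argument customizes only the
    // first prelu layer the constructor reaches; every other layer is default
    // constructed.
    net_type2 pnet_first_only(prelu_(0.05));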

    // Now let's print the details of the pnet to the screen and inspect it.
    cout << "The pnet has " << pnet.num_layers << " layers in it." << endl;
    cout << pnet << endl;
    // These print statements will output this (I've truncated it since it's
    // long, but you get the idea):
    /*
        The pnet has 131 layers in it.
        layer<0>    loss_multiclass_log
        layer<1>    fc       (num_outputs=10) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<2>    avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
        layer<3>    prelu    (initial_param_value=0.2)
        layer<4>    add_prev1
        layer<5>    bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<6>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<7>    prelu    (initial_param_value=0.25)
        layer<8>    bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<9>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<10>   tag1
        ...
        layer<34>   relu
        layer<35>   bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<36>   con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<37>   tag1
        layer<38>   tag4
        layer<39>   prelu    (initial_param_value=0.3)
        layer<40>   add_prev1
        layer<41>   bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        ...
        layer<118>  relu
        layer<119>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<120>  con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<121>  tag1
        layer<122>  relu
        layer<123>  add_prev1
        layer<124>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<125>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<126>  relu
        layer<127>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<128>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<129>  tag1
        layer<130>  input<matrix>
    */

    // Now that we know the index numbers for each layer, we can access them
    // individually using layer<index>(pnet).  For example, to access the output
    // tensor for the first prelu layer we can say:
    layer<3>(pnet).get_output();
    // Or to print the prelu parameter for layer 7 we can say:
    cout << "prelu param: "<< layer<7>(pnet).layer_details().get_initial_param_value() << endl;

    // We can also access layers by their type.  This next statement finds the
    // first tag1 layer in pnet, and is therefore equivalent to calling
    // layer<10>(pnet):
    layer<tag1>(pnet);
    // The tag layers don't do anything at all and exist simply so you can tag
    // parts of your network and access them by layer<tag>().  You can also
    // index relative to a tag.  So for example, to access the layer immediately
    // after tag4 you can say:
    layer<tag4,1>(pnet); // Equivalent to layer<38+1>(pnet).

    // Or to access the layer 2 layers after tag4:
    layer<tag4,2>(pnet);
    // Tagging is a very useful tool for making complex network structures.  For
    // example, the add_prev1 layer is implemented internally by using a call to
    // layer<tag1>().
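    //
    // As a hypothetical sketch of how you could use tags yourself (this alias
    // isn't used anywhere in this program, and alias templates must be declared
    // at namespace scope rather than inside main()), you could build your own
    // skip connection the same way residual was defined above, just with a
    // different tag/add_prev pair:
    //
    //    template <typename SUBNET>
    //    using my_skip = add_prev3<con<8,3,3,1,1,tag3<SUBNET>>>;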



    // Ok, that's enough talk about defining and inspecting networks.  Let's
    // talk about training networks!

    // The dnn_trainer will use SGD by default, but you can tell it to use
    // different solvers like adam with a weight decay of 0.0005 and the given
    // momentum parameters. 
    dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999));
    // Also, if you have multiple graphics cards you can tell the trainer to use
    // them together to make the training faster.  For example, replacing the
    // above constructor call with this one would cause it to use GPU cards 0
    // and 1.
    //dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999), {0,1});

    trainer.be_verbose();
    trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100));
    // While the trainer is running it keeps an eye on the training error.  If
    // it looks like the error hasn't decreased for the last 2000 iterations it
    // will automatically multiply the learning rate by 0.1.  You can change these
    // default parameters to some other values by calling these functions.  Or
    // disable the automatic shrinking entirely by setting the shrink factor to 1.
    trainer.set_iterations_without_progress_threshold(2000);
    trainer.set_learning_rate_shrink_factor(0.1);
    // The learning rate will start at 1e-3.
    trainer.set_learning_rate(1e-3);


    // Now, what if your training dataset is so big it doesn't fit in RAM?  You
    // make mini-batches yourself, any way you like, and you send them to the
    // trainer by repeatedly calling trainer.train_one_step(). 
    //
    // For example, the loop below streams MNIST data to our trainer.
    std::vector<matrix<unsigned char>> mini_batch_samples;
    std::vector<unsigned long> mini_batch_labels; 
    dlib::rand rnd(time(0));
    // Loop until the trainer's automatic shrinking has shrunk the learning rate to 1e-6.
    // Given our settings, this means it will stop training after it has shrunk the
    // learning rate 3 times.
    while(trainer.get_learning_rate() >= 1e-6)
    {
        mini_batch_samples.clear();
        mini_batch_labels.clear();

        // make a 128 image mini-batch
        while(mini_batch_samples.size() < 128)
        {
            auto idx = rnd.get_random_32bit_number()%training_images.size();
            mini_batch_samples.push_back(training_images[idx]);
            mini_batch_labels.push_back(training_labels[idx]);
        }

        trainer.train_one_step(mini_batch_samples, mini_batch_labels);
    }

    // When you call train_one_step(), the trainer will do its processing in a
    // separate thread.  This allows the main thread to work on loading data
    // while the trainer is busy executing the mini-batches in parallel.
    // However, this also means we need to wait for any mini-batches that are
    // still executing to stop before we mess with the net object.  Calling
    // get_net() performs the necessary synchronization.
    trainer.get_net();

    net.clean();
    serialize("mnist_res_network.dat") << net;


    // Now we have a trained network.  However, it has batch normalization
    // layers in it.  As is customary, we should replace these with simple
    // affine layers before we use the network.  This can be accomplished by
    // making a network type which is identical to net_type but with the batch
    // normalization layers replaced with affine.  For example:
    using test_net_type = loss_multiclass_log<fc<number_of_classes,
                                avg_pool_everything<
                                ares<ares<ares<ares_down<
                                repeat<9,ares,
                                ares_down<
                                ares<
                                input<matrix<unsigned char>>
                                >>>>>>>>>>;
    // Then we can simply assign our trained net to our testing net.
    test_net_type tnet = net;
    // Or if you only had a file with your trained network you could deserialize
    // it directly into your testing network.  
    deserialize("mnist_res_network.dat") >> tnet;


    // And finally, we can run the testing network over our data.

    std::vector<unsigned long> predicted_labels = tnet(training_images);
    int num_right = 0;
    int num_wrong = 0;
    for (size_t i = 0; i < training_images.size(); ++i)
    {
        if (predicted_labels[i] == training_labels[i])
            ++num_right;
        else
            ++num_wrong;
        
    }
    cout << "training num_right: " << num_right << endl;
    cout << "training num_wrong: " << num_wrong << endl;
    cout << "training accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;

    predicted_labels = tnet(testing_images);
    num_right = 0;
    num_wrong = 0;
    for (size_t i = 0; i < testing_images.size(); ++i)
    {
        if (predicted_labels[i] == testing_labels[i])
            ++num_right;
        else
            ++num_wrong;
        
    }
    cout << "testing num_right: " << num_right << endl;
    cout << "testing num_wrong: " << num_wrong << endl;
    cout << "testing accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;

}
catch(std::exception& e)
{
    cout << e.what() << endl;
}