// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This is an example illustrating the use of the deep learning tools from the
    dlib C++ Library.  I'm assuming you have already read the dnn_mnist_ex.cpp
    example.  So in this example program I'm going to go over a number of more
    advanced parts of the API, including:
        - Using multiple GPUs
        - Training on large datasets that don't fit in memory
        - Defining large networks
        - Accessing and configuring layers in a network
*/


#include <dlib/dnn.h>
#include <iostream>
#include <dlib/data_io.h>

using namespace std;
using namespace dlib;

// ----------------------------------------------------------------------------------------

// Let's start by showing how you can conveniently define large networks.  The
// most important tools for doing this are C++'s alias templates.  These let us
// define new layer types that are combinations of a bunch of other layers.
// These will form the building blocks for more complex networks.

// So let's begin by defining the building block of a residual network (see
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// and Sun).  You can see a few things in this statement.  The most obvious is
// that we have combined a bunch of layers into the name "base_res".  You can
// also see the use of the tag1 layer.  This layer doesn't do any computation.
// It exists solely so other layers can refer to it.  In this case, the
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// add it to the input of the add_prev1 layer.  This combination allows us to
// implement skip and residual style networks.  We have also made base_res
// parameterized by BN, which will let us insert different batch normalization
// layers.
// Residual building block.  Reading from the innermost layer outwards:
//   tag1            - marks the block's input so add_prev1 can find it
//   con<8,3,3,1,1>  - convolution with 8 filters, a 3x3 window and stride 1
//   BN + relu       - normalization layer (chosen by the caller) and activation
//   con + BN        - second convolution/normalization pair
//   add_prev1       - adds the tag1 output to the output of the layers above
//   relu            - final activation
template <
    template <typename> class BN,
    typename SUBNET
    >
using base_res = relu<add_prev1<
                 BN<con<8,3,3,1,1,
                 relu<
                 BN<con<8,3,3,1,1,
                 tag1<SUBNET>
                 >>>>>>>;

// We also want a residual block that begins by doing downsampling.  We can
// reuse base_res to define it like this:
// Downsampling residual block: SUBNET is first passed through an
// avg_pool<1,1,2,2> layer (1x1 window, stride 2 in both directions) and the
// result is fed to an ordinary base_res block using the same BN layer type.
template <
    template <typename> class BN,
    typename SUBNET
    >
using base_res_down = base_res<
                          BN,
                          avg_pool<1,1,2,2,SUBNET>
                          >;

// Now we can define 4 different residual blocks we will use in this example.
// The first two are non-downsampling residual blocks while the last two
// downsample.  Also, res and res_down use batch normalization while ares and
// ares_down have had the batch normalization replaced with simple affine
// layers.  We will use the affine version of the layers when testing our
// networks.
// Residual block with batch normalization (bn_con).
template <typename SUBNET>
using res = base_res<bn_con, SUBNET>;

// Residual block with the batch normalization replaced by an affine layer.
template <typename SUBNET>
using ares = base_res<affine, SUBNET>;

// Downsampling residual block with batch normalization (bn_con).
template <typename SUBNET>
using res_down = base_res_down<bn_con, SUBNET>;

// Downsampling residual block with the batch normalization replaced by an
// affine layer.
template <typename SUBNET>
using ares_down = base_res_down<affine, SUBNET>;

// Now that we have these convenient aliases, we can define a residual network
// without a lot of typing.  Note the use of a repeat layer.  This special layer
// type allows us to type repeat<9,res,SUBNET> instead of
// res<res<res<res<res<res<res<res<res<SUBNET>>>>>>>>>.  It will also prevent
// the compiler from complaining about super deep template nesting when creating
// large networks.
// Number of outputs produced by the final fc layer of the networks below.
const unsigned long number_of_classes = 10;

// The training network.  Reading from the input upwards: one res block, a
// downsampling res block, nine repeated res blocks, another downsampling res
// block, three more res blocks, global average pooling, and finally a fully
// connected layer feeding the multiclass log loss.
using net_type = loss_multiclass_log<fc<number_of_classes,
                            avg_pool_everything<
                            res<res<res<res_down<
                            repeat<9,res, // repeat this layer 9 times
                            res_down<
                            res<
                            input<matrix<unsigned char>>
                            >>>>>>>>>>;

// And finally, let's define a residual network building block that uses
// parametric ReLU units instead of regular ReLU.
template <typename SUBNET> 
Davis King's avatar
Davis King committed
80
using pres  = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;

// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    if (argc != 2)
    {
Davis King's avatar
Davis King committed
88
89
90
91
        cout << "This example needs the MNIST dataset to run!" << endl;
        cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl;
        cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl;
        cout << "put them in a folder.  Then give that folder as input to this program." << endl;
92
93
94
95
96
97
98
99
100
101
        return 1;
    }

    std::vector<matrix<unsigned char>> training_images;
    std::vector<unsigned long> training_labels;
    std::vector<matrix<unsigned char>> testing_images;
    std::vector<unsigned long> testing_labels;
    load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels);


Davis King's avatar
Davis King committed
102
103
104
105
106
    // dlib uses cuDNN under the covers.  One of the features of cuDNN is the
    // option to use slower methods that use less RAM or faster methods that use
    // a lot of RAM.  If you find that you run out of RAM on your graphics card
    // then you can call this function and we will request the slower but more
    // RAM frugal cuDNN algorithms.
107
    set_dnn_prefer_smallest_algorithms();
108
109


Davis King's avatar
Davis King committed
110
111
112
    // Create a network as defined above.  This network will produce 10 outputs
    // because that's how we defined net_type.  However, fc layers can have the
    // number of outputs they produce changed at runtime.  
113
    net_type net;
Davis King's avatar
Davis King committed
114
115
    // So if you wanted to use the same network but override the number of
    // outputs at runtime you can do so like this:
116
117
    net_type net2(num_fc_outputs(15));

Davis King's avatar
Davis King committed
118
119
120
    // Now, let's imagine we wanted to replace some of the relu layers with
    // prelu layers.  We might do it like this:
    using net_type2 = loss_multiclass_log<fc<number_of_classes,
121
                                avg_pool_everything<
122
123
124
125
                                pres<res<res<res_down< // 2 prelu layers here
                                tag4<repeat<9,pres,    // 9 groups, each containing 2 prelu layers  
                                res_down<
                                res<
Davis King's avatar
Davis King committed
126
127
                                input<matrix<unsigned char>>
                                >>>>>>>>>>>;
128

Davis King's avatar
Davis King committed
129
130
    // prelu layers have a floating point parameter.  If you want to set it to
    // something other than its default value you can do so like this:
131
    net_type2 pnet(prelu_(0.2),  
132
                   prelu_(0.25),
133
                   repeat_group(prelu_(0.3),prelu_(0.4)) // Initialize all the prelu instances in the repeat 
Davis King's avatar
Davis King committed
134
135
                                                         // layer.  repeat_group() is needed to group the 
                                                         // things that are part of repeat's block.
136
                   );
Davis King's avatar
Davis King committed
137
138
139
140
141
    // As you can see, a network will greedily assign things given to its
    // constructor to the layers inside itself.  The assignment is done in the
    // order the layers are defined, but it will skip layers where the
    // assignment doesn't make sense.  

142
143
144
145
146
147
    // Now let's print the details of the pnet to the screen and inspect it.
    cout << "The pnet has " << pnet.num_layers << " layers in it." << endl;
    cout << pnet << endl;
    // These print statements will output this (I've truncated it since it's
    // long, but you get the idea):
    /*
148
149
150
151
152
153
154
155
156
157
158
159
        The pnet has 127 layers in it.
        layer<0>    loss_multiclass_log
        layer<1>    fc       (num_outputs=10)
        layer<2>    avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
        layer<3>    prelu    (initial_param_value=0.2)
        layer<4>    add_prev
        layer<5>    bn_con
        layer<6>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
        layer<7>    prelu    (initial_param_value=0.25)
        layer<8>    bn_con
        layer<9>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
        layer<10>   tag1
160
        ...
161
162
163
164
165
166
167
        layer<33>   con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
        layer<34>   tag1
        layer<35>   avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
        layer<36>   tag4
        layer<37>   prelu    (initial_param_value=0.3)
        layer<38>   add_prev
        layer<39>   bn_con
168
        ...
169
170
171
172
173
174
175
176
177
178
179
180
        layer<115>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
        layer<116>  tag1
        layer<117>  avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
        layer<118>  relu
        layer<119>  add_prev
        layer<120>  bn_con
        layer<121>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
        layer<122>  relu
        layer<123>  bn_con
        layer<124>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
        layer<125>  tag1
        layer<126>  input<matrix>
181
182
183
184
185
    */

    // Now that we know the index numbers for each layer, we can access them
    // individually using layer<index>(pnet).  For example, to access the output
    // tensor for the first prelu layer we can say:
Davis King's avatar
Davis King committed
186
    layer<3>(pnet).get_output();
187
188
189
190
191
192
    // Or to print the prelu parameter for layer 7 we can say:
    cout << "prelu param: "<< layer<7>(pnet).layer_details().get_initial_param_value() << endl;

    // We can also access layers by their type.  This next statement finds the
    // first tag1 layer in pnet, and is therefore equivalent to calling
    // layer<10>(pnet):
Davis King's avatar
Davis King committed
193
    layer<tag1>(pnet);
194
195
196
197
    // The tag layers don't do anything at all and exist simply so you can tag
    // parts of your network and access them by layer<tag>().  You can also
    // index relative to a tag.  So for example, to access the layer immediately
    // after tag4 you can say:
198
    layer<tag4,1>(pnet); // Equivalent to layer<36+1>(pnet).
199
200
201
202
203
204

    // Or to access the layer 2 layers after tag4:
    layer<tag4,2>(pnet);
    // Tagging is a very useful tool for making complex network structures.  For
    // example, the add_prev1 layer is implemented internally by using a call to
    // layer<tag1>().
205

206
207


208
209
    // Ok, that's enough talk about defining and inspecting networks.  Let's
    // talk about training networks!
Davis King's avatar
Davis King committed
210
211

    // The dnn_trainer will use SGD by default, but you can tell it to use
212
213
214
    // different solvers like adam with a weight decay of 0.0005 and the given
    // momentum parameters. 
    dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999));
215
216
217
218
    // Also, if you have multiple graphics cards you can tell the trainer to use
    // them together to make the training faster.  For example, replacing the
    // above constructor call with this one would cause it to use GPU cards 0
    // and 1.
219
    //dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999), {0,1});
220

221
222
    trainer.be_verbose();
    trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100));
Davis King's avatar
Davis King committed
223
224
    // While the trainer is running it keeps an eye on the training error.  If
    // it looks like the error hasn't decreased for the last 2000 iterations it
225
    // will automatically reduce the learning rate by 0.1.  You can change these
Davis King's avatar
Davis King committed
226
    // default parameters to some other values by calling these functions.  Or
227
    // disable the automatic shrinking entirely by setting the shrink amount to 1.
Davis King's avatar
Davis King committed
228
    trainer.set_iterations_without_progress_threshold(2000);
229
230
231
    trainer.set_learning_rate_shrink_amount(0.1);
    // The learning rate will start at 1e-3.
    trainer.set_learning_rate(1e-3);
Davis King's avatar
Davis King committed
232
233
234
235
236
237
238


    // Now, what if your training dataset is so big it doesn't fit in RAM?  You
    // make mini-batches yourself, any way you like, and you send them to the
    // trainer by repeatedly calling trainer.train_one_step(). 
    //
    // For example, the loop below stream MNIST data to out trainer.
239
240
    std::vector<matrix<unsigned char>> mini_batch_samples;
    std::vector<unsigned long> mini_batch_labels; 
241
    dlib::rand rnd(time(0));
242
243
244
245
    // Loop until the trainer's automatic shrinking has shrunk the learning rate to 1e-6.
    // Given our settings, this means it will stop training after it has shrunk the
    // learning rate 3 times.
    while(trainer.get_learning_rate() >= 1e-6)
246
247
248
249
250
251
252
253
254
255
256
257
258
259
    {
        mini_batch_samples.clear();
        mini_batch_labels.clear();

        // make a 128 image mini-batch
        while(mini_batch_samples.size() < 128)
        {
            auto idx = rnd.get_random_32bit_number()%training_images.size();
            mini_batch_samples.push_back(training_images[idx]);
            mini_batch_labels.push_back(training_labels[idx]);
        }

        trainer.train_one_step(mini_batch_samples, mini_batch_labels);
    }
Davis King's avatar
Davis King committed
260
261
262
263
264
265
266

    // When you call train_one_step(), the trainer will do its processing in a
    // separate thread.  This allows the main thread to work on loading data
    // while the trainer is busy executing the mini-batches in parallel.
    // However, this also means we need to wait for any mini-batches that are
    // still executing to stop before we mess with the net object.  Calling
    // get_net() performs the necessary synchronization.
267
268
    trainer.get_net();

Davis King's avatar
Davis King committed
269

270
    net.clean();
Davis King's avatar
Davis King committed
271
272
273
    serialize("mnist_res_network.dat") << net;


Davis King's avatar
Davis King committed
274
275
276
277
278
279
    // Now we have a trained network.  However, it has batch normalization
    // layers in it.  As is customary, we should replace these with simple
    // affine layers before we use the network.  This can be accomplished by
    // making a network type which is identical to net_type but with the batch
    // normalization layers replaced with affine.  For example:
    using test_net_type = loss_multiclass_log<fc<number_of_classes,
280
                                avg_pool_everything<
281
282
283
                                ares<ares<ares<ares_down<
                                repeat<9,res,
                                ares_down<
Davis King's avatar
Davis King committed
284
                                ares<
Davis King's avatar
Davis King committed
285
286
                                input<matrix<unsigned char>>
                                >>>>>>>>>>;
Davis King's avatar
Davis King committed
287
    // Then we can simply assign our trained net to our testing net.
Davis King's avatar
Davis King committed
288
    test_net_type tnet = net;
Davis King's avatar
Davis King committed
289
290
    // Or if you only had a file with your trained network you could deserialize
    // it directly into your testing network.  
Davis King's avatar
Davis King committed
291
292
    deserialize("mnist_res_network.dat") >> tnet;

293

Davis King's avatar
Davis King committed
294
295
    // And finally, we can run the testing network over our data.

Davis King's avatar
Davis King committed
296
    std::vector<unsigned long> predicted_labels = tnet(training_images);
297
298
299
300
301
302
303
304
305
306
307
308
309
310
    int num_right = 0;
    int num_wrong = 0;
    for (size_t i = 0; i < training_images.size(); ++i)
    {
        if (predicted_labels[i] == training_labels[i])
            ++num_right;
        else
            ++num_wrong;
        
    }
    cout << "training num_right: " << num_right << endl;
    cout << "training num_wrong: " << num_wrong << endl;
    cout << "training accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;

Davis King's avatar
Davis King committed
311
    predicted_labels = tnet(testing_images);
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
    num_right = 0;
    num_wrong = 0;
    for (size_t i = 0; i < testing_images.size(); ++i)
    {
        if (predicted_labels[i] == testing_labels[i])
            ++num_right;
        else
            ++num_wrong;
        
    }
    cout << "testing num_right: " << num_right << endl;
    cout << "testing num_wrong: " << num_wrong << endl;
    cout << "testing accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;

}
catch(std::exception& e)
{
    cout << e.what() << endl;
}