// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This is an example illustrating the use of the deep learning tools from the
    dlib C++ Library.  I'm assuming you have already read the dnn_mnist_ex.cpp
    example.  So in this example program I'm going to go over a number of more
    advanced parts of the API, including:
        - Using multiple GPUs
        - Training on large datasets that don't fit in memory 
        - Defining large networks
        - Accessing and configuring layers in a network
*/


#include <dlib/dnn.h>
#include <iostream>
#include <dlib/data_io.h>

using namespace std;
using namespace dlib;

// ----------------------------------------------------------------------------------------

// Let's start by showing how you can conveniently define large networks.  The
// most important tools for doing this are C++'s alias templates.  These let us
// define new layer types that are combinations of a bunch of other layers.
// These will form the building blocks for more complex networks.

// So let's begin by defining the building block of a residual network (see
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// and Sun).  You can see a few things in this statement.  The most obvious is
// that we have combined a bunch of layers into the name "base_res".  You can
// also see the use of the tag1 layer.  This layer doesn't do any computation.
// It exists solely so other layers can refer to it.  In this case, the
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// add it to the input of the add_prev1 layer.  This combination allows us to
// implement skip and residual style networks.  
template <int stride, typename SUBNET> 
using base_res  = relu<add_prev1<bn_con<con<8,3,3,1,1,relu<bn_con<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>;

// Let's also define the same block but with all the batch normalization layers
// replaced with affine transform layers.  We will use this type of construction
// when testing our networks.
template <int stride, typename SUBNET> 
using base_ares = relu<add_prev1<affine<con<8,3,3,1,1,relu<affine<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>;

// And of course we can define more alias templates based on previously defined
// alias templates.  The _down versions downsample the inputs by a factor of 2
// while the res and ares layer types don't.
template <typename SUBNET> using res       = base_res<1,SUBNET>;
template <typename SUBNET> using res_down  = base_res<2,SUBNET>;
template <typename SUBNET> using ares      = base_ares<1,SUBNET>;
template <typename SUBNET> using ares_down = base_ares<2,SUBNET>;



// Now that we have these convenient aliases, we can define a residual network
// without a lot of typing.  Note the use of a repeat layer.  This special layer
// type allows us to type repeat<9,res,SUBNET> instead of
// res<res<res<res<res<res<res<res<res<SUBNET>>>>>>>>>.  It will also prevent
// the compiler from complaining about super deep template nesting when creating
// large networks.
const unsigned long number_of_classes = 10;
using net_type = loss_multiclass_log<fc<number_of_classes,
                            avg_pool<6,6,11,11,
                            res<res<res<res_down<
                            repeat<9,res, // repeat this layer 9 times
                            res_down<
                            res<
                            input<matrix<unsigned char>>
                            >>>>>>>>>>;


// And finally, let's define a residual network building block that uses
// parametric ReLU units instead of regular ReLU.
template <typename SUBNET> 
using pres  = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;

// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    if (argc != 2)
    {
        cout << "This example needs the MNIST dataset to run!" << endl;
        cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl;
        cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl;
        cout << "put them in a folder.  Then give that folder as input to this program." << endl;
        return 1;
    }

    std::vector<matrix<unsigned char>> training_images;
    std::vector<unsigned long> training_labels;
    std::vector<matrix<unsigned char>> testing_images;
    std::vector<unsigned long> testing_labels;
    load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels);


    // dlib uses cuDNN under the covers.  One of the features of cuDNN is the
    // option to use slower methods that use less RAM or faster methods that use
    // a lot of RAM.  If you find that you run out of RAM on your graphics card
    // then you can call this function and we will request the slower but more
    // RAM frugal cuDNN algorithms.
    set_dnn_prefer_smallest_algorithms();


    // Create a network as defined above.  This network will produce 10 outputs
    // because that's how we defined net_type.  However, fc layers can have the
    // number of outputs they produce changed at runtime.  
    net_type net;
    // So if you wanted to use the same network but override the number of
    // outputs at runtime you can do so like this:
    net_type net2(num_fc_outputs(15));

    // Now, let's imagine we wanted to replace some of the relu layers with
    // prelu layers.  We might do it like this:
    using net_type2 = loss_multiclass_log<fc<number_of_classes,
                                avg_pool<6,6,11,11,
                                pres<res<res<res_down< // 2 prelu layers here
                                tag4<repeat<9,pres,    // 9 groups, each containing 2 prelu layers  
                                res_down<
                                res<
                                input<matrix<unsigned char>>
                                >>>>>>>>>>>;

    // prelu layers have a floating point parameter.  If you want to set it to
    // something other than its default value you can do so like this:
    net_type2 pnet(prelu_(0.2),  
                   prelu_(0.25),
                   repeat_group(prelu_(0.3),prelu_(0.4)) // Initialize all the prelu instances in the repeat 
                                                         // layer.  repeat_group() is needed to group the 
                                                         // things that are part of repeat's block.
                   );
    // As you can see, a network will greedily assign things given to its
    // constructor to the layers inside itself.  The assignment is done in the
    // order the layers are defined, but it will skip layers where the
    // assignment doesn't make sense.  

    // Now let's print the details of the pnet to the screen and inspect it.
    cout << "The pnet has " << pnet.num_layers << " layers in it." << endl;
    cout << pnet << endl;
    // These print statements will output this (I've truncated it since it's
    // long, but you get the idea):
    /*
        The pnet has 125 layers in it.
        layer<0>      loss_multiclass_log
        layer<1>      fc       (num_outputs=10)
        layer<2>      avg_pool (nr=6, nc=6, stride_y=11, stride_x=11)
        layer<3>      prelu    (initial_param_value=0.2)
        layer<4>      add_prev
        layer<5>      bn_con
        layer<6>      con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
        layer<7>      prelu    (initial_param_value=0.25)
        layer<8>      bn_con
        layer<9>      con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
        layer<10>     tag1
        ...
        layer<33>     con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2)
        layer<34>     tag1
        layer<35>     tag4
        layer<36>     prelu    (initial_param_value=0.3)
        layer<37>     add_prev
        layer<38>     bn_con
        ...
        layer<114>    con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2)
        layer<115>    tag1
        layer<116>    relu
        layer<117>    add_prev
        layer<118>    bn_con
        layer<119>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
        layer<120>    relu
        layer<121>    bn_con
        layer<122>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
        layer<123>    tag1
        layer<124>    input<matrix>
    */

    // Now that we know the index numbers for each layer, we can access them
    // individually using layer<index>(pnet).  For example, to access the output
    // tensor for the first prelu layer we can say:
    layer<3>(pnet).get_output();
    // Or to print the prelu parameter for layer 7 we can say:
    cout << "prelu param: "<< layer<7>(pnet).layer_details().get_initial_param_value() << endl;

    // We can also access layers by their type.  This next statement finds the
    // first tag1 layer in pnet, and is therefore equivalent to calling
    // layer<10>(pnet):
    layer<tag1>(pnet);
    // The tag layers don't do anything at all and exist simply so you can tag
    // parts of your network and access them by layer<tag>().  You can also
    // index relative to a tag.  So for example, to access the layer immediately
    // after tag4 you can say:
    layer<tag4,1>(pnet); // Equivalent to layer<35+1>(pnet).

    // Or to access the layer 2 layers after tag4:
    layer<tag4,2>(pnet);
    // Tagging is a very useful tool for making complex network structures.  For
    // example, the add_prev1 layer is implemented internally by using a call to
    // layer<tag1>().



    // Ok, that's enough talk about defining and inspecting networks.  Let's
    // talk about training networks!

    // The dnn_trainer will use SGD by default, but you can tell it to use
    // different solvers like adam.  
    dnn_trainer<net_type,adam> trainer(net,adam(0.001));
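    // (The solver type is a template parameter that defaults to SGD, so if plain
    // SGD with its default settings is all you want you could have simply written
    //    dnn_trainer<net_type> trainer(net);
    // instead.)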
    // Also, if you have multiple graphics cards you can tell the trainer to use
    // them together to make the training faster.  For example, replacing the
    // above constructor call with this one would cause it to use GPU cards 0
    // and 1.
    //dnn_trainer<net_type,adam> trainer(net,adam(0.001), {0,1});

    trainer.be_verbose();
    trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100));
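    // The be_verbose() call above makes the trainer print its progress to the
    // console, and the synchronization file makes it save its entire state to
    // disk every 100 seconds.  So if training is interrupted you can rerun the
    // program and it will pick up from the last saved state rather than starting
    // over from scratch.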
    // While the trainer is running it keeps an eye on the training error.  If
    // it looks like the error hasn't decreased for the last 2000 iterations it
    // will automatically shrink the step size by a factor of 0.1.  You can change these
    // default parameters to some other values by calling these functions.  Or
    // disable them entirely by setting the shrink amount to 1.
    trainer.set_iterations_without_progress_threshold(2000);
    trainer.set_step_size_shrink_amount(0.1);


    // Now, what if your training dataset is so big it doesn't fit in RAM?  You
    // make mini-batches yourself, any way you like, and you send them to the
    // trainer by repeatedly calling trainer.train_one_step(). 
    //
    // For example, the loop below streams MNIST data to our trainer.
    std::vector<matrix<unsigned char>> mini_batch_samples;
    std::vector<unsigned long> mini_batch_labels; 
    dlib::rand rnd(time(0));
    // Loop until the trainer's automatic shrinking has shrunk the step size below
    // 1e-3.  For the default shrink amount of 0.1 this means stop after it
    // shrinks the step size 3 times.
    while(trainer.get_step_size() >= 1e-3)
    {
        mini_batch_samples.clear();
        mini_batch_labels.clear();

        // make a 128 image mini-batch
        while(mini_batch_samples.size() < 128)
        {
            auto idx = rnd.get_random_32bit_number()%training_images.size();
            mini_batch_samples.push_back(training_images[idx]);
            mini_batch_labels.push_back(training_labels[idx]);
        }

        trainer.train_one_step(mini_batch_samples, mini_batch_labels);
    }
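
    // (If your dataset had fit in RAM you could instead have just called
    // trainer.train(training_images, training_labels), as in dnn_mnist_ex.cpp,
    // and let the trainer make the mini-batches for you.)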

    // When you call train_one_step(), the trainer will do its processing in a
    // separate thread.  This allows the main thread to work on loading data
    // while the trainer is busy executing the mini-batches in parallel.
    // However, this also means we need to wait for any mini-batches that are
    // still executing to stop before we mess with the net object.  Calling
    // get_net() performs the necessary synchronization.
    trainer.get_net();


    net.clean();
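    // The clean() call above discards the network's transient state (cached layer
    // outputs, gradients, and so on), keeping only the learned parameters so that
    // the file we serialize next is much smaller.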
    serialize("mnist_res_network.dat") << net;


    // Now we have a trained network.  However, it has batch normalization
    // layers in it.  As is customary, we should replace these with simple
    // affine layers before we use the network.  This can be accomplished by
    // making a network type which is identical to net_type but with the batch
    // normalization layers replaced with affine.  For example:
    using test_net_type = loss_multiclass_log<fc<number_of_classes,
                                avg_pool<6,6,11,11,
                                ares<ares<ares<ares_down<
                                repeat<9,ares,
                                ares_down<
                                ares<
                                input<matrix<unsigned char>>
                                >>>>>>>>>>;
    // Then we can simply assign our trained net to our testing net.
    test_net_type tnet = net;
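    // (This assignment works because dlib knows how to convert a trained bn_con
    // layer into the affine layer that performs the same transform at test time,
    // so the learned parameters carry over even though the layer types differ.)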
    // Or if you only had a file with your trained network you could deserialize
    // it directly into your testing network.  
    deserialize("mnist_res_network.dat") >> tnet;


    // And finally, we can run the testing network over our data.

    std::vector<unsigned long> predicted_labels = tnet(training_images);
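    // (Calling tnet like a function runs each image through the network and lets
    // the loss layer convert the raw network outputs into predicted label values.)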
    int num_right = 0;
    int num_wrong = 0;
    for (size_t i = 0; i < training_images.size(); ++i)
    {
        if (predicted_labels[i] == training_labels[i])
            ++num_right;
        else
            ++num_wrong;
        
    }
    cout << "training num_right: " << num_right << endl;
    cout << "training num_wrong: " << num_wrong << endl;
    cout << "training accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;

    predicted_labels = tnet(testing_images);
    num_right = 0;
    num_wrong = 0;
    for (size_t i = 0; i < testing_images.size(); ++i)
    {
        if (predicted_labels[i] == testing_labels[i])
            ++num_right;
        else
            ++num_wrong;
        
    }
    cout << "testing num_right: " << num_right << endl;
    cout << "testing num_wrong: " << num_wrong << endl;
    cout << "testing accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;

}
catch(std::exception& e)
{
    cout << e.what() << endl;
}