// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This example shows how to train a CNN based object detector using dlib's 
    loss_mmod loss layer.  This loss layer implements the Max-Margin Object
    Detection loss as described in the paper:
        Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046).
    This is the same loss used by the popular SVM+HOG object detector in dlib
    (see fhog_object_detector_ex.cpp) except here we replace the HOG features
    with a CNN and train the entire detector end-to-end.  This allows us to make
    much more powerful detectors.

    It would be a good idea to become familiar with dlib's DNN tooling before reading this
    example.  So you should read dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp
    before reading this example program.  You should also read the introductory DNN+MMOD
    example, dnn_mmod_ex.cpp, before proceeding.
    

    This example is essentially a more complex version of dnn_mmod_ex.cpp.  In it we train
    a detector that finds the rear ends of motor vehicles.  I will also discuss some
    aspects of data preparation useful when training this kind of detector.  
    
*/


#include <iostream>
#include <dlib/dnn.h>
#include <dlib/data_io.h>

using namespace std;
using namespace dlib;



// The network: three strided 5x5 conv layers downsample the image 8x, then three more
// 5x5 conv layers process the result, and a final 9x9 conv layer's filters act as the
// sliding window detectors.  The input layer builds an image pyramid so the detector
// can find vehicles at multiple scales.
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;
template <typename SUBNET> using downsampler  = relu<bn_con<con5d<32, relu<bn_con<con5d<32, relu<bn_con<con5d<16,SUBNET>>>>>>>>>;
template <typename SUBNET> using rcon5  = relu<bn_con<con5<55,SUBNET>>>;
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;


// ----------------------------------------------------------------------------------------

int ignore_overlapped_boxes(
    std::vector<mmod_rect>& boxes,
    const test_box_overlap& overlaps
)
/*!
    ensures
        - Whenever two rectangles in boxes overlap, according to overlaps(), we mark the
          smaller box as ignore.
        - returns the number of newly ignored boxes.
!*/
{
    int num_ignored = 0;
    for (size_t i = 0; i < boxes.size(); ++i)
    {
        if (boxes[i].ignore)
            continue;
        for (size_t j = i+1; j < boxes.size(); ++j)
        {
            if (boxes[j].ignore)
                continue;
            if (overlaps(boxes[i], boxes[j]))
            {
                ++num_ignored;
                if (boxes[i].rect.area() < boxes[j].rect.area())
                    boxes[i].ignore = true;
                else
                    boxes[j].ignore = true;
            }
        }
    }
    return num_ignored;
}

// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    if (argc != 2)
    {
        cout << "Give the path to a folder containing training.xml and testing.xml files." << endl;
        cout << "This example program is specifically designed to run on the dlib vehicle " << endl;
        cout << "detection dataset, which is available at this URL: " << endl;
        cout << "   http://dlib.net/files/data/dlib_rear_end_vehicles_v1.tar" << endl;
        cout << endl;
        cout << "So download that dataset, extract it somewhere, and then run this program" << endl;
        cout << "with the dlib_rear_end_vehicles folder as an argument.  E.g. if you extract" << endl;
        cout << "the dataset to the current folder then you should run this example program" << endl;
        cout << "by typing: " << endl;
        cout << "   ./dnn_mmod_train_find_cars_ex dlib_rear_end_vehicles" << endl;
        cout << endl;
        cout << "It takes about a day to finish if run on a high end GPU like a 1080ti." << endl;
        cout << endl;
        return 0;
    }
    const std::string data_directory = argv[1];


    std::vector<matrix<rgb_pixel>> images_train, images_test;
    std::vector<std::vector<mmod_rect>> boxes_train, boxes_test;
    load_image_dataset(images_train, boxes_train, data_directory+"/training.xml");
    load_image_dataset(images_test,  boxes_test,  data_directory+"/testing.xml");

    // When I was creating the dlib vehicle detection dataset I had to label all the cars
    // in each image.  MMOD requires all cars to be labeled, since any unlabeled part of an
    // image is implicitly assumed to be not a car, and the algorithm will use it as
    // negative training data.  So every car must be labeled, either with a normal
    // rectangle or an "ignore" rectangle that tells MMOD to simply ignore it (i.e. neither
    // treat it as a thing to detect nor as negative training data).  
    // 
    // In our present case, many images contain very tiny cars in the distance, ones that
    // are essentially just dark smudges.  It's not reasonable to expect the CNN
    // architecture we defined to detect such vehicles.  However, I erred on the side of
    // having more complete annotations when creating the dataset.  So when I labeled these
    // images I labeled many of these really difficult cases as vehicles to detect.   
    //
    // So the first thing we are going to do is clean up our dataset a little bit.  In
    // particular, we are going to mark boxes smaller than 35*35 pixels as ignore since
    // only really small and blurry cars appear at those sizes.  We will also mark boxes
    // that are heavily overlapped by another box as ignore.  We do this because we want to
    // allow for stronger non-maximum suppression logic in the learned detector, since that
    // will help make it easier to learn a good detector. 
    // 
    // To explain this non-max suppression idea further it's important to understand how
    // the detector works.  Essentially, sliding window detectors scan all image locations
    // and ask "is there a car here?".  If there really is a car in an image then usually
    // many sliding window locations will produce high detection scores, indicating that
    // there is a car at those locations.  If we just stopped there then each car would
    // produce multiple detections.  But that isn't what we want.  We want each car to
    // produce just one detection.  So it's common for detectors to include "non-maximum
    // suppression" logic which simply takes the strongest detection and then deletes all
    // detections "close to" the strongest.  This is a simple post-processing step that can
    // eliminate duplicate detections.  However, we have to define what "close to" means.
    // We can do this by looking at the training data and checking how close the closest
    // target boxes are to each other, and then picking a "close to" measure that doesn't
    // suppress those target boxes but is otherwise as tight as possible.  This is exactly
    // what the mmod_options object does by default.
    //
    // Importantly, this means that if your training dataset contains an image with two
    // target boxes that really overlap a whole lot, then the non-maximum suppression
    // "close to" measure will be configured to allow detections to really overlap a whole
    // lot.  On the other hand, if your dataset didn't contain any overlapped boxes at all,
    // then the non-max suppression logic would be configured to filter out any boxes that
    // overlapped at all, and thus would be performing a much stronger non-max suppression.  
    //
    // Why does this matter?  Well, remember that we want to avoid duplicate detections.
    // If non-max suppression just kills everything in a really wide area around a car then
    // the CNN doesn't really need to learn anything about avoiding duplicate detections.
    // However, if non-max suppression only suppresses a tiny area around each detection
    // then the CNN will need to learn to output small detection scores for those areas of
    // the image not suppressed.  The smaller the non-max suppression region the more the
    // CNN has to learn and the more difficult the learning problem will become.  This is
    // why we remove highly overlapped objects from the training dataset.  That is, we do
    // it so that the non-max suppression logic will be able to be reasonably effective.
    // Below we mark as ignore any box that is almost entirely contained inside another
    // box.  We also mark the smaller box as ignore whenever two boxes have an
    // intersection over union of 0.5 or greater.  This will improve the resulting
    // detector since it will be able to use more aggressive non-max suppression settings.

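    // test_box_overlap(0.50, 0.99) treats two boxes as overlapping if their intersection
    // over union is greater than 0.5, or if more than 99% of one box's area is covered by
    // the other (i.e. one box essentially contains the other).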
    int num_overlapped_ignored_test = 0;
    for (auto& v : boxes_test)
        num_overlapped_ignored_test += ignore_overlapped_boxes(v, test_box_overlap(0.50, 0.99));

    int num_overlapped_ignored = 0;
    int num_additional_ignored = 0;
    for (auto& v : boxes_train)
    {
        num_overlapped_ignored += ignore_overlapped_boxes(v, test_box_overlap(0.50, 0.99));
        for (auto& bb : v)
        {
            if (bb.rect.width() < 35 && bb.rect.height() < 35)
            {
                if (!bb.ignore)
                {
                    bb.ignore = true;
                    ++num_additional_ignored;
                }
            }

            // The dlib vehicle detection dataset doesn't contain any detections with
            // really extreme aspect ratios.  However, some datasets do, often because of
            // bad labeling.  So it's a good idea to check for that and either eliminate
            // those boxes or set them to ignore.  Although, this depends on your
            // application.  
            // 
            // For instance, if your dataset has boxes with an aspect ratio
            // of 10 then you should think about what that means for the network
            // architecture.  Does the receptive field even cover the entirety of the box
            // in those cases?  Do you care about these boxes?  Are they labeling errors?
            // I find that many people will download some dataset from the internet and
            // just take it as given.  They run it through some training algorithm and take
            // the dataset as unchallengeable truth.  But many datasets are full of
            // labeling errors.  There are also a lot of datasets that aren't full of
            // errors, but are annotated in a sloppy and inconsistent way.  Fixing those
            // errors and inconsistencies can often greatly improve models trained from
            // such data.  It's almost always worth the time to try and improve your
            // training dataset.   
            //
            // In any case, my point is that there are other types of dataset cleaning you
            // could put here.  What exactly you need depends on your application.  But you
            // should carefully consider it and not take your dataset as a given.  The work
            // of creating a good detector is largely about creating a high quality
            // training dataset.  
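            //
            // As a hedged illustration of that kind of check (the dlib dataset doesn't
            // need it, and the 4:1 aspect ratio cutoff below is just an assumed value you
            // would tune for your own data):
            const double aspect_ratio = bb.rect.width()/(double)bb.rect.height();
            if (!bb.ignore && (aspect_ratio > 4 || aspect_ratio < 0.25))  // assumed cutoff
            {
                bb.ignore = true;
                ++num_additional_ignored;
            }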
        }
    }

    // When modifying a dataset like this, it's a really good idea to print out a log of
    // how many boxes you ignored.  It's easy to accidentally ignore a huge block of data,
    // so you should always look and see that things are doing what you expect.
    cout << "num_overlapped_ignored: "<< num_overlapped_ignored << endl;
    cout << "num_additional_ignored: "<< num_additional_ignored << endl;
    cout << "num_overlapped_ignored_test: "<< num_overlapped_ignored_test << endl;


    cout << "num training images: " << images_train.size() << endl;
    cout << "num testing images: " << images_test.size() << endl;


    // Our vehicle detection dataset has basically 3 different types of boxes: square
    // boxes, tall and skinny boxes (e.g. semi trucks), and short and wide boxes (e.g.
    // sedans).  Here we are telling the MMOD algorithm that a vehicle is recognizable as
    // long as the longest box side is at least 70 pixels long and the shortest box side is
    // at least 30 pixels long.  It will use these parameters to decide how large each of
    // the sliding windows needs to be in order to detect all the vehicles.  Since our
    // dataset has basically these 3 different aspect ratios, it will decide to use 3
    // different sliding windows.  This means the final con layer in the network will have
    // 3 filters, one for each of these aspect ratios. 
    mmod_options options(boxes_train, 70, 30);
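
    // It's worth printing what mmod_options decided so you can sanity check it.  This is
    // just a diagnostic sketch; detector_windows and overlaps_nms are public members of
    // mmod_options.
    cout << "num detector windows: " << options.detector_windows.size() << endl;
    for (auto& w : options.detector_windows)
        cout << "detector window width by height: " << w.width << " x " << w.height << endl;
    cout << "overlap NMS IOU thresh:             " << options.overlaps_nms.get_iou_thresh() << endl;
    cout << "overlap NMS percent covered thresh: " << options.overlaps_nms.get_percent_covered_thresh() << endl;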

    // This setting is very important and dataset specific.  The vehicle detection dataset
    // contains boxes that are marked as "ignore", as we discussed above.  Some of them are
    // ignored because we set ignore to true on them in the above code.  However, the xml
    // files already contained a lot of ignore boxes.  Some of them are large boxes that
    // encompass large parts of an image and the intention is to have everything inside
    // those boxes be ignored.  Therefore, we need to tell the MMOD algorithm to do that,
    // which we do by setting options.overlaps_ignore appropriately.  
    // 
    // But first, we need to understand exactly what this option does.  The MMOD loss
    // is essentially counting the number of false alarms + missed detections, produced by
    // the detector, for each image.  During training, the code is running the detector on
    // each image in a mini-batch and looking at its output and counting the number of
    // mistakes.  The optimizer tries to find parameter settings that minimize the number
    // of detector mistakes.
    // 
    // This overlaps_ignore option allows you to tell the loss that some outputs from the
    // detector should be totally ignored, as if they never happened.  In particular, if a
    // detection overlaps a box in the training data with ignore==true then that detection
    // is ignored.  This overlap is determined by calling
    // options.overlaps_ignore(the_detection, the_ignored_training_box).  If it returns
    // true then that detection is ignored.
    // 
    // You should read the documentation for test_box_overlap, the class type of
    // overlaps_ignore, for full details.  However, the gist is that the default behavior
    // is to only consider boxes as overlapping if their intersection over union is > 0.5.
    // However, the dlib vehicle detection dataset contains large boxes that are meant to
    // mask out large areas of an image.  So intersection over union isn't an appropriate
    // way to measure "overlaps with box" in this case.  We want any box that is contained
    // inside one of these big regions to be ignored, even if the detection box is really
    // small.  So we set overlaps_ignore to behave that way with this line.
    options.overlaps_ignore = test_box_overlap(0.5, 0.95);

    net_type net(options);
    // The final layer of the network must be a con_ layer that contains 
    // options.detector_windows.size() filters.  This is because these final filters are
    // what perform the final "sliding window" detection in the network.  For the dlib
    // vehicle dataset, there will be 3 sliding window detectors, so we will be setting
    // num_filters to 3 here.
    net.subnet().layer_details().set_num_filters(options.detector_windows.size());

    dnn_trainer<net_type> trainer(net,sgd(0.0001,0.9));
    trainer.set_learning_rate(0.1);
    trainer.be_verbose();

    // While training, we are going to use early stopping.  That is, we will be checking
    // how good the detector is performing on our test data and when it stops getting
    // better on the test data we will drop the learning rate.  We will keep doing that
    // until the learning rate is less than 1e-4.  These two settings tell the trainer to
    // do that.  Essentially, we are setting the regular progress threshold effectively to
    // infinity, so only the test iterations without progress threshold will matter.  In
    // particular, it says that once we observe 1000 testing mini-batches where the test
    // loss clearly isn't decreasing we will lower the learning rate.
    trainer.set_iterations_without_progress_threshold(1000000);
    trainer.set_test_iterations_without_progress_threshold(1000);

    const string sync_filename = "mmod_cars_sync";
    trainer.set_synchronization_file(sync_filename, std::chrono::minutes(5));
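    // The sync file is written every 5 minutes.  If you kill this program and restart it,
    // the trainer will reload the sync file and resume from where it left off rather than
    // starting training over from scratch.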




    std::vector<matrix<rgb_pixel>> mini_batch_samples;
    std::vector<std::vector<mmod_rect>> mini_batch_labels; 
    random_cropper cropper;
    cropper.set_seed(1);
    // Train on random 350x350 crops of the full images.
    cropper.set_chip_dims(350, 350);
    cropper.set_min_object_size(0.20); 
    // Apply small random rotations as a simple form of data augmentation.
    cropper.set_max_rotation_degrees(2);
    dlib::rand rnd;

    // Log the training parameters to the console
    cout << trainer << cropper << endl;

    int cnt = 1;
    // Run the trainer until the learning rate gets small.  
    while(trainer.get_learning_rate() >= 1e-4)
    {
        // Every 30 mini-batches we do a testing mini-batch.  
        if (cnt%30 != 0 || images_test.size() == 0)
        {
            cropper(87, images_train, boxes_train, mini_batch_samples, mini_batch_labels);
            // We can also randomly jitter the colors and that often helps a detector
            // generalize better to new images.
            for (auto&& img : mini_batch_samples)
                disturb_colors(img, rnd);

            // It's a good idea to, at least once, put code here that displays the images
            // and boxes the random cropper is generating.  You should look at them and
            // think about if the output makes sense for your problem.  Most of the time
            // it will be fine, but sometimes you will realize that the pattern of cropping
            // isn't really appropriate for your problem and you will need to make some
            // change to how the mini-batches are being generated.  Maybe you will tweak
            // some of the cropper's settings, or write your own entirely separate code to
            // create mini-batches.  But either way, if you don't look you will never know.
            // An easy way to do this is to create a dlib::image_window to display the
            // images and boxes.
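            //
            // A minimal sketch of that kind of viewer (it needs #include
            // <dlib/gui_widgets.h> and is left commented out since it pauses training
            // while waiting for key presses):
            //
            //   image_window win;
            //   for (size_t i = 0; i < mini_batch_samples.size(); ++i)
            //   {
            //       win.clear_overlay();
            //       win.set_image(mini_batch_samples[i]);
            //       for (auto& b : mini_batch_labels[i])
            //           win.add_overlay(b.rect, b.ignore ? rgb_pixel(255,255,0) : rgb_pixel(255,0,0));
            //       cout << "Hit enter to see the next crop." << endl;
            //       cin.get();
            //   }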

            trainer.train_one_step(mini_batch_samples, mini_batch_labels);
        }
        else
        {
            cropper(87, images_test, boxes_test, mini_batch_samples, mini_batch_labels);
            // We can also randomly jitter the colors and that often helps a detector
            // generalize better to new images.
            for (auto&& img : mini_batch_samples)
                disturb_colors(img, rnd);

            trainer.test_one_step(mini_batch_samples, mini_batch_labels);
        }
        ++cnt;
    }
    // wait for training threads to stop
    trainer.get_net();
    cout << "done training" << endl;

    // Save the network to disk.  Calling clean() first frees scratch buffers the network
    // used during training, which makes the serialized file smaller.
    net.clean();
    serialize("mmod_rear_end_vehicle_detector.dat") << net;
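
    // A minimal sketch of loading and running the saved detector later (see
    // dnn_mmod_find_cars_ex.cpp for a complete example).  It's commented out since this
    // program goes on to evaluate the net that is already in memory:
    //
    //   net_type net2;
    //   deserialize("mmod_rear_end_vehicle_detector.dat") >> net2;
    //   // Running the network on an image returns one mmod_rect per detected vehicle.
    //   for (auto&& d : net2(images_test[0]))
    //       cout << d.rect << " detection confidence: " << d.detection_confidence << endl;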


    // It's a really good idea to print the training parameters.  This is because you will
    // invariably be running multiple rounds of training and should be logging the output
    // to a log file.  This print statement will include many of the training parameters in
    // your log.
    cout << trainer << cropper << endl;

    cout << "\nsync_filename: " << sync_filename << endl;
    cout << "num training images: "<< images_train.size() << endl;
    cout << "training results: " << test_object_detection_function(net, images_train, boxes_train, test_box_overlap(), 0, options.overlaps_ignore);
    upsample_image_dataset<pyramid_down<2>>(images_train, boxes_train, 1800*1800);
    cout << "training upsampled results: " << test_object_detection_function(net, images_train, boxes_train, test_box_overlap(), 0, options.overlaps_ignore);


    cout << "num testing images: "<< images_test.size() << endl;
    cout << "testing results: " << test_object_detection_function(net, images_test, boxes_test, test_box_overlap(), 0, options.overlaps_ignore);
    upsample_image_dataset<pyramid_down<2>>(images_test, boxes_test, 1800*1800);
    cout << "testing upsampled results: " << test_object_detection_function(net, images_test, boxes_test, test_box_overlap(), 0, options.overlaps_ignore);

    /*
        This program takes many hours to execute on a high end GPU.  It took about a day to
        train on an NVIDIA 1080ti.  The resulting model file is available at
        http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
        It should be noted that this file on dlib.net has a dlib::shape_predictor appended
        onto the end of it (see dnn_mmod_find_cars_ex.cpp for an example of its use).  This
        explains why the model file on dlib.net is larger than the
        mmod_rear_end_vehicle_detector.dat output by this program.

        Also, the training and testing accuracies were:

        num training images: 2217
        training results: 0.990738 0.736431 0.736073 
        training upsampled results: 0.986837 0.937694 0.936912 
        num testing images: 135
        testing results: 0.988827 0.471372 0.470806 
        testing upsampled results: 0.987879 0.651132 0.650399 
    */

    return 0;

}
catch(std::exception& e)
{
    cout << e.what() << endl;
}