// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This example shows how to run a CNN based vehicle detector using dlib.  The
    example loads a pretrained model and uses it to find the rear ends of cars in
    an image.  We will also visualize some of the detector's processing steps by
    plotting various intermediate images on the screen.  Viewing these can help
    you understand how the detector works.

    The model used by this example was trained by the dnn_mmod_train_find_cars_ex.cpp
    example.  Also, since this is a CNN, you really should use a GPU to get the
    best execution speed.  For instance, when run on a NVIDIA 1080ti, this detector
    runs at 98fps when run on the provided test image.  That's more than an order
    of magnitude faster than when run on the CPU.

    Users who are just learning about dlib's deep learning API should read
    the dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp examples to learn
    how the API works.  For an introduction to the object detection method you
    should read dnn_mmod_ex.cpp.

    You can also see some videos of this vehicle detector running on YouTube:
        https://www.youtube.com/watch?v=4B3bzmxMAZU
        https://www.youtube.com/watch?v=bP2SUo5vSlc
*/


#include <iostream>
#include <dlib/dnn.h>
#include <dlib/image_io.h>
#include <dlib/gui_widgets.h>
#include <dlib/image_processing.h>

using namespace std;
using namespace dlib;



// The rear view vehicle detector network
// A 5x5 convolution with stride 2 in both dimensions: each con5d layer halves
// the spatial resolution of its input.
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
// A 5x5 convolution with stride 1: preserves spatial resolution.
template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;
// Three stride-2 conv blocks (16, 32, 32 filters), each followed by affine
// batch-norm replacement and ReLU.  Together they downsample the input by 8x.
template <typename SUBNET> using downsampler  = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
// A stride-1 5x5 conv block with 55 filters, plus affine and ReLU.
template <typename SUBNET> using rcon5  = relu<affine<con5<55,SUBNET>>>;
// The full detector: an RGB image pyramid input (6/5 pyramid downsampling rate),
// the 8x downsampler, three rcon5 blocks, a final 9x9 conv producing the
// detection score map, all trained with the MMOD loss.
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;

// ----------------------------------------------------------------------------------------

int main() try
{
    net_type net;
    shape_predictor sp;
    // You can get this file from http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
    // This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
    // As you can see, it also includes a shape_predictor.  To see a generic example of how
    // to train those refer to train_shape_predictor_ex.cpp.
    deserialize("mmod_rear_end_vehicle_detector.dat") >> net >> sp;

    matrix<rgb_pixel> img;
    load_image(img, "../mmod_cars_test_image.jpg");

    image_window win;
    win.set_image(img);

    // Run the detector on the image and show us the output.
    for (auto&& d : net(img))
    {
        // We use a shape_predictor to refine the exact shape and location of the detection
Davis King's avatar
cleanup  
Davis King committed
66
67
68
        // box.  This shape_predictor is trained to simply output the 4 corner points of
        // the box.  So all we do is make a rectangle that tightly contains those 4 points
        // and that rectangle is our refined detection position.
69
70
        auto fd = sp(img,d);
        rectangle rect;
Davis King's avatar
Davis King committed
71
        for (unsigned long j = 0; j < fd.num_parts(); ++j)
72
73
74
75
76
77
78
79
80
81
            rect += fd.part(j);
        win.add_overlay(rect, rgb_pixel(255,0,0));
    }



    cout << "Hit enter to view the intermediate processing steps" << endl;
    cin.get();


Davis King's avatar
cleanup  
Davis King committed
82
    // Now let's look at how the detector works.  The high level processing steps look like:
Davis King's avatar
Davis King committed
83
    //   1. Create an image pyramid and pack the pyramid into one big image.  We call this
Davis King's avatar
Davis King committed
84
    //      image the "tiled pyramid".
Davis King's avatar
Davis King committed
85
86
    //   2. Run the tiled pyramid image through the CNN.  The CNN outputs a new image where
    //      bright pixels in the output image indicate the presence of cars.  
Davis King's avatar
cleanup  
Davis King committed
87
    //   3. Find pixels in the CNN's output image with a value > 0.  Those locations are your
Davis King's avatar
Davis King committed
88
89
90
91
92
    //      preliminary car detections.  
    //   4. Perform non-maximum suppression on the preliminary detections to produce the
    //      final output.
    //
    // We will be plotting the images from steps 1 and 2 so you can visualize what's
Davis King's avatar
cleanup  
Davis King committed
93
    // happening.  For the CNN's output image, we will use the jet colormap so that "bright"
Davis King's avatar
Davis King committed
94
95
96
97
98
99
100
101
102
    // outputs, i.e. pixels with big values, appear in red and "dim" outputs appear as a
    // cold blue color.  To do this we pick a range of CNN output values for the color
    // mapping.  The specific values don't matter.  They are just selected to give a nice
    // looking output image.
    const float lower = -2.5;
    const float upper = 0.0;
    cout << "jet color mapping range:  lower="<< lower << "  upper="<< upper << endl;


103

Davis King's avatar
Davis King committed
104
    // Create a tiled pyramid image and display it on the screen. 
105
106
    std::vector<rectangle> rects;
    matrix<rgb_pixel> tiled_img;
Davis King's avatar
cleanup  
Davis King committed
107
108
109
110
111
112
    // Get the type of pyramid the CNN used
    using pyramid_type = std::remove_reference<decltype(input_layer(net))>::type::pyramid_type;
    // And tell create_tiled_pyramid to create the pyramid using that pyramid type.
    create_tiled_pyramid<pyramid_type>(img, tiled_img, rects, 
                                       input_layer(net).get_pyramid_padding(), 
                                       input_layer(net).get_pyramid_outer_padding());
Davis King's avatar
Davis King committed
113
    image_window winpyr(tiled_img, "Tiled pyramid");
114
115
116



Davis King's avatar
cleanup  
Davis King committed
117
118
119
120
121
122
123
    // This CNN detector represents a sliding window detector with 3 sliding windows.  Each
    // of the 3 windows has a different aspect ratio, allowing it to find vehicles which
    // are either tall and skinny, squarish, or short and wide.  The aspect ratio of a
    // detection is determined by which channel in the output image triggers the detection.
    // Here we are just going to max pool the channels together to get one final image for
    // our display.  In this image, a pixel will be bright if any of the sliding window
    // detectors thinks there is a car at that location.
124
125
126
127
    cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
    matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
    for (long k = 1; k < net.subnet().get_output().k(); ++k)
        network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));
Davis King's avatar
cleanup  
Davis King committed
128
    // We will also upsample the CNN's output image.  The CNN we defined has an 8x
Davis King's avatar
Davis King committed
129
130
131
132
133
134
    // downsampling layer at the beginning. In the code below we are going to overlay this
    // CNN output image on top of the raw input image.  To make that look nice it helps to
    // upsample the CNN output image back to the same resolution as the input image, which
    // we do here.
    const double network_output_scale = img.nc()/(double)network_output.nc();
    resize_image(network_output_scale, network_output);
135
136


Davis King's avatar
Davis King committed
137
    // Display the network's output as a color image.   
138
139
140
    image_window win_output(jet(network_output, upper, lower), "Output tensor from the network");


Davis King's avatar
Davis King committed
141
    // Also, overlay network_output on top of the tiled image pyramid and display it.
Davis King's avatar
cleanup  
Davis King committed
142
    for (long r = 0; r < tiled_img.nr(); ++r)
143
    {
Davis King's avatar
cleanup  
Davis King committed
144
        for (long c = 0; c < tiled_img.nc(); ++c)
145
146
147
        {
            dpoint tmp(c,r);
            tmp = input_tensor_to_output_tensor(net, tmp);
Davis King's avatar
Davis King committed
148
            tmp = point(network_output_scale*tmp);
149
150
151
            if (get_rect(network_output).contains(tmp))
            {
                float val = network_output(tmp.y(),tmp.x());
Davis King's avatar
Davis King committed
152
153
                // alpha blend the network output pixel with the RGB image to make our
                // overlay.
154
155
156
                rgb_alpha_pixel p;
                assign_pixel(p , colormap_jet(val,lower,upper));
                p.alpha = 120;
Davis King's avatar
cleanup  
Davis King committed
157
                assign_pixel(tiled_img(r,c), p);
158
159
160
            }
        }
    }
Davis King's avatar
cleanup  
Davis King committed
161
162
163
164
165
166
    // If you look at this image you can see that the vehicles have bright red blobs on
    // them.  That's the CNN saying "there is a car here!".  You will also notice there is
    // a certain scale at which it finds cars.  They have to be not too big or too small,
    // which is why we have an image pyramid.  The pyramid allows us to find cars of all
    // scales.
    image_window win_pyr_overlay(tiled_img, "Detection scores on image pyramid");
167
168
169
170




Davis King's avatar
Davis King committed
171
172
173
174
    // Finally, we can collapse the pyramid back into the original image.  The CNN doesn't
    // actually do this step, since it's enough to threshold the tiled pyramid image to get
    // the detections.  However, it makes a nice visualization and clearly indicates that
    // the detector is firing for all the cars.
Davis King's avatar
cleanup  
Davis King committed
175
    matrix<float> collapsed(img.nr(), img.nc());
176
177
    resizable_tensor input_tensor;
    input_layer(net).to_tensor(&img, &img+1, input_tensor);
Davis King's avatar
cleanup  
Davis King committed
178
    for (long r = 0; r < collapsed.nr(); ++r)
179
    {
Davis King's avatar
cleanup  
Davis King committed
180
        for (long c = 0; c < collapsed.nc(); ++c)
181
        {
Davis King's avatar
cleanup  
Davis King committed
182
183
184
185
            // Loop over a bunch of scale values and look up what part of network_output
            // corresponds to the point(c,r) in the original image, then take the max
            // detection score over all the scales and save it at pixel point(c,r).
            float max_score = -1e30;
186
187
            for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
            {
Davis King's avatar
Davis King committed
188
                // Map from input image coordinates to tiled pyramid coordinates.
189
                dpoint tmp = center(input_layer(net).image_space_to_tensor_space(input_tensor,scale, drectangle(dpoint(c,r))));
Davis King's avatar
Davis King committed
190
191
192
                // Now map from pyramid coordinates to network_output coordinates.
                tmp = point(network_output_scale*input_tensor_to_output_tensor(net, tmp));

193
194
195
                if (get_rect(network_output).contains(tmp))
                {
                    float val = network_output(tmp.y(),tmp.x());
Davis King's avatar
cleanup  
Davis King committed
196
197
                    if (val > max_score)
                        max_score = val;
198
199
200
                }
            }

Davis King's avatar
cleanup  
Davis King committed
201
            collapsed(r,c) = max_score;
202

Davis King's avatar
cleanup  
Davis King committed
203
            // Also blend the scores into the original input image so we can view it as
204
205
            // an overlay on the cars.
            rgb_alpha_pixel p;
Davis King's avatar
cleanup  
Davis King committed
206
            assign_pixel(p , colormap_jet(max_score,lower,upper));
207
208
209
210
211
            p.alpha = 120;
            assign_pixel(img(r,c), p);
        }
    }

Davis King's avatar
cleanup  
Davis King committed
212
213
    image_window win_collapsed(jet(collapsed, upper, lower), "Collapsed output tensor from the network");
    image_window win_img_and_sal(img, "Collapsed detection scores on raw image");
214
215
216
217
218
219
220
221
222
223
224
225
226


    cout << "Hit enter to end program" << endl;
    cin.get();
}
catch(image_load_error& e)
{
    cout << e.what() << endl;
    cout << "The test image is located in the examples folder.  So you should run this program from a sub folder so that the relative path is correct." << endl;
}
catch(serialization_error& e)
{
    cout << e.what() << endl;
Davis King's avatar
Davis King committed
227
    cout << "The correct model file can be obtained from: http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2   Don't forget to unzip the file." << endl;
228
229
230
231
232
233
234
235
236
}
catch(std::exception& e)
{
    cout << e.what() << endl;
}