// Copyright (C) 2014  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_SIMPLE_ObJECT_DETECTOR_H__
#define DLIB_SIMPLE_ObJECT_DETECTOR_H__

#include "simple_object_detector_abstract.h"
#include "dlib/image_processing/object_detector.h"
#include "dlib/string.h"
#include "dlib/image_processing/scan_fhog_pyramid.h"
#include "dlib/svm/structural_object_detection_trainer.h"
#include "dlib/geometry.h"
#include "dlib/data_io/load_image_dataset.h"
#include "dlib/image_processing/remove_unobtainable_rectangles.h"


namespace dlib
{

// ----------------------------------------------------------------------------------------

    // The detector type produced and consumed by every routine in this file: an
    // object_detector whose image scanner is a scan_fhog_pyramid built on pyramid_down<6>.
    typedef object_detector<scan_fhog_pyramid<pyramid_down<6> > > simple_object_detector;

// ----------------------------------------------------------------------------------------

    // Settings consumed by the training routines in this file.  A default constructed
    // object carries the values listed in the constructor's initializer list.
    struct simple_object_detector_training_options
    {
        simple_object_detector_training_options()
            : be_verbose(false),
              add_left_right_image_flips(false),
              num_threads(4),
              detection_window_size(80*80),
              C(1),
              epsilon(0.01)
        {
        }

        // When true, training prints its settings and progress to std::cout and puts
        // the trainer into verbose mode.
        bool be_verbose;

        // When true, add_image_left_right_flips() is applied to the dataset before training.
        bool add_left_right_image_flips;

        // Forwarded to the trainer's set_num_threads().
        unsigned long num_threads;

        // Target area, in pixels, of the sliding window (see impl::pick_best_window_size()).
        unsigned long detection_window_size;

        // Forwarded to the trainer's set_c().  Training rejects values <= 0.
        double C;

        // Forwarded to the trainer's set_epsilon().  Training rejects values <= 0.
        double epsilon;
    };

// ----------------------------------------------------------------------------------------

    namespace impl
    {
        inline void pick_best_window_size (
            const std::vector<std::vector<rectangle> >& boxes,
            unsigned long& width,
            unsigned long& height,
            const unsigned long target_size
        )
        {
            // find the average width and height
            running_stats<double> avg_width, avg_height;
            for (unsigned long i = 0; i < boxes.size(); ++i)
            {
                for (unsigned long j = 0; j < boxes[i].size(); ++j)
                {
                    avg_width.add(boxes[i][j].width());
                    avg_height.add(boxes[i][j].height());
                }
            }

            // now adjust the box size so that it is about target_pixels pixels in size
            double size = avg_width.mean()*avg_height.mean();
            double scale = std::sqrt(target_size/size);

            width = (unsigned long)(avg_width.mean()*scale+0.5);
            height = (unsigned long)(avg_height.mean()*scale+0.5);
            // make sure the width and height never round to zero.
            if (width == 0)
                width = 1;
            if (height == 0)
                height = 1;
        }

        inline bool contains_any_boxes (
            const std::vector<std::vector<rectangle> >& boxes
        )
        {
            for (unsigned long i = 0; i < boxes.size(); ++i)
            {
                if (boxes[i].size() != 0)
                    return true;
            }
            return false;
        }

        inline void throw_invalid_box_error_message (
            const std::string& dataset_filename,
            const std::vector<std::vector<rectangle> >& removed,
Davis King's avatar
Davis King committed
95
            const simple_object_detector_training_options& options
96
97
98
99
        )
        {

            std::ostringstream sout;
100
101
102
103
            // Note that the 1/16 factor is here because we will try to upsample the image
            // 2 times to accommodate small boxes.  We also take the max because we want to
            // lower bound the size of the smallest recommended box.  This is because the
            // 8x8 HOG cells can't really deal with really small object boxes.
Davis King's avatar
Davis King committed
104
            sout << "Error!  An impossible set of object boxes was given for training. ";
105
            sout << "All the boxes need to have a similar aspect ratio and also not be ";
106
107
            sout << "smaller than about " << std::max<long>(20*20,options.detection_window_size/16) << " pixels in area. ";

108
            std::ostringstream sout2;
109
            if (dataset_filename.size() != 0)
110
            {
111
112
113
114
                sout << "The following images contain invalid boxes:\n";
                image_dataset_metadata::dataset data;
                load_image_dataset_metadata(data, dataset_filename);
                for (unsigned long i = 0; i < removed.size(); ++i)
115
                {
116
117
118
119
120
                    if (removed[i].size() != 0)
                    {
                        const std::string imgname = data.images[i].filename;
                        sout2 << "  " << imgname << "\n";
                    }
121
122
123
124
125
126
127
128
                }
            }
            throw error("\n"+wrap_string(sout.str()) + "\n" + sout2.str());
        }
    }

// ----------------------------------------------------------------------------------------

    template <typename image_array>
130
    inline simple_object_detector train_simple_object_detector_on_images (
131
132
133
134
        const std::string& dataset_filename, // can be "" if it's not applicable
        image_array& images,
        std::vector<std::vector<rectangle> >& boxes,
        std::vector<std::vector<rectangle> >& ignore,
135
        const simple_object_detector_training_options& options 
136
137
    )
    {
138
        if (options.C <= 0)
139
            throw error("Invalid C value given to train_simple_object_detector(), C must be > 0.");
140
141
        if (options.epsilon <= 0)
            throw error("Invalid epsilon value given to train_simple_object_detector(), epsilon must be > 0.");
142

143
144
145
146
        if (images.size() != boxes.size())
            throw error("The list of images must have the same length as the list of boxes.");
        if (images.size() != ignore.size())
            throw error("The list of images must have the same length as the list of ignore boxes.");
147
148

        if (impl::contains_any_boxes(boxes) == false)
149
            throw error("Error, the training dataset does not have any labeled object boxes in it.");
150
151
152
153
154
155
156
157

        typedef scan_fhog_pyramid<pyramid_down<6> > image_scanner_type; 
        image_scanner_type scanner;
        unsigned long width, height;
        impl::pick_best_window_size(boxes, width, height, options.detection_window_size);
        scanner.set_detection_window_size(width, height); 
        structural_object_detection_trainer<image_scanner_type> trainer(scanner);
        trainer.set_num_threads(options.num_threads);  
158
        trainer.set_c(options.C);
159
        trainer.set_epsilon(options.epsilon);
160
161
        if (options.be_verbose)
        {
162
            std::cout << "Training with C: " << options.C << std::endl;
163
            std::cout << "Training with epsilon: " << options.epsilon << std::endl;
164
165
166
167
168
169
170
171
172
173
174
175
176
            std::cout << "Training using " << options.num_threads << " threads."<< std::endl;
            std::cout << "Training with sliding window " << width << " pixels wide by " << height << " pixels tall." << std::endl;
            if (options.add_left_right_image_flips)
                std::cout << "Training on both left and right flipped versions of images." << std::endl;
            trainer.be_verbose();
        }

        unsigned long upsample_amount = 0;

        // now make sure all the boxes are obtainable by the scanner.  We will try and
        // upsample the images at most two times to help make the boxes obtainable.
        std::vector<std::vector<rectangle> > temp(boxes), removed;
        removed = remove_unobtainable_rectangles(trainer, images, temp);
177
        while (impl::contains_any_boxes(removed) && upsample_amount < 2)
178
179
180
        {
            ++upsample_amount;
            if (options.be_verbose)
181
                std::cout << "Upsample images..." << std::endl;
182
183
184
185
186
187
            upsample_image_dataset<pyramid_down<2> >(images, boxes, ignore);
            temp = boxes;
            removed = remove_unobtainable_rectangles(trainer, images, temp);
        }
        // if we weren't able to get all the boxes to match then throw an error 
        if (impl::contains_any_boxes(removed))
Davis King's avatar
Davis King committed
188
            impl::throw_invalid_box_error_message(dataset_filename, removed, options);
189
190
191
192
193
194
195
196

        if (options.add_left_right_image_flips)
            add_image_left_right_flips(images, boxes, ignore);

        simple_object_detector detector = trainer.train(images, boxes, ignore);

        if (options.be_verbose)
        {
197
            std::cout << "Training complete." << std::endl;
198
            std::cout << "Trained with C: " << options.C << std::endl;
199
            std::cout << "Training with epsilon: " << options.epsilon << std::endl;
200
201
202
203
            std::cout << "Trained using " << options.num_threads << " threads."<< std::endl;
            std::cout << "Trained with sliding window " << width << " pixels wide by " << height << " pixels tall." << std::endl;
            if (upsample_amount != 0)
            {
204
205
206
207
                // Unsampled images # time(s) to allow detection of small boxes
                std::cout << "Upsampled images " << upsample_amount;
                std::cout << (upsample_amount == 1) ? " time" : " times";
                std::cout << " to allow detection of small boxes." << std::endl;
208
209
210
211
            }
            if (options.add_left_right_image_flips)
                std::cout << "Trained on both left and right flipped versions of images." << std::endl;
        }
212
213

        return detector;
214
215
    }

216
217
218
219
220
221
222
223
224
225
226
227
// ----------------------------------------------------------------------------------------

    inline void train_simple_object_detector (
        const std::string& dataset_filename,
        const std::string& detector_output_filename,
        const simple_object_detector_training_options& options 
    )
    {
        dlib::array<array2d<rgb_pixel> > images;
        std::vector<std::vector<rectangle> > boxes, ignore;
        ignore = load_image_dataset(images, boxes, dataset_filename);

228
229
230
231
232
233
234
235
236
        simple_object_detector detector = train_simple_object_detector_on_images(dataset_filename, images, boxes, ignore, options);

        std::ofstream fout(detector_output_filename.c_str(), std::ios::binary);
        int version = 1;
        serialize(detector, fout);
        serialize(version, fout);

        if (options.be_verbose)
            std::cout << "Saved detector to file " << detector_output_filename << std::endl;
237
238
    }

239
240
241
242
243
244
245
246
247
// ----------------------------------------------------------------------------------------

    // Result of evaluating a detector on a labeled dataset.  The fields are filled from
    // elements 0, 1, and 2 of the matrix returned by test_object_detection_function().
    // There is no constructor, so the fields are uninitialized until assigned.
    struct simple_test_results
    {
        double precision;
        double recall;
        double average_precision;
    };

    // Evaluates detector on the given images and returns its precision, recall, and
    // average precision as reported by test_object_detection_function().
    //
    // Before testing, the dataset is upsampled upsample_amount times.  images, boxes, and
    // ignore are modified in place by that upsampling.
    template <typename image_array>
    inline const simple_test_results test_simple_object_detector_with_images (
            image_array& images,
            const unsigned int upsample_amount,
            std::vector<std::vector<rectangle> >& boxes,
            std::vector<std::vector<rectangle> >& ignore,
            simple_object_detector& detector
    )
    {
        // The ignore rectangles must be rescaled together with the images and the truth
        // boxes, otherwise they would refer to the coordinates of the original images.
        for (unsigned int i = 0; i < upsample_amount; ++i)
            upsample_image_dataset<pyramid_down<2> >(images, boxes, ignore);

        matrix<double,1,3> res = test_object_detection_function(detector, images, boxes, ignore);
        simple_test_results ret;
        ret.precision = res(0);
        ret.recall = res(1);
        ret.average_precision = res(2);
        return ret;
    }

    // Loads the dataset named by dataset_filename and the detector stored in
    // detector_filename, then evaluates the detector with
    // test_simple_object_detector_with_images() using the given upsample_amount.
    //
    // Throws dlib::error if detector_filename can't be opened or if the version number
    // stored after the detector is not 1.
    inline const simple_test_results test_simple_object_detector (
        const std::string& dataset_filename,
        const std::string& detector_filename,
        const unsigned int upsample_amount
    )
    {
        // Load all the testing images
        dlib::array<array2d<rgb_pixel> > images;
        std::vector<std::vector<rectangle> > boxes, ignore;
        ignore = load_image_dataset(images, boxes, dataset_filename);

        // Load the detector off disk
        simple_object_detector detector;
        int version = 0;
        std::ifstream fin(detector_filename.c_str(), std::ios::binary);
        if (!fin)
            throw error("Unable to open file " + detector_filename);
        // The file holds the detector first and the format version second.
        deserialize(detector, fin);
        deserialize(version, fin);
        if (version != 1)
            throw error("Unknown simple_object_detector format.");

        return test_simple_object_detector_with_images(images, upsample_amount, boxes, ignore, detector);
    }

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_SIMPLE_ObJECT_DETECTOR_H__