shape_predictor.cpp 17.8 KB
Newer Older
1
2
3
// Copyright (C) 2014  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.

Davis King's avatar
Davis King committed
4
#include "opaque_types.h"
5
6
7
8
9
10
11
12
#include <dlib/python.h>
#include <dlib/geometry.h>
#include <dlib/image_processing.h>
#include "shape_predictor.h"
#include "conversion.h"

using namespace dlib;
using namespace std;
13
14

namespace py = pybind11;
15
16
17
18
19

// ----------------------------------------------------------------------------------------

// Runs the given shape predictor on one detection box inside a numpy image
// and returns the resulting full_object_detection.
//
// The image must be a numpy array holding either 8-bit grayscale or RGB
// pixels; any other pixel layout raises a dlib::error (surfaced to Python
// as RuntimeError).
full_object_detection run_predictor (
        shape_predictor& predictor,
        py::array img,
        const rectangle& box
)
{
    // Dispatch on the numpy array's pixel type: grayscale first, then RGB.
    if (is_image<unsigned char>(img))
        return predictor(numpy_image<unsigned char>(img), box);

    if (is_image<rgb_pixel>(img))
        return predictor(numpy_image<rgb_pixel>(img), box);

    throw dlib::error("Unsupported image type, must be 8bit gray or RGB image.");
}

38
39
40
41
42
43
void save_shape_predictor(const shape_predictor& predictor, const std::string& predictor_output_filename)
{
    std::ofstream fout(predictor_output_filename.c_str(), std::ios::binary);
    serialize(predictor, fout);
}

44
45
46
47
48
49
50
51
52
53
// ----------------------------------------------------------------------------------------

// Python property accessor: the detection's bounding rectangle.
rectangle full_obj_det_get_rect (const full_object_detection& detection)
{
    return detection.get_rect();
}

// Python property accessor: how many landmark parts the detection holds.
unsigned long full_obj_det_num_parts (const full_object_detection& detection)
{
    return detection.num_parts();
}

// Returns the idx-th landmark of the detection as a dlib point.
//
// An out-of-range index is reported to Python as an IndexError rather than
// tripping dlib's internal bounds assertion.
point full_obj_det_part (const full_object_detection& detection, const unsigned long idx)
{
    const bool index_valid = idx < detection.num_parts();
    if (!index_valid)
    {
        PyErr_SetString(PyExc_IndexError, "Index out of range");
        throw py::error_already_set();
    }
    return detection.part(idx);
}

std::vector<point> full_obj_det_parts (const full_object_detection& detection)
{
    const unsigned long num_parts = detection.num_parts();
    std::vector<point> parts(num_parts);
    for (unsigned long j = 0; j < num_parts; ++j)
        parts[j] = detection.part(j);
    return parts;
}

71
// Python-side constructor for full_object_detection.
//
// pyparts_ may be either a Python list of dlib.point objects or a
// dlib.points object (bound as std::vector<point>).  The list form is tried
// first; on a cast failure we fall back to the vector form.
//
// Fixes: removed the unused local `num_parts` (computed from py::len but
// never read) and reserve the vector up front to avoid repeated
// reallocation while converting the list.
std::shared_ptr<full_object_detection> full_obj_det_init(const rectangle& rect, const py::object& pyparts_)
{
    try 
    {
        auto&& pyparts = pyparts_.cast<py::list>();

        std::vector<point> parts;
        parts.reserve(py::len(pyparts));
        for (const auto& item : pyparts)
            parts.push_back(item.cast<point>());

        return std::make_shared<full_object_detection>(rect, parts);
    }
    catch (py::cast_error&)
    {
        // if it's not a py::list it better be a vector<point>.
        auto&& parts = pyparts_.cast<const std::vector<point>&>();
        return std::make_shared<full_object_detection>(rect, parts);
    }
}

// ----------------------------------------------------------------------------------------

94
// Python-facing trainer: validates the input lists, converts them into dlib
// containers, and runs the shape predictor trainer on them.
//
// pyimages is a list of numpy images; pydetections is a parallel list of
// lists of full_object_detection.  Throws dlib::error if the lists have
// different lengths.
inline shape_predictor train_shape_predictor_on_images_py (
        const py::list& pyimages,
        const py::list& pydetections,
        const shape_predictor_training_options& options
)
{
    const unsigned long num_images = py::len(pyimages);
    if (py::len(pydetections) != num_images)
        throw dlib::error("The length of the detections list must match the length of the images list.");

    dlib::array<numpy_image<unsigned char>> images(num_images);
    std::vector<std::vector<full_object_detection> > detections(num_images);
    images_and_nested_params_to_dlib(pyimages, pydetections, images, detections);

    return train_shape_predictor_on_images(images, detections, options);
}


inline double test_shape_predictor_with_images_py (
113
114
115
        const py::list& pyimages,
        const py::list& pydetections,
        const py::list& pyscales,
116
        const shape_predictor& predictor
117
118
)
{
119
120
121
    const unsigned long num_images = py::len(pyimages);
    const unsigned long num_scales = py::len(pyscales);
    if (num_images != py::len(pydetections))
122
123
124
125
126
127
128
129
130
        throw dlib::error("The length of the detections list must match the length of the images list.");

    if (num_scales > 0 && num_scales != num_images)
        throw dlib::error("The length of the scales list must match the length of the detections list.");

    std::vector<std::vector<full_object_detection> > detections(num_images);
    std::vector<std::vector<double> > scales;
    if (num_scales > 0)
        scales.resize(num_scales);
131
    dlib::array<numpy_image<unsigned char>> images(num_images);
132

133
    // Now copy the data into dlib based objects so we can call the testing routine.
134
135
    for (unsigned long i = 0; i < num_images; ++i)
    {
136
137
138
139
140
        const unsigned long num_boxes = py::len(pydetections[i]);
        for (py::iterator det_it = pydetections[i].begin();
             det_it != pydetections[i].end();
             ++det_it)
          detections[i].push_back(det_it->cast<full_object_detection>());
141

142
        assign_image(images[i], pyimages[i].cast<py::array>());
143
144
        if (num_scales > 0)
        {
145
            if (num_boxes != py::len(pyscales[i]))
146
                throw dlib::error("The length of the scales list must match the length of the detections list.");
147
            for (py::iterator scale_it = pyscales[i].begin(); scale_it != pyscales[i].end(); ++scale_it)
148
                scales[i].push_back(scale_it->cast<double>());
149
150
151
        }
    }

152
    return test_shape_predictor_with_images(images, detections, scales, predictor);
153
154
155
}

inline double test_shape_predictor_with_images_no_scales_py (
156
157
        const py::list& pyimages,
        const py::list& pydetections,
158
        const shape_predictor& predictor
159
160
)
{
161
    py::list pyscales;
162
    return test_shape_predictor_with_images_py(pyimages, pydetections, pyscales, predictor);
163
164
165
166
}

// ----------------------------------------------------------------------------------------

167
void bind_shape_predictors(py::module &m)
168
169
170
{
    {
    typedef full_object_detection type;
171
    py::class_<type, std::shared_ptr<type>>(m, "full_object_detection",
172
173
    "This object represents the location of an object in an image along with the \
    positions of each of its constituent parts.")
Davis King's avatar
Davis King committed
174
        .def(py::init(&full_obj_det_init), py::arg("rect"), py::arg("parts"),
175
176
"requires \n\
    - rect: dlib rectangle \n\
177
    - parts: list of dlib.point, or a dlib.points object.")
178
179
180
        .def_property_readonly("rect", &full_obj_det_get_rect, "Bounding box from the underlying detector. Parts can be outside box if appropriate.")
        .def_property_readonly("num_parts", &full_obj_det_num_parts, "The number of parts of the object.")
        .def("part", &full_obj_det_part, py::arg("idx"), "A single part of the object as a dlib point.")
181
        .def("parts", &full_obj_det_parts, "A vector of dlib points representing all of the parts.")
182
        .def(py::pickle(&getstate<type>, &setstate<type>));
183
184
185
    }
    {
    typedef shape_predictor_training_options type;
186
    py::class_<type>(m, "shape_predictor_training_options",
187
        "This object is a container for the options to the train_shape_predictor() routine.")
188
189
        .def(py::init())
        .def_readwrite("be_verbose", &type::be_verbose,
190
                      "If true, train_shape_predictor() will print out a lot of information to stdout while training.")
191
        .def_readwrite("cascade_depth", &type::cascade_depth,
192
                      "The number of cascades created to train the model with.")
193
        .def_readwrite("tree_depth", &type::tree_depth,
194
                      "The depth of the trees used in each cascade. There are pow(2, get_tree_depth()) leaves in each tree")
195
        .def_readwrite("num_trees_per_cascade_level", &type::num_trees_per_cascade_level,
196
                      "The number of trees created for each cascade.")
197
        .def_readwrite("nu", &type::nu,
198
199
                      "The regularization parameter.  Larger values of this parameter \
                       will cause the algorithm to fit the training data better but may also \
200
                       cause overfitting.  The value must be in the range (0, 1].")
201
        .def_readwrite("oversampling_amount", &type::oversampling_amount,
202
                      "The number of randomly selected initial starting points sampled for each training example")
203
204
        .def_readwrite("oversampling_translation_jitter", &type::oversampling_translation_jitter,
                      "The amount of translation jittering to apply to bounding boxes, a good value is in in the range [0 0.5].")
205
        .def_readwrite("feature_pool_size", &type::feature_pool_size,
206
                      "Number of pixels used to generate features for the random trees.")
207
        .def_readwrite("lambda_param", &type::lambda_param,
208
                      "Controls how tight the feature sampling should be. Lower values enforce closer features.")
209
        .def_readwrite("num_test_splits", &type::num_test_splits,
210
                      "Number of split features at each node to sample. The one that gives the best split is chosen.")
211
212
        .def_readwrite("landmark_relative_padding_mode", &type::landmark_relative_padding_mode,
                      "If True then features are drawn only from the box around the landmarks, otherwise they come from the bounding box and landmarks together.  See feature_pool_region_padding doc for more details.")
213
        .def_readwrite("feature_pool_region_padding", &type::feature_pool_region_padding,
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
            /*!
                  This algorithm works by comparing the relative intensity of pairs of
                  pixels in the input image.  To decide which pixels to look at, the
                  training algorithm randomly selects pixels from a box roughly centered
                  around the object of interest.  We call this box the feature pool region
                  box.  
                  
                  Each object of interest is defined by a full_object_detection, which
                  contains a bounding box and a list of landmarks.  If
                  landmark_relative_padding_mode==True then the feature pool region box is
                  the tightest box that contains the landmarks inside the
                  full_object_detection.  In this mode the full_object_detection's bounding
                  box is ignored.  Otherwise, if the padding mode is bounding_box_relative
                  then the feature pool region box is the tightest box that contains BOTH
                  the landmarks and the full_object_detection's bounding box.

                  Additionally, you can adjust the size of the feature pool padding region
                  by setting feature_pool_region_padding to some value.  If
                  feature_pool_region_padding then the feature pool region box is
                  unmodified and defined exactly as stated above. However, you can expand
                  the size of the box by setting the padding > 0 or shrink it by setting it
                  to something < 0.

                  To explain this precisely, for a padding of 0 we say that the pixels are
                  sampled from a box of size 1x1.  The padding value is added to each side
                  of the box.  So a padding of 0.5 would cause the algorithm to sample
                  pixels from a box that was 2x2, effectively multiplying the area pixels
                  are sampled from by 4.  Similarly, setting the padding to -0.2 would
                  cause it to sample from a box 0.6x0.6 in size.
            !*/
                      "Size of region within which to sample features for the feature pool. \
                      positive values increase the sampling region while negative values decrease it. E.g. padding of 0 means we \
                      sample fr")
247
        .def_readwrite("random_seed", &type::random_seed,
248
                      "The random seed used by the internal random number generator")
249
250
        .def_readwrite("num_threads", &type::num_threads,
                        "Use this many threads/CPU cores for training.")
251
        .def("__str__", &::print_shape_predictor_training_options)
Davis King's avatar
Davis King committed
252
        .def("__repr__", &::print_shape_predictor_training_options)
253
        .def(py::pickle(&getstate<type>, &setstate<type>));
254
255
256
    }
    {
    typedef shape_predictor type;
257
    py::class_<type, std::shared_ptr<type>>(m, "shape_predictor",
258
259
260
261
262
"This object is a tool that takes in an image region containing some object and \
outputs a set of point locations that define the pose of the object. The classic \
example of this is human face pose prediction, where you take an image of a human \
face as input and are expected to identify the locations of important facial \
landmarks such as the corners of the mouth and eyes, tip of the nose, and so forth.")
263
264
        .def(py::init())
        .def(py::init(&load_object_from_file<type>),
265
266
"Loads a shape_predictor from a file that contains the output of the \n\
train_shape_predictor() routine.")
267
        .def("__call__", &run_predictor, py::arg("image"), py::arg("box"),
268
269
270
271
272
273
"requires \n\
    - image is a numpy ndarray containing either an 8bit grayscale or RGB \n\
      image. \n\
    - box is the bounding box to begin the shape prediction inside. \n\
ensures \n\
    - This function runs the shape predictor on the input image and returns \n\
274
      a single full_object_detection.")
275
276
        .def("save", save_shape_predictor, py::arg("predictor_output_filename"), "Save a shape_predictor to the provided path.")
        .def(py::pickle(&getstate<type>, &setstate<type>));
277
278
    }
    {
279
280
    m.def("train_shape_predictor", train_shape_predictor_on_images_py,
        py::arg("images"), py::arg("object_detections"), py::arg("options"),
281
"requires \n\
282
    - options.lambda_param > 0 \n\
283
    - 0 < options.nu <= 1 \n\
284
285
286
287
288
289
    - options.feature_pool_region_padding >= 0 \n\
    - len(images) == len(object_detections) \n\
    - images should be a list of numpy matrices that represent images, either RGB or grayscale. \n\
    - object_detections should be a list of lists of dlib.full_object_detection objects. \
      Each dlib.full_object_detection contains the bounding box and the lists of points that make up the object parts.\n\
ensures \n\
290
291
    - Uses dlib's shape_predictor_trainer object to train a \n\
      shape_predictor based on the provided labeled images, full_object_detections, and options.\n\
292
    - The trained shape_predictor is returned");
293

294
295
    m.def("train_shape_predictor", train_shape_predictor,
        py::arg("dataset_filename"), py::arg("predictor_output_filename"), py::arg("options"),
296
"requires \n\
297
    - options.lambda_param > 0 \n\
298
    - 0 < options.nu <= 1 \n\
299
300
    - options.feature_pool_region_padding >= 0 \n\
ensures \n\
301
    - Uses dlib's shape_predictor_trainer to train a \n\
302
      shape_predictor based on the labeled images in the XML file \n\
303
      dataset_filename and the provided options.  This function assumes the file dataset_filename is in the \n\
304
305
306
      XML format produced by dlib's save_image_dataset_metadata() routine. \n\
    - The trained shape predictor is serialized to the file predictor_output_filename.");

307
308
    m.def("test_shape_predictor", test_shape_predictor_py,
        py::arg("dataset_filename"), py::arg("predictor_filename"),
309
310
311
312
313
314
315
316
317
318
319
320
"ensures \n\
    - Loads an image dataset from dataset_filename.  We assume dataset_filename is \n\
      a file using the XML format written by save_image_dataset_metadata(). \n\
    - Loads a shape_predictor from the file predictor_filename.  This means \n\
      predictor_filename should be a file produced by the train_shape_predictor() \n\
      routine. \n\
    - This function tests the predictor against the dataset and returns the \n\
      mean average error of the detector.  In fact, The \n\
      return value of this function is identical to that of dlib's \n\
      shape_predictor_trainer() routine.  Therefore, see the documentation \n\
      for shape_predictor_trainer() for a detailed definition of the mean average error.");

321
322
    m.def("test_shape_predictor", test_shape_predictor_with_images_no_scales_py,
            py::arg("images"), py::arg("detections"), py::arg("shape_predictor"),
323
324
325
326
327
328
"requires \n\
    - len(images) == len(object_detections) \n\
    - images should be a list of numpy matrices that represent images, either RGB or grayscale. \n\
    - object_detections should be a list of lists of dlib.full_object_detection objects. \
      Each dlib.full_object_detection contains the bounding box and the lists of points that make up the object parts.\n\
 ensures \n\
329
    - shape_predictor should be a file produced by the train_shape_predictor()  \n\
330
331
332
333
334
335
336
337
      routine. \n\
    - This function tests the predictor against the dataset and returns the \n\
      mean average error of the detector.  In fact, The \n\
      return value of this function is identical to that of dlib's \n\
      shape_predictor_trainer() routine.  Therefore, see the documentation \n\
      for shape_predictor_trainer() for a detailed definition of the mean average error.");


338
339
    m.def("test_shape_predictor", test_shape_predictor_with_images_py,
            py::arg("images"), py::arg("detections"), py::arg("scales"), py::arg("shape_predictor"),
340
341
342
343
344
345
346
347
348
349
"requires \n\
    - len(images) == len(object_detections) \n\
    - len(object_detections) == len(scales) \n\
    - for every sublist in object_detections: len(object_detections[i]) == len(scales[i]) \n\
    - scales is a list of floating point scales that each predicted part location \
      should be divided by. Useful for normalization. \n\
    - images should be a list of numpy matrices that represent images, either RGB or grayscale. \n\
    - object_detections should be a list of lists of dlib.full_object_detection objects. \
      Each dlib.full_object_detection contains the bounding box and the lists of points that make up the object parts.\n\
 ensures \n\
350
    - shape_predictor should be a file produced by the train_shape_predictor()  \n\
351
352
353
354
355
356
357
358
      routine. \n\
    - This function tests the predictor against the dataset and returns the \n\
      mean average error of the detector.  In fact, The \n\
      return value of this function is identical to that of dlib's \n\
      shape_predictor_trainer() routine.  Therefore, see the documentation \n\
      for shape_predictor_trainer() for a detailed definition of the mean average error.");
    }
}