YOLO loss (#2376)

16500906 · Adrià Arrufat · GitHub · 951fdd00 · 16500906 · 16500906
Unverified Commit 16500906 authored Jul 30, 2021 by Adrià Arrufat Committed by GitHub Jul 29, 2021
17 changed files
--- a/dlib/data_io/image_dataset_metadata.cpp
+++ b/dlib/data_io/image_dataset_metadata.cpp
@@ -60,7 +60,13 @@ namespace dlib
            fout << "<images>\n";
            for (unsigned long i = 0; i < images.size(); ++i)
            {
-                fout << "  <image file='" << images[i].filename << "'>\n";
+                fout << "  <image file='" << images[i].filename << "'";
+                if (images[i].width != 0 && images[i].height != 0)
+                {
+                    fout << " width='" << images[i].width << "'";
+                    fout << " height='" << images[i].height << "'";
+                }
+                fout << ">\n";
                // save all the boxes
                for (unsigned long j = 0; j < images[i].boxes.size(); ++j)
@@ -251,6 +257,9 @@ namespace dlib
                        if (atts.is_in_list("file")) temp_image.filename = atts["file"];
                        else throw dlib::error("<image> missing required attribute 'file'");
+                        if (atts.is_in_list("width")) temp_image.width = sa = atts["width"];
+                        if (atts.is_in_list("height")) temp_image.height = sa = atts["height"];
                    }
                    ts.push_back(name);

--- a/dlib/data_io/image_dataset_metadata.h
+++ b/dlib/data_io/image_dataset_metadata.h
@@ -101,7 +101,7 @@ namespace dlib
        {
            /*!
                WHAT THIS OBJECT REPRESENTS
-                    This object represents an annotated image.   
+                    This object represents an annotated image.
            !*/
            image() {}
@@ -109,6 +109,8 @@ namespace dlib
            std::string filename;
            std::vector<box> boxes;
+            long width = 0;
+            long height = 0;
        };
    // ------------------------------------------------------------------------------------

--- a/dlib/dnn/loss.h
+++ b/dlib/dnn/loss.h
--- a/dlib/dnn/loss_abstract.h
+++ b/dlib/dnn/loss_abstract.h
@@ -1852,9 +1852,192 @@ namespace dlib
    template <typename SUBNET>
    using loss_dot = add_loss_layer<loss_dot_, SUBNET>;
+// ----------------------------------------------------------------------------------------
+    struct yolo_options
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object contains all the parameters that control the behavior of loss_yolo_.
+        !*/
+    public:
+        struct anchor_box_details
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    The width and height of a single anchor box (prior).  A default
+                    constructed object has width == 0 and height == 0.
+                    NOTE(review): the units are presumably pixels of the network
+                    input -- confirm against the implementation in loss.h.
+            !*/
+            anchor_box_details() = default;
+            anchor_box_details(unsigned long w, unsigned long h) : width(w), height(h) {}
+            unsigned long width = 0;
+            unsigned long height = 0;
+            friend inline void serialize(const anchor_box_details& item, std::ostream& out);
+            friend inline void deserialize(anchor_box_details& item, std::istream& in);
+        };
+        yolo_options() = default;
+        // This kind of object detector is a multi-scale object detector with bounding box
+        // regression for anchor boxes.  The anchors field determines which anchors will be
+        // used at the output pointed by the tag layer whose id is the key of the map.
+        std::unordered_map<int, std::vector<anchor_box_details>> anchors;
+        template <template <typename> class TAG_TYPE>
+        void add_anchors(
+            const std::vector<anchor_box_details>& boxes
+        );
+        /*!
+            ensures
+                - anchors.at(tag_id<TAG_TYPE>::id) == boxes
+        !*/
+        // This field contains the labels of all the possible objects this detector can find.
+        std::vector<std::string> labels;
+        // When computing the objectness loss, any detection that has an IoU above
+        // iou_ignore_threshold with a ground truth box will not incur any loss.
+        double iou_ignore_threshold = 0.7;
+        // When computing the YOLO loss (objectness + bounding box regression + classification),
+        // the best match between a truth and an anchor is always used, regardless of the IoU.
+        // However, if other anchors have an IoU with a truth box above iou_anchor_threshold, they
+        // will also experience loss against that truth box as well.  Setting iou_anchor_threshold to 1 will
+        // make the model use only the best anchor for each ground truth, so other anchors can be
+        // used for other ground truth boxes in the same cell (useful for detecting objects in crowds).
+        // This setting is meant to be used with "high capacity" models, not small ones.
+        double iou_anchor_threshold = 1.0;
+        // When doing non-max suppression, we use overlaps_nms to decide if a box overlaps
+        // an already output detection and should therefore be thrown out.
+        test_box_overlap overlaps_nms = test_box_overlap(0.45, 1.0);
+        // When set to true, NMS will only be applied between objects with the same class label.
+        bool classwise_nms = true;
+        // These parameters control how we penalize different kinds of mistakes: notably the objectness loss,
+        // the box (bounding box regression) loss, and the classification loss.
+        double lambda_obj = 1.0;
+        double lambda_box = 1.0;
+        double lambda_cls = 1.0;
+    };
+    void serialize(const yolo_options& item, std::ostream& out);
+    void deserialize(yolo_options& item, std::istream& in);
+    /*!
+        provides serialization support
+    !*/
+// ----------------------------------------------------------------------------------------
+    template <template <typename> class... TAG_TYPES>
+    class loss_yolo_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, it implements the YOLO detection
+                loss defined in the paper:
+                    YOLOv3: An Incremental Improvement by Joseph Redmon and Ali Farhadi.
+                This means you use this loss if you want to detect the locations of objects
+                in images.
+                It should also be noted that this loss layer requires tag layers as template
+                parameters, which in turn require a subnetwork to be of type:
+                layer<TAG_TYPE>(net).subnet(): sig<con<(num_classes + 5) * num_anchors, SUBNET>>
+                Where num_classes is the number of categories that the detector is trained on,
+                and num_anchors is the number of priors or anchor boxes at the output pointed
+                by the tag layer. The number 5 corresponds to the objectness plus the 4 coordinates
+                for performing bounding box regression.
+        !*/
+    public:
+        // Both the training labels and the detector outputs are vectors of yolo_rect.
+        typedef std::vector<yolo_rect> training_label_type;
+        typedef std::vector<yolo_rect> output_label_type;
+        loss_yolo_(
+        );
+        /*!
+            ensures
+                - #get_options() == yolo_options()
+        !*/
+        loss_yolo_(
+            yolo_options options_
+        );
+        /*!
+            ensures
+                - #get_options() == options_
+        !*/
+        const yolo_options& get_options (
+        ) const;
+        /*!
+            ensures
+                - returns the options object that defines the general behavior of this loss layer.
+        !*/
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter,
+            double adjust_threshold = 0.25
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - layer<TAG_TYPE>(sub).get_output().k() == options.anchors.at(tag_id<TAG_TYPE>::id).size() * (5 + options.labels.size());
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            Also, the output labels are std::vectors of yolo_rects where, for each yolo_rect R,
+            we have the following interpretations:
+                - R.rect == the location of an object in the image.
+                - R.detection_confidence == the score for the object, between 0 and 1.  Only
+                  objects with a detection_confidence > adjust_threshold are output.  So if
+                  you want to output more objects (that are also of less confidence) you
+                  can call to_label() with a smaller value of adjust_threshold.
+                - R.label == the label of the detected object.
+                - R.labels == a std::vector<std::pair<double, std::string>> containing all the confidence values
+                  and labels that have a detection score > adjust_threshold, since this loss allows
+                  for multi-label outputs.  Note that the following is true:
+                      - R.labels[0].first == R.detection_confidence
+                      - R.labels[0].second == R.label
+                - R.ignore == false (this value is unused by to_label()).
+        !*/
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth, 
+            SUBNET& sub
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() 
+            except it has the additional calling requirements that: 
+                - layer<TAG_TYPE>(sub).get_output().k() == options.anchors.at(tag_id<TAG_TYPE>::id).size() * (5 + options.labels.size());
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            Also, the loss value returned corresponds to the squared norm of the error gradient.
+        !*/
+        void adjust_nms (
+            double iou_thresh,
+            double percent_covered_thresh = 1,
+            bool classwise = true
+        );
+        /*!
+            ensures
+                - #get_options().overlaps_nms == test_box_overlap(iou_thresh, percent_covered_thresh)
+                - #get_options().classwise_nms == classwise
+        !*/
+    };
+    // NOTE(review): loss_yolo_ is declared in this file as a variadic template over
+    // TAG_TYPES, yet it is named here without template arguments -- confirm this alias
+    // against the real definition in loss.h.
+    template <typename SUBNET>
+    using loss_yolo = add_loss_layer<loss_yolo_, SUBNET>;
 // ----------------------------------------------------------------------------------------
 }
 #endif // DLIB_DNn_LOSS_ABSTRACT_H_
--- a/dlib/image_processing/full_object_detection.h
+++ b/dlib/image_processing/full_object_detection.h
@@ -183,6 +183,60 @@ namespace dlib
            item.label = "";
    }
+// ----------------------------------------------------------------------------------------
+    struct yolo_rect
+    {
+        // Constructors: empty, rectangle only, rectangle + score, rectangle + score +
+        // label, and a conversion that copies rect, detection_confidence, ignore and
+        // label from an mmod_rect.
+        yolo_rect() = default;
+        yolo_rect(const drectangle& r)
+            : rect(r)
+        {}
+        yolo_rect(const drectangle& r, double score)
+            : rect(r),
+              detection_confidence(score)
+        {}
+        yolo_rect(const drectangle& r, double score, const std::string& label)
+            : rect(r),
+              detection_confidence(score),
+              label(label)
+        {}
+        yolo_rect(const mmod_rect& r)
+            : rect(r.rect),
+              detection_confidence(r.detection_confidence),
+              ignore(r.ignore),
+              label(r.label)
+        {}
+        // Data members.  labels is not filled in by any of the constructors above.
+        drectangle rect;
+        double detection_confidence = 0;
+        bool ignore = false;
+        std::string label;
+        std::vector<std::pair<double, std::string>> labels;
+        // Implicit conversion to an integer rectangle built from rect.
+        operator rectangle() const { return rect; }
+        // Equality looks at rect, detection_confidence, ignore and label; the labels
+        // vector does not take part in the comparison.
+        bool operator == (const yolo_rect& rhs) const
+        {
+            if (!(rect == rhs.rect))
+                return false;
+            if (!(detection_confidence == rhs.detection_confidence))
+                return false;
+            if (ignore != rhs.ignore)
+                return false;
+            return label == rhs.label;
+        }
+        // Orders yolo_rects by detection_confidence only.
+        bool operator<(const yolo_rect& rhs) const
+        {
+            return rhs.detection_confidence > detection_confidence;
+        }
+    };
+    // Writes item to out as a format version number (currently 1) followed by
+    // rect, detection_confidence, ignore, label and labels, in that order.
+    inline void serialize(const yolo_rect& item, std::ostream& out)
+    {
+        int version = 1;
+        serialize(version, out);
+        serialize(item.rect, out);
+        serialize(item.detection_confidence, out);
+        serialize(item.ignore, out);
+        serialize(item.label, out);
+        serialize(item.labels, out);
+    }
+    // Reads a yolo_rect from in: first the format version number, then rect,
+    // detection_confidence, ignore, label and labels, in that order.
+    // Throws serialization_error if the stored version number is not 1; in that
+    // case none of item's fields have been touched.
+    inline void deserialize(yolo_rect& item, std::istream& in)
+    {
+        int version = 0;
+        deserialize(version, in);
+        if (version != 1)
+            throw serialization_error("Unexpected version found while deserializing dlib::yolo_rect");
+        deserialize(item.rect, in);
+        deserialize(item.detection_confidence, in);
+        deserialize(item.ignore, in);
+        deserialize(item.label, in);
+        deserialize(item.labels, in);
+    }
 // ----------------------------------------------------------------------------------------
 }

--- a/dlib/image_processing/full_object_detection_abstract.h
+++ b/dlib/image_processing/full_object_detection_abstract.h
@@ -194,10 +194,53 @@ namespace dlib
        provides serialization support
    !*/
+// ----------------------------------------------------------------------------------------
+    struct yolo_rect
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is a simple struct that is used to give training data and receive detections
+                from the YOLO Detection loss layer loss_yolo_ object.
+        !*/
+        yolo_rect() = default;
+        yolo_rect(const drectangle& r) : rect(r) {}
+        yolo_rect(const drectangle& r, double score) : rect(r),detection_confidence(score) {}
+        yolo_rect(const drectangle& r, double score, const std::string& label) : rect(r),detection_confidence(score), label(label) {}
+        yolo_rect(const mmod_rect& r) : rect(r.rect), detection_confidence(r.detection_confidence), ignore(r.ignore), label(r.label) {}
+        drectangle rect;
+        double detection_confidence = 0;
+        bool ignore = false;
+        std::string label;
+        // YOLO detectors are multi label detectors: this field will contain all confidences and labels for a particular detection
+        std::vector<std::pair<double, std::string>> labels;
+        operator rectangle() const { return rect; }
+        bool operator== (const yolo_rect& rhs) const;
+        /*!
+            ensures
+                - returns true if and only if rect == rhs.rect && detection_confidence == rhs.detection_confidence
+                  && ignore == rhs.ignore && label == rhs.label.  The labels field is not compared.
+        !*/
+        bool operator<(const yolo_rect& rhs) const;
+        /*!
+            ensures
+                - returns true if and only if detection_confidence < rhs.detection_confidence.
+        !*/
+    };
+    inline void serialize(const yolo_rect& item, std::ostream& out);
+    inline void deserialize(yolo_rect& item, std::istream& in);
+    /*!
+        provides serialization support
+    !*/
 // ----------------------------------------------------------------------------------------
 }
 #endif // DLIB_FULL_OBJECT_DeTECTION_ABSTRACT_Hh_
--- a/dlib/image_transforms/random_cropper.h
+++ b/dlib/image_transforms/random_cropper.h
@@ -23,6 +23,7 @@ namespace dlib
        double max_object_size = 0.7; // cropped object will be at most this fraction of the size of the image.
        double background_crops_fraction = 0.5;
        double translate_amount = 0.10;
+        double min_object_coverage = 1.0;
        std::mutex rnd_mutex;
        dlib::rand rnd;
@@ -104,15 +105,26 @@ namespace dlib
            max_object_size = value; 
        }
+        // Returns the coverage fraction currently stored in min_object_coverage.
+        double get_min_object_coverage (
+        ) const
+        {
+            return min_object_coverage;
+        }
+        // Stores value as the new min_object_coverage.  The argument must satisfy
+        // 0 < value <= 1, which is checked with DLIB_CASSERT before the assignment.
+        void set_min_object_coverage (
+            double value
+        )
+        {
+            DLIB_CASSERT(0 < value && value <= 1);
+            min_object_coverage = value;
+        }
        template <
-            typename array_type
+            typename array_type,
+            typename rectangle_type
            >
        void operator() (
            size_t num_crops,
            const array_type& images,
-            const std::vector<std::vector<mmod_rect>>& rects,
+            const std::vector<std::vector<rectangle_type>>& rects,
            array_type& crops,
-            std::vector<std::vector<mmod_rect>>& crop_rects
+            std::vector<std::vector<rectangle_type>>& crop_rects
        )
        {
            DLIB_CASSERT(images.size() == rects.size());
@@ -122,14 +134,15 @@ namespace dlib
        }
        template <
-            typename array_type
+            typename array_type,
+            typename rectangle_type
            >
        void append (
            size_t num_crops,
            const array_type& images,
-            const std::vector<std::vector<mmod_rect>>& rects,
+            const std::vector<std::vector<rectangle_type>>& rects,
            array_type& crops,
-            std::vector<std::vector<mmod_rect>>& crop_rects
+            std::vector<std::vector<rectangle_type>>& crop_rects
        )
        {
            DLIB_CASSERT(images.size() == rects.size());
@@ -145,13 +158,14 @@ namespace dlib
        template <
            typename array_type,
-            typename image_type
+            typename image_type,
+            typename rectangle_type
            >
        void operator() (
            const array_type& images,
-            const std::vector<std::vector<mmod_rect>>& rects,
+            const std::vector<std::vector<rectangle_type>>& rects,
            image_type& crop,
-            std::vector<mmod_rect>& crop_rects
+            std::vector<rectangle_type>& crop_rects
        )
        {
            DLIB_CASSERT(images.size() == rects.size());
@@ -163,27 +177,29 @@ namespace dlib
        }
        template <
-            typename image_type1
+            typename image_type1,
+            // rectangle_type does not appear in the function parameters, so it can
+            // never be deduced; without a default, cropper(img) would not compile.
+            // mmod_rect is the type this overload used before it was templated.
+            typename rectangle_type = mmod_rect
            >
        image_type1 operator() (
            const image_type1& img
        )
        {
+            // Convenience overload for images without annotations: forwards to the
+            // (img, rects, crop, crop_rects) overload with empty rectangle lists
+            // and returns the resulting crop.
            image_type1 crop;
-            std::vector<mmod_rect> junk1, junk2;
+            std::vector<rectangle_type> junk1, junk2;
            (*this)(img, junk1, crop, junk2);
            return crop;
        }
        template <
            typename image_type1,
-            typename image_type2
+            typename image_type2,
+            typename rectangle_type
            >
        void operator() (
            const image_type1& img,
-            const std::vector<mmod_rect>& rects,
+            const std::vector<rectangle_type>& rects,
            image_type2& crop,
-            std::vector<mmod_rect>& crop_rects
+            std::vector<rectangle_type>& crop_rects
        )
        {
            DLIB_CASSERT(num_rows(img)*num_columns(img) != 0);
@@ -202,12 +218,14 @@ namespace dlib
                // map to crop
                rect.rect = tform(rect.rect);
+                const double intersection = get_rect(crop).intersect(rect.rect).area();
                // if the rect is at least partly in the crop
-                if (get_rect(crop).intersect(rect.rect).area() != 0)
+                if (intersection != 0)
                {
                    // set to ignore if not totally in the crop or if too small.
-                    if (!get_rect(crop).contains(rect.rect) || 
+                    if (intersection / rect.rect.area() < min_object_coverage ||
-                        ((long)rect.rect.height() < min_object_length_long_dim  && (long)rect.rect.width() < min_object_length_long_dim) || 
+                        ((long)rect.rect.height() < min_object_length_long_dim  && (long)rect.rect.width() < min_object_length_long_dim) ||
                        ((long)rect.rect.height() < min_object_length_short_dim || (long)rect.rect.width() < min_object_length_short_dim))
                    {
                        rect.ignore = true;
@@ -230,10 +248,13 @@ namespace dlib
    private:
-        template <typename image_type1>
+        template <
+            typename image_type1,
+            typename rectangle_type
+            >
        void make_crop_plan (
            const image_type1& img,
-            const std::vector<mmod_rect>& rects,
+            const std::vector<rectangle_type>& rects,
            chip_details& crop_plan,
            bool& should_flip_crop
        )
@@ -285,8 +306,9 @@ namespace dlib
            crop_plan = chip_details(crop_rect, dims, angle);
        }
+        template <typename rectangle_type>
        bool has_non_ignored_box (
-            const std::vector<mmod_rect>& rects
+            const std::vector<rectangle_type>& rects
        ) const
        {
            for (auto&& b : rects)
@@ -297,8 +319,9 @@ namespace dlib
            return false;
        }
+        template <typename rectangle_type>
        size_t randomly_pick_rect (
-            const std::vector<mmod_rect>& rects
+            const std::vector<rectangle_type>& rects
        ) 
        {
            DLIB_CASSERT(has_non_ignored_box(rects));

--- a/dlib/image_transforms/random_cropper_abstract.h
+++ b/dlib/image_transforms/random_cropper_abstract.h
@@ -19,8 +19,8 @@ namespace dlib
                This object is a tool for extracting random crops of objects from a set of
                images.  The crops are randomly jittered in scale, translation, and
                rotation but more or less centered on objects specified by mmod_rect
-                objects.
+                objects (or other rectangle types with a compatible interface).
            THREAD SAFETY
                It is safe for multiple threads to make concurrent calls to this object's
                operator() methods.
@@ -40,6 +40,7 @@ namespace dlib
                - #get_max_object_size() == 0.7
                - #get_background_crops_fraction() == 0.5
                - #get_translate_amount() == 0.1
+                - #get_min_object_coverage() == 1.0
        !*/
        void set_seed (
@@ -152,7 +153,7 @@ namespace dlib
                  the longest edge of the object (i.e. either its height or width,
                  whichever is longer) is at least #get_min_object_length_long_dim() pixels
                  in length.  When we say "object" here we are referring specifically to
-                  the rectangle in the mmod_rect output by the cropper.
+                  the rectangle in the rectangle_type output by the cropper.
        !*/
        long get_min_object_length_short_dim (
@@ -163,7 +164,7 @@ namespace dlib
                  the shortest edge of the object (i.e. either its height or width,
                  whichever is shorter) is at least #get_min_object_length_short_dim()
                  pixels in length.  When we say "object" here we are referring
-                  specifically to the rectangle in the mmod_rect output by the cropper.
+                  specifically to the rectangle in the rectangle_type output by the cropper.
        !*/
        void set_min_object_size (
@@ -199,15 +200,34 @@ namespace dlib
                - #get_max_object_size() == value
        !*/
+        double get_min_object_coverage (
+        ) const;
+        /*!
+            ensures
+                - When a chip is extracted, any object that has less than get_min_object_coverage() fraction of its 
+                   total area contained within the crop will have its ignore field set to true.
+        !*/
+        void set_min_object_coverage (
+            double value
+        );
+        /*!
+            requires
+                - 0 < value <= 1
+            ensures
+                - #get_min_object_coverage() == value
+        !*/
        template <
-            typename array_type
+            typename array_type,
+            typename rectangle_type
            >
        void append (
            size_t num_crops,
            const array_type& images,
-            const std::vector<std::vector<mmod_rect>>& rects,
+            const std::vector<std::vector<rectangle_type>>& rects,
            array_type& crops,
-            std::vector<std::vector<mmod_rect>>& crop_rects
+            std::vector<std::vector<rectangle_type>>& crop_rects
        );
        /*!
            requires
@@ -218,6 +238,8 @@ namespace dlib
                - array_type is a type with an interface compatible with dlib::array or
                  std::vector and it must in turn contain image objects that implement the
                  interface defined in dlib/image_processing/generic_image.h 
+                - rectangle_type is a type with an interface compatible with mmod_rect, such
+                  as yolo_rect.
            ensures
                - Randomly extracts num_crops chips from images and appends them to the end
                  of crops.  We also copy the object metadata for each extracted crop and
@@ -230,14 +252,15 @@ namespace dlib
        !*/
        template <
-            typename array_type
+            typename array_type,
+            typename rectangle_type
            >
        void operator() (
            size_t num_crops,
            const array_type& images,
-            const std::vector<std::vector<mmod_rect>>& rects,
+            const std::vector<std::vector<rectangle_type>>& rects,
            array_type& crops,
-            std::vector<std::vector<mmod_rect>>& crop_rects
+            std::vector<std::vector<rectangle_type>>& crop_rects
        );
        /*!
            requires
@@ -247,6 +270,8 @@ namespace dlib
                - array_type is a type with an interface compatible with dlib::array or
                  std::vector and it must in turn contain image objects that implement the
                  interface defined in dlib/image_processing/generic_image.h 
+                - rectangle_type is a type with an interface compatible with mmod_rect, such
+                  as yolo_rect.
            ensures
                - Randomly extracts num_crops chips from images.  We also copy the object
                  metadata for each extracted crop and store it into #crop_rects.  In
@@ -259,13 +284,14 @@ namespace dlib
        template <
            typename array_type,
-            typename image_type
+            typename image_type,
+            typename rectangle_type
            >
        void operator() (
            const array_type& images,
-            const std::vector<std::vector<mmod_rect>>& rects,
+            const std::vector<std::vector<rectangle_type>>& rects,
            image_type& crop,
-            std::vector<mmod_rect>& crop_rects
+            std::vector<rectangle_type>& crop_rects
        );
        /*!
            requires
@@ -277,6 +303,8 @@ namespace dlib
                - array_type is a type with an interface compatible with dlib::array or
                  std::vector and it must in turn contain image objects that implement the
                  interface defined in dlib/image_processing/generic_image.h 
+                - rectangle_type is a type with an interface compatible with mmod_rect, such
+                  as yolo_rect.
            ensures
                - Selects a random image and creates a random crop from it.  Specifically,
                  we pick a random index IDX < images.size() and then execute 
@@ -285,13 +313,14 @@ namespace dlib
        template <
            typename image_type1,
-            typename image_type2
+            typename image_type2,
+            typename rectangle_type
            >
        void operator() (
            const image_type1& img,
-            const std::vector<mmod_rect>& rects,
+            const std::vector<rectangle_type>& rects,
            image_type2& crop,
-            std::vector<mmod_rect>& crop_rects
+            std::vector<rectangle_type>& crop_rects
        );
        /*!
            requires
@@ -300,9 +329,11 @@ namespace dlib
                  dlib/image_processing/generic_image.h 
                - image_type2 == an image object that implements the interface defined in
                  dlib/image_processing/generic_image.h 
+                - rectangle_type is a type with an interface compatible with mmod_rect, such
+                  as yolo_rect.
            ensures
-                - Extracts a random crop from img and copies over the mmod_rect objects in
+                - Extracts a random crop from img and copies over the rectangle_type objects
-                  rects to #crop_rects if they are contained inside the crop.  Moreover,
+                  in rects to #crop_rects if they are contained inside the crop.  Moreover,
                  rectangles are marked as ignore if they aren't completely contained
                  inside the crop.
                - #crop_rects.size() <= rects.size()
@@ -343,4 +374,3 @@ namespace dlib
 #endif // DLIB_RaNDOM_CROPPER_ABSTRACT_H_
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -155,6 +155,7 @@ if (NOT USING_OLD_VISUAL_STUDIO_COMPILER)
   add_example(dnn_instance_segmentation_train_ex)
   add_example(dnn_metric_learning_on_images_ex)
   add_gui_example(dnn_dcgan_train_ex)
+   add_gui_example(dnn_yolo_train_ex)
 endif()

--- a/examples/dnn_yolo_train_ex.cpp
+++ b/examples/dnn_yolo_train_ex.cpp
--- a/examples/faces/testing.xml
+++ b/examples/faces/testing.xml
@@ -4,7 +4,7 @@
 <name>Testing faces</name>
 <comment>These are images from the PASCAL VOC 2011 dataset.</comment>
 <images>
-  <image file='2008_002470.jpg'>
+  <image file='2008_002470.jpg' width='500' height='332'>
    <box top='181' left='274' width='52' height='53'/>
    <box top='156' left='55' width='44' height='44'/>
    <box top='166' left='146' width='37' height='37'/>
@@ -12,12 +12,12 @@
    <box top='74' left='233' width='44' height='44'/>
    <box top='86' left='178' width='37' height='37'/>
  </image>
-  <image file='2008_002506.jpg'>
+  <image file='2008_002506.jpg' width='500' height='375'>
    <box top='78' left='329' width='109' height='109'/>
    <box top='95' left='224' width='91' height='91'/>
    <box top='65' left='125' width='90' height='91'/>
  </image>
-  <image file='2008_004176.jpg'>
+  <image file='2008_004176.jpg' width='480' height='438'>
    <box top='230' left='206' width='37' height='37'/>
    <box top='118' left='162' width='37' height='37'/>
    <box top='82' left='190' width='37' height='37'/>
@@ -26,7 +26,7 @@
    <box top='86' left='110' width='37' height='37'/>
    <box top='102' left='282' width='37' height='37'/>
  </image>
-  <image file='2008_007676.jpg'>
+  <image file='2008_007676.jpg' width='500' height='334'>
    <box top='62' left='226' width='37' height='37'/>
    <box top='113' left='194' width='44' height='44'/>
    <box top='130' left='262' width='37' height='37'/>
@@ -35,9 +35,9 @@
    <box top='141' left='107' width='52' height='53'/>
    <box top='84' left='137' width='44' height='44'/>
  </image>
-  <image file='2009_004587.jpg'>
+  <image file='2009_004587.jpg' width='400' height='500'>
    <box top='46' left='154' width='75' height='76'/>
    <box top='280' left='266' width='63' height='63'/>
  </image>
 </images>
 </dataset>
\ No newline at end of file
--- a/examples/faces/testing_with_face_landmarks.xml
+++ b/examples/faces/testing_with_face_landmarks.xml
@@ -5,10 +5,9 @@
 <comment>These are images from the PASCAL VOC 2011 dataset.
   The face landmarks are from dlib's shape_predictor_68_face_landmarks.dat
   landmarking model.  The model uses the 68 landmark scheme used by the iBUG
-   300-W dataset.
+   300-W dataset.</comment>
-</comment>
 <images>
-  <image file='2008_002470.jpg'>
+  <image file='2008_002470.jpg' width='500' height='332'>
    <box top='181' left='274' width='52' height='53'>
      <part name='00' x='277' y='194'/>
      <part name='01' x='278' y='200'/>
@@ -430,7 +429,7 @@
      <part name='67' x='196' y='112'/>
    </box>
  </image>
-  <image file='2008_002506.jpg'>
+  <image file='2008_002506.jpg' width='500' height='375'>
    <box top='78' left='329' width='109' height='109'>
      <part name='00' x='342' y='134'/>
      <part name='01' x='345' y='145'/>
@@ -642,7 +641,7 @@
      <part name='67' x='163' y='133'/>
    </box>
  </image>
-  <image file='2008_004176.jpg'>
+  <image file='2008_004176.jpg' width='480' height='438'>
    <box top='230' left='206' width='37' height='37'>
      <part name='00' x='206' y='241'/>
      <part name='01' x='206' y='245'/>
@@ -1134,7 +1133,7 @@
      <part name='67' x='294' y='126'/>
    </box>
  </image>
-  <image file='2008_007676.jpg'>
+  <image file='2008_007676.jpg' width='500' height='334'>
    <box top='62' left='226' width='37' height='37'>
      <part name='00' x='223' y='72'/>
      <part name='01' x='224' y='77'/>
@@ -1626,7 +1625,7 @@
      <part name='67' x='160' y='115'/>
    </box>
  </image>
-  <image file='2009_004587.jpg'>
+  <image file='2009_004587.jpg' width='400' height='500'>
    <box top='46' left='154' width='75' height='76'>
      <part name='00' x='147' y='74'/>
      <part name='01' x='147' y='84'/>
@@ -1769,4 +1768,4 @@
    </box>
  </image>
 </images>
 </dataset>
\ No newline at end of file
--- a/examples/faces/training.xml
+++ b/examples/faces/training.xml
@@ -4,7 +4,7 @@
 <name>Training faces</name>
 <comment>These are images from the PASCAL VOC 2011 dataset.</comment>
 <images>
-  <image file='2007_007763.jpg'>
+  <image file='2007_007763.jpg' width='500' height='375'>
    <box top='90' left='194' width='37' height='37'/>
    <box top='114' left='158' width='37' height='37'/>
    <box top='89' left='381' width='45' height='44'/>
@@ -13,7 +13,7 @@
    <box top='86' left='294' width='37' height='37'/>
    <box top='233' left='309' width='45' height='44'/>
  </image>
-  <image file='2008_002079.jpg'>
+  <image file='2008_002079.jpg' width='500' height='375'>
    <box top='166' left='407' width='37' height='37'/>
    <box top='134' left='122' width='37' height='37'/>
    <box top='138' left='346' width='37' height='37'/>
@@ -21,11 +21,11 @@
    <box top='134' left='62' width='37' height='37'/>
    <box top='194' left='41' width='44' height='44'/>
  </image>
-  <image file='2008_001009.jpg'>
+  <image file='2008_001009.jpg' width='360' height='480'>
    <box top='79' left='145' width='76' height='76'/>
    <box top='214' left='125' width='90' height='91'/>
  </image>
-  <image file='2008_001322.jpg'>
+  <image file='2008_001322.jpg' width='500' height='375'>
    <box top='162' left='104' width='76' height='76'/>
    <box top='218' left='232' width='63' height='63'/>
    <box top='155' left='344' width='90' height='90'/>

--- a/examples/faces/training_with_face_landmarks.xml
+++ b/examples/faces/training_with_face_landmarks.xml
@@ -5,10 +5,9 @@
 <comment>These are images from the PASCAL VOC 2011 dataset.
   The face landmarks are from dlib's shape_predictor_68_face_landmarks.dat
   landmarking model.  The model uses the 68 landmark scheme used by the iBUG
-   300-W dataset.
+   300-W dataset.</comment>
-</comment>
 <images>
-  <image file='2007_007763.jpg'>
+  <image file='2007_007763.jpg' width='500' height='375'>
    <box top='90' left='194' width='37' height='37'>
      <part name='00' x='201' y='107'/>
      <part name='01' x='201' y='110'/>
@@ -500,7 +499,7 @@
      <part name='67' x='323' y='267'/>
    </box>
  </image>
-  <image file='2008_002079.jpg'>
+  <image file='2008_002079.jpg' width='500' height='375'>
    <box top='166' left='406' width='37' height='37'>
      <part name='00' x='412' y='179'/>
      <part name='01' x='411' y='183'/>
@@ -922,7 +921,7 @@
      <part name='67' x='68' y='227'/>
    </box>
  </image>
-  <image file='2008_001009.jpg'>
+  <image file='2008_001009.jpg' width='360' height='480'>
    <box top='79' left='145' width='76' height='76'>
      <part name='00' x='145' y='115'/>
      <part name='01' x='148' y='124'/>
@@ -1064,7 +1063,7 @@
      <part name='67' x='168' y='280'/>
    </box>
  </image>
-  <image file='2008_001322.jpg'>
+  <image file='2008_001322.jpg' width='500' height='375'>
    <box top='162' left='104' width='76' height='76'>
      <part name='00' x='106' y='183'/>
      <part name='01' x='106' y='193'/>
@@ -1277,4 +1276,4 @@
    </box>
  </image>
 </images>
 </dataset>
\ No newline at end of file
--- a/tools/imglab/src/cluster.cpp
+++ b/tools/imglab/src/cluster.cpp
@@ -247,6 +247,8 @@ int cluster_dataset(
        {
            idata[i].first = std::numeric_limits<double>::infinity();
            idata[i].second.filename = data.images[i].filename;
+            idata[i].second.width = data.images[i].width;
+            idata[i].second.height = data.images[i].height;
            if (!has_non_ignored_boxes(data.images[i]))
                continue;

--- a/tools/imglab/src/main.cpp
+++ b/tools/imglab/src/main.cpp
@@ -21,7 +21,7 @@
 #include <dlib/dir_nav.h>
-const char* VERSION = "1.17";
+const char* VERSION = "1.18";
@@ -332,6 +332,8 @@ void rotate_dataset(const command_line_parser& parser)
        load_image(img, metadata.images[i].filename);
        const point_transform_affine tran = rotate_image(img, temp, angle*pi/180);
+        metadata.images[i].width = temp.nc();
+        metadata.images[i].height = temp.nr();
        if (parser.option("jpg"))
        {
            filename = to_jpg_name(filename);
@@ -359,6 +361,32 @@ void rotate_dataset(const command_line_parser& parser)
 // ----------------------------------------------------------------------------------------
+// Implements the --add-width-height-metadata option.  For every XML dataset
+// named on the command line: load it, load each image it lists, record that
+// image's number of columns/rows in its width/height metadata fields, and then
+// write the dataset back over the same XML file.
+//
+// Any exception raised while loading the metadata or an image propagates to
+// the caller; in that case the XML file being processed is not rewritten.
+void add_width_and_height_metadata(const command_line_parser& parser)
+{
+    for (unsigned long i = 0; i < parser.number_of_arguments(); ++i) {
+        image_dataset_metadata::dataset metadata;
+        const string datasource = parser[i];
+        load_image_dataset_metadata(metadata,datasource);
+        {
+            // Temporarily set the current directory to be the one that
+            // contains the metadata file. We do this because the file might
+            // contain file paths which are relative to this folder.  The
+            // previous directory is restored when chdir_guard goes out of
+            // scope, so a relative datasource path still names the same file
+            // when we save below and when the next argument is loaded.
+            locally_change_current_dir chdir_guard(get_parent_directory(file(datasource)));
+            // Each invocation only writes to its own metadata.images entry.
+            parallel_for(0, metadata.images.size(), [&](long img_idx)
+            {
+                array2d<rgb_pixel> img;
+                load_image(img, metadata.images[img_idx].filename);
+                metadata.images[img_idx].width = img.nc();
+                metadata.images[img_idx].height = img.nr();
+            });
+        }
+        save_image_dataset_metadata(metadata, datasource);
+    }
+}
+// ----------------------------------------------------------------------------------------
 int resample_dataset(const command_line_parser& parser)
 {
    if (parser.number_of_arguments() != 1)
@@ -447,6 +475,8 @@ int resample_dataset(const command_line_parser& parser)
            std::ostringstream sout;
            sout << hex << murmur_hash3_128bit(&chip[0][0], chip.size()*sizeof(chip[0][0])).second;
            dimg.filename = data.images[i].filename + "_RESAMPLED_"+sout.str()+".png";
+            dimg.width = chip.nc();
+            dimg.height = chip.nr();
            if (parser.option("jpg"))
            {
@@ -588,6 +618,8 @@ int main(int argc, char** argv)
                                        "The parts are instead simply mirrored to the flipped dataset.", 1);
        parser.add_option("rotate", "Read an XML image dataset and output a copy that is rotated counter clockwise by <arg> degrees. "
                                  "The output is saved to an XML file prefixed with rotated_<arg>.",1);
+        parser.add_option("add-width-height-metadata", "Open the given xml files and set the width and height image metadata fields "
+                            "for every image. This involves loading each image to find these values.");
        parser.add_option("cluster", "Cluster all the objects in an XML file into <arg> different clusters (pass 0 to find automatically) and save "
                                     "the results as cluster_###.xml and cluster_###.jpg files.",1);
        parser.add_option("ignore", "Mark boxes labeled as <arg> as ignored.  The resulting XML file is output as a separate file and the original is not modified.",1);
@@ -612,7 +644,7 @@ int main(int argc, char** argv)
        const char* singles[] = {"h","c","r","l","files","convert","parts","rmdiff", "rmtrunc", "rmdupes", "seed", "shuffle", "split", "add", 
                                 "flip-basic", "flip", "rotate", "tile", "size", "cluster", "resample", "min-object-size", "rmempty",
                                 "crop-size", "cropped-object-size", "rmlabel", "rm-other-labels", "rm-if-overlaps", "sort-num-objects", 
-                                 "one-object-per-image", "jpg", "rmignore", "sort", "split-train-test", "box-images"};
+                                 "one-object-per-image", "jpg", "rmignore", "sort", "split-train-test", "box-images", "add-width-height-metadata"};
        parser.check_one_time_options(singles);
        const char* c_sub_ops[] = {"r", "convert"};
        parser.check_sub_options("c", c_sub_ops);
@@ -637,6 +669,7 @@ int main(int argc, char** argv)
        parser.check_incompatible_options("c", "flip-basic");
        parser.check_incompatible_options("flip", "flip-basic");
        parser.check_incompatible_options("c", "rotate");
+        parser.check_incompatible_options("c", "add-width-height-metadata");
        parser.check_incompatible_options("c", "rename");
        parser.check_incompatible_options("c", "ignore");
        parser.check_incompatible_options("c", "parts");
@@ -650,6 +683,7 @@ int main(int argc, char** argv)
        parser.check_incompatible_options("l", "flip");
        parser.check_incompatible_options("l", "flip-basic");
        parser.check_incompatible_options("l", "rotate");
+        parser.check_incompatible_options("l", "add-width-height-metadata");
        parser.check_incompatible_options("files", "rename");
        parser.check_incompatible_options("files", "ignore");
        parser.check_incompatible_options("files", "add");
@@ -657,22 +691,27 @@ int main(int argc, char** argv)
        parser.check_incompatible_options("files", "flip");
        parser.check_incompatible_options("files", "flip-basic");
        parser.check_incompatible_options("files", "rotate");
+        parser.check_incompatible_options("files", "add-width-height-metadata");
        parser.check_incompatible_options("add", "flip");
        parser.check_incompatible_options("add", "flip-basic");
        parser.check_incompatible_options("add", "rotate");
+        parser.check_incompatible_options("add", "add-width-height-metadata");
        parser.check_incompatible_options("add", "tile");
        parser.check_incompatible_options("flip", "tile");
        parser.check_incompatible_options("flip-basic", "tile");
        parser.check_incompatible_options("rotate", "tile");
+        parser.check_incompatible_options("add-width-height-metadata", "tile");
        parser.check_incompatible_options("cluster", "tile");
        parser.check_incompatible_options("resample", "tile");
        parser.check_incompatible_options("flip", "cluster");
        parser.check_incompatible_options("flip-basic", "cluster");
        parser.check_incompatible_options("rotate", "cluster");
+        parser.check_incompatible_options("add-width-height-metadata", "cluster");
        parser.check_incompatible_options("add", "cluster");
        parser.check_incompatible_options("flip", "resample");
        parser.check_incompatible_options("flip-basic", "resample");
        parser.check_incompatible_options("rotate", "resample");
+        parser.check_incompatible_options("add-width-height-metadata", "resample");
        parser.check_incompatible_options("add", "resample");
        parser.check_incompatible_options("shuffle", "tile");
        parser.check_incompatible_options("sort-num-objects", "tile");
@@ -738,6 +777,12 @@ int main(int argc, char** argv)
            return EXIT_SUCCESS;
        }
+        if (parser.option("add-width-height-metadata"))
+        {
+            add_width_and_height_metadata(parser);
+            return EXIT_SUCCESS;
+        }
        if (parser.option("v"))
        {
            cout << "imglab v" << VERSION 

--- a/tools/imglab/src/metadata_editor.cpp
+++ b/tools/imglab/src/metadata_editor.cpp
@@ -224,7 +224,12 @@ void propagate_boxes(
    array2d<rgb_pixel> img1, img2;
    dlib::load_image(img1, data.images[prev].filename);
+    data.images[prev].width = img1.nc();
+    data.images[prev].height = img1.nr();
    dlib::load_image(img2, data.images[next].filename);
+    data.images[next].width = img2.nc();
+    data.images[next].height = img2.nr();
    for (unsigned long i = 0; i < data.images[prev].boxes.size(); ++i)
    {
        correlation_tracker tracker;
@@ -513,6 +518,8 @@ load_image(
    try
    {
        dlib::load_image(img, metadata.images[idx].filename);
+        metadata.images[idx].width = img.nc();
+        metadata.images[idx].height = img.nr();
        set_title(metadata.name + " #"+cast_to_string(idx)+": " +metadata.images[idx].filename);
    }
    catch (exception& e)
@@ -543,6 +550,8 @@ load_image_and_set_size(
    try
    {
        dlib::load_image(img, metadata.images[idx].filename);
+        metadata.images[idx].width = img.nc();
+        metadata.images[idx].height = img.nr();
        set_title(metadata.name + " #"+cast_to_string(idx)+": " +metadata.images[idx].filename);
    }
    catch (exception& e)