Commit 51c0c148 authored by Davis King

Added the scan_image_pyramid object.

parent 0aac2844
dlib/image_processing.h
@@ -4,6 +4,7 @@
 #define DLIB_IMAGE_PROCESSInG_H___
 #include "image_processing/scan_image.h"
+#include "image_processing/scan_image_pyramid.h"
 #endif // DLIB_IMAGE_PROCESSInG_H___
dlib/image_processing/scan_image_pyramid.h (this diff is collapsed)

dlib/image_processing/scan_image_pyramid_abstract.h:
// Copyright (C) 2011 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_SCAN_IMaGE_PYRAMID_ABSTRACT_H__
#ifdef DLIB_SCAN_IMaGE_PYRAMID_ABSTRACT_H__
#include "../matrix.h"
#include "../geometry.h"
#include "../image_processing.h"
#include "../array2d.h"
#include <vector>
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename Feature_extractor_type
>
class scan_image_pyramid : noncopyable
{
/*!
REQUIREMENTS ON Pyramid_type
- must be one of the pyramid_down objects defined in
dlib/image_transforms/image_pyramid_abstract.h or an object with
a compatible interface
REQUIREMENTS ON Feature_extractor_type
- must be an object with an interface compatible with the hashed_feature_image
object defined in dlib/image_keypoint/hashed_feature_image_abstract.h.
INITIAL VALUE
- get_num_detection_templates() == 0
- is_loaded_with_image() == false
- get_max_detections_per_template() == 2000
- get_max_pyramid_levels() == 1000
WHAT THIS OBJECT REPRESENTS
This object is a tool for running a sliding window classifier over
an image pyramid. This object can also be understood as a general
tool for implementing the spatial pyramid models described in the paper:
Beyond Bags of Features: Spatial Pyramid Matching for Recognizing
Natural Scene Categories by Svetlana Lazebnik, Cordelia Schmid,
and Jean Ponce
The sliding window classifiers used by this object have three parts:
1. The underlying feature extraction provided by Feature_extractor_type
objects, which associate a vector with each location in an image.
2. A detection template. This is a rectangle which defines the shape of a
sliding window (the object_box), as well as a set of rectangles which
envelop it. This set of enveloping rectangles defines the spatial
structure of the overall feature extraction within a sliding window.
In particular, each location of a sliding window has a feature vector
associated with it. This feature vector is defined as follows:
- Let N denote the number of enveloping rectangles.
- Let M denote the dimensionality of the vectors output by Feature_extractor_type
objects.
- Let F(i) == the M dimensional vector which is the sum of all vectors
given by our Feature_extractor_type object inside the ith enveloping
rectangle.
- Then the feature vector for a sliding window is an M*N dimensional vector
[F(1) F(2) F(3) ... F(N)] (i.e. it is a concatenation of the N vectors).
This feature vector can be thought of as a collection of N "bags of features",
each bag coming from a spatial location determined by one of the enveloping
rectangles (a concrete sketch of this construction follows this comment block).
3. A weight vector and a threshold value. The dot product between the weight
vector and the feature vector for a sliding window location gives the score
of the window. If this score is greater than the threshold value then the
window location is output as a detection.
Finally, the sliding window classifiers described above are applied to every level
of an image pyramid.
!*/
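/*
    Illustrative sketch (not part of the interface): the sliding window
    feature vector described above is conceptually assembled as follows,
    assuming a hypothetical helper sum_of_feature_vectors_in(rect) which
    returns the M dimensional sum of all Feature_extractor_type outputs
    falling inside a rectangle, and a std::vector<rectangle> rects holding
    the N enveloping rectangles translated to the current window position:

        matrix<double,0,1> window_feats(M*N);
        for (long i = 0; i < N; ++i)
            set_rowm(window_feats, range(i*M, (i+1)*M-1)) = sum_of_feature_vectors_in(rects[i]);

        // the window is reported as a detection if dot(w, window_feats) >= thresh
        // (see detect() below)
*/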
public:
typedef matrix<double,0,1> feature_vector_type;
typedef Pyramid_type pyramid_type;
typedef Feature_extractor_type feature_extractor_type;
scan_image_pyramid (
);
/*!
ensures
- this object is properly initialized
!*/
template <
typename image_type
>
void load (
const image_type& img
);
/*!
requires
- image_type must be a type with the following properties:
- image_type is default constructible.
- image_type is swappable by the global swap() function.
- image_type logically represents some kind of image and therefore
has .nr() and .nc() member functions. .nr() should return the
number of rows while .nc() returns the number of columns.
- image_type objects can be loaded into Feature_extractor_type
objects via Feature_extractor_type::load().
- image_type objects can be used with Pyramid_type. That is,
if pyr is an object of type Pyramid_type and img1 and img2
are objects of image_type, then pyr(img1,img2) should be
a valid expression which downsamples img1 into img2.
ensures
- #is_loaded_with_image() == true
- This object is ready to run sliding window classifiers over img. Call
detect() to do this.
!*/
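/*
    Illustrative sketch (the scanner typedef and the way the image is filled
    are assumptions, not part of this interface): a dlib array2d<unsigned char>
    satisfies the image_type requirements above, so a typical call looks like:

        // my_pyramid and my_feature_extractor stand for whatever Pyramid_type
        // and Feature_extractor_type have been chosen.
        scan_image_pyramid<my_pyramid, my_feature_extractor> scanner;

        array2d<unsigned char> img;
        // ... fill img with pixel data by whatever means ...

        scanner.load(img);
*/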
bool is_loaded_with_image (
) const;
/*!
ensures
- returns true if this object has been loaded with an image to process
and false otherwise.
!*/
void copy_configuration(
const feature_extractor_type& fe
);
/*!
ensures
- Let BASE_FE denote the feature_extractor_type object used
internally for local feature extraction. Then this function
performs BASE_FE.copy_configuration(fe)
(i.e. this function allows you to configure the parameters of the
underlying feature extractor used by a scan_image_pyramid object)
!*/
void copy_configuration (
const scan_image_pyramid& item
);
/*!
ensures
- copies all the state information of item into *this, except for state
information populated by load(). More precisely, given two scan_image_pyramid
objects S1 and S2, the following sequence of instructions should always
result in both of them having the exact same state.
S2.copy_configuration(S1);
S1.load(img);
S2.load(img);
!*/
void add_detection_template (
const rectangle& object_box,
const std::vector<rectangle>& feature_extraction_regions
);
/*!
requires
- center(object_box) == point(0,0)
- if (get_num_detection_templates() > 0) then
- get_num_components_per_detection_template() == feature_extraction_regions.size()
(i.e. if you already have detection templates in this object, then
any new detection template must declare a consistent number of
feature extraction regions)
ensures
- Adds another detection template to this object. In particular, object_box
defines the size and shape of a sliding window while feature_extraction_regions
defines the locations for feature extraction as discussed in the WHAT THIS
OBJECT REPRESENTS section above. Note also that the locations of the feature
extraction regions are relative to the object_box.
- #get_num_detection_templates() == get_num_detection_templates() + 1
- The order of rectangles in feature_extraction_regions matters. Recall that
each rectangle gets its own set of features. So given two different templates,
their ith rectangles will both share the same part of the weight vector (w)
supplied to detect(). So there should be some reasonable correspondence
between the rectangle ordering in different detection templates. For
example, different detection templates should place corresponding
feature extraction regions in roughly the same part of the object_box.
!*/
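/*
    Illustrative sketch (the sizes are arbitrary assumptions): define an 80x80
    sliding window centered at the origin and extract features from the whole
    window plus its four quadrants, giving 5 "bags of features" per window:

        const rectangle object_box = centered_rect(point(0,0), 80, 80);
        std::vector<rectangle> regions;
        regions.push_back(object_box);                              // whole window
        regions.push_back(centered_rect(point(-20,-20), 40, 40));   // top-left quadrant
        regions.push_back(centered_rect(point( 20,-20), 40, 40));   // top-right quadrant
        regions.push_back(centered_rect(point(-20, 20), 40, 40));   // bottom-left quadrant
        regions.push_back(centered_rect(point( 20, 20), 40, 40));   // bottom-right quadrant
        scanner.add_detection_template(object_box, regions);
*/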
unsigned long get_num_detection_templates (
) const;
/*!
ensures
- returns the number of detection templates in this object
!*/
unsigned long get_num_components_per_detection_template (
) const;
/*!
requires
- get_num_detection_templates() > 0
ensures
- A detection template is a rectangle which defines the shape of a
sliding window (the object_box), as well as a set of rectangles which
envelop it. This function returns the number of enveloping rectangles
in the detection templates used by this object.
!*/
long get_num_dimensions (
) const;
/*!
requires
- get_num_detection_templates() > 0
ensures
- returns the number of dimensions in the feature vector for a sliding window
location. This value is the dimensionality of the underlying feature vectors
produced by Feature_extractor_type times get_num_components_per_detection_template().
!*/
unsigned long get_max_pyramid_levels (
) const;
/*!
ensures
- returns the maximum number of image pyramid levels this object will use.
Note that #get_max_pyramid_levels() == 1 indicates that no image pyramid
will be used at all. That is, only the original image will be processed
and no lower scale versions will be created.
!*/
void set_max_pyramid_levels (
unsigned long max_levels
);
/*!
requires
- max_levels > 0
ensures
- #get_max_pyramid_levels() == max_levels
!*/
unsigned long get_max_detections_per_template (
) const;
/*!
ensures
- For each image pyramid layer and detection template, this object scans a sliding
window classifier over an image and produces a number of detections. This
function returns a number which defines a hard upper limit on the number of
detections allowed by a single scan. This means that the total number of
possible detections produced by detect() is get_max_detections_per_template()*
get_num_detection_templates()*(number of image pyramid layers).
!*/
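/*
    For example, with the default limit of 2000 detections per template, 3
    detection templates, and 10 pyramid layers (the last two numbers are
    purely illustrative), detect() can produce at most 2000*3*10 == 60000
    detections in a single call.
*/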
void set_max_detections_per_template (
unsigned long max_dets
);
/*!
requires
- max_dets > 0
ensures
- #get_max_detections_per_template() == max_dets
!*/
void detect (
const feature_vector_type& w,
std::vector<std::pair<double, rectangle> >& dets,
const double thresh
) const;
/*!
requires
- w.size() >= get_num_dimensions()
- is_loaded_with_image() == true
- get_num_detection_templates() > 0
ensures
- Scans all the detection templates over all pyramid layers as discussed in the
WHAT THIS OBJECT REPRESENTS section and stores all detections into #dets.
- for all valid i:
- #dets[i].second == The object box which produced this detection. This rectangle gives
the location of the detection. Note that the rectangle will have been converted back into
the original image input space. That is, if this detection was made at a low level in the
image pyramid then the object box will have been automatically mapped up the pyramid layers
to the original image space. Or in other words, if you plot #dets[i].second on top of the
image given to load() it will show up in the right place.
- #dets[i].first == The score for this detection. This value is equal to dot(w, feature vector
for this sliding window location).
- #dets[i].first >= thresh
- #dets will be sorted in descending order. (i.e. #dets[i].first >= #dets[j].first for all valid i and j such that i < j)
- Elements of w beyond index get_num_dimensions()-1 are ignored. I.e. only the first
get_num_dimensions() are used.
- Note that no form of non-max suppression is performed. If a window has a score >= thresh
then it is reported in #dets (assuming the limit imposed by get_max_detections_per_template() hasn't
been reached).
!*/
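/*
    Illustrative sketch (w is an assumed learned weight vector and scanner an
    assumed loaded scan_image_pyramid): run the detector and report every
    window scoring at least 0:

        std::vector<std::pair<double, rectangle> > dets;
        scanner.detect(w, dets, 0.0);
        for (unsigned long i = 0; i < dets.size(); ++i)
            std::cout << "score: " << dets[i].first << "   box: " << dets[i].second << std::endl;
*/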
void get_feature_vector (
const std::vector<rectangle>& rects,
feature_vector_type& psi,
std::vector<rectangle>& mapped_rects
) const;
/*!
requires
- is_loaded_with_image() == true
- get_num_detection_templates() > 0
- psi.size() >= get_num_dimensions()
ensures
- This function allows you to determine the feature vector used for a sliding window location
or the sum of such vectors for a set of locations.
- if (rects was produced by a call to detect(), i.e. rects contains the contents of dets) then
- #psi == the sum of feature vectors corresponding to the sliding window locations contained
in rects.
- #mapped_rects == rects
- Let w denote the w vector given to detect(), then we have:
- dot(w,#psi) == sum of scores of the dets produced by detect()
- else
- Since scan_image_pyramid is a sliding window classifier system, not all possible rectangles can
be output by detect(). So in the case where rects contains rectangles which could not arise
from a call to detect(), this function will map the rectangles in rects to the nearest possible
object boxes and then store the sum of feature vectors for the mapped rectangles into #psi.
- for all valid i: #mapped_rects[i] == the rectangle rects[i] gets mapped to for feature extraction.
- #mapped_rects.size() == rects.size()
!*/
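/*
    Illustrative sketch (truth_boxes is an assumed set of object locations,
    e.g. hand labeled training boxes): obtain the summed feature vector for
    those locations, as is typically done when learning w:

        matrix<double,0,1> psi = zeros_matrix<double>(scanner.get_num_dimensions(), 1);
        std::vector<rectangle> mapped;
        scanner.get_feature_vector(truth_boxes, psi, mapped);
        // mapped[i] tells you which sliding window position truth_boxes[i]
        // was snapped to, and dot(w,psi) is the total score of those windows.
*/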
};
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename Feature_extractor_type
>
void serialize (
const scan_image_pyramid<Pyramid_type,Feature_extractor_type>& item,
std::ostream& out
);
/*!
provides serialization support
!*/
template <
typename Pyramid_type,
typename Feature_extractor_type
>
void deserialize (
scan_image_pyramid<Pyramid_type,Feature_extractor_type>& item,
std::istream& in
);
/*!
provides deserialization support
!*/
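/*
    Illustrative sketch (the file name is an assumption): scan_image_pyramid
    follows the usual dlib serialization pattern, so saving and restoring a
    configured scanner looks like:

        std::ofstream fout("scanner.dat", std::ios::binary);
        serialize(scanner, fout);
        fout.close();

        std::ifstream fin("scanner.dat", std::ios::binary);
        deserialize(scanner, fin);
*/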
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SCAN_IMaGE_PYRAMID_ABSTRACT_H__