Merge remote-tracking branch 'upstream/master'

22164e5d · Joachim · 462d61ef · 664ef398 · 22164e5d · 22164e5d
Commit 22164e5d authored Apr 10, 2017 by Joachim
20 changed files
--- a/dlib/graph_utils/edge_list_graphs.h
+++ b/dlib/graph_utils/edge_list_graphs.h
@@ -286,7 +286,7 @@ namespace dlib
        // Hold the length for the longest edge for each node.  Initially they are all infinity.
        std::vector<double> worst_dists(samples.size(), std::numeric_limits<double>::infinity());

-        std::vector<sample_pair>::iterator begin_i, end_i, begin_j, end_j, itr;
+        std::vector<sample_pair>::iterator begin_i, end_i, begin_j, end_j;
        begin_i = edges.begin();
        end_i = begin_i + k;


--- a/dlib/image_processing/scan_fhog_pyramid.h
+++ b/dlib/image_processing/scan_fhog_pyramid.h
@@ -1306,7 +1306,6 @@ namespace dlib


        // Do non-max suppression
-        dets.clear();
        if (detectors.size() > 1)
            std::sort(dets_accum.rbegin(), dets_accum.rend());
        for (unsigned long i = 0; i < dets_accum.size(); ++i)

--- a/dlib/image_transforms/image_pyramid.h
+++ b/dlib/image_transforms/image_pyramid.h
@@ -319,8 +319,8 @@ namespace dlib
                        ptype temp = temp_img[r-2][c] + 
                                    temp_img[r-1][c]*4 +  
                                    temp_img[r  ][c]*6 +  
-                                    temp_img[r-1][c]*4 +  
-                                    temp_img[r-2][c];  
+                                    temp_img[r+1][c]*4 +  
+                                    temp_img[r+2][c];  

                        assign_pixel(down[dr][c],temp/256);
                    }
@@ -443,18 +443,18 @@ namespace dlib
                        temp.red = temp_img[r-2][c].red + 
                                temp_img[r-1][c].red*4 +  
                                temp_img[r  ][c].red*6 +  
-                                temp_img[r-1][c].red*4 +  
-                                temp_img[r-2][c].red;  
+                                temp_img[r+1][c].red*4 +  
+                                temp_img[r+2][c].red;  
                        temp.green = temp_img[r-2][c].green + 
                                    temp_img[r-1][c].green*4 +  
                                    temp_img[r  ][c].green*6 +  
-                                    temp_img[r-1][c].green*4 +  
-                                    temp_img[r-2][c].green;  
+                                    temp_img[r+1][c].green*4 +  
+                                    temp_img[r+2][c].green;  
                        temp.blue = temp_img[r-2][c].blue + 
                                    temp_img[r-1][c].blue*4 +  
                                    temp_img[r  ][c].blue*6 +  
-                                    temp_img[r-1][c].blue*4 +  
-                                    temp_img[r-2][c].blue;  
+                                    temp_img[r+1][c].blue*4 +  
+                                    temp_img[r+2][c].blue;  

                        down[dr][c].red = temp.red/256;
                        down[dr][c].green = temp.green/256;

--- a/dlib/matlab/cmake_mex_wrapper
+++ b/dlib/matlab/cmake_mex_wrapper
@@ -8,6 +8,9 @@ cmake_minimum_required(VERSION 2.8.11)
 set(BUILDING_MATLAB_MEX_FILE true)
 set(CMAKE_POSITION_INDEPENDENT_CODE True)

+# Trying to use cuda with matlab hasn't worked well, so just disable it.
+SET(DLIB_USE_CUDA OFF CACHE BOOL "" FORCE)
+
 # Find MATLAB's include directory and needed libraries 
 find_program(MATLAB_EXECUTABLE matlab PATHS
        "C:/Program Files/MATLAB/*/bin"

--- a/dlib/matlab/mex_wrapper.cpp
+++ b/dlib/matlab/mex_wrapper.cpp
@@ -387,6 +387,29 @@ namespace mex_binding
        return escape_percent(sout.str());
    }

+// -------------------------------------------------------
+
+    template <
+        typename matrix_type
+        >
+    typename dlib::enable_if_c<is_matrix<matrix_type>::value || is_array2d<matrix_type>::value >::type
+    clear_mat (  // overload enabled when is_matrix<> or is_array2d<> holds for matrix_type
+        matrix_type& m
+    )  
+    {
+        m.set_size(0,0);  // resize m to 0 rows by 0 columns
+    }
+    // Fallback overload for every other type: it accepts the argument and deliberately does nothing.
+    template <
+        typename matrix_type
+        >
+    typename dlib::disable_if_c<is_matrix<matrix_type>::value || is_array2d<matrix_type>::value >::type
+    clear_mat (  // overload enabled when neither trait holds for matrix_type
+        matrix_type& 
+    )  
+    {
+    }
+
 // -------------------------------------------------------

    template <
@@ -651,6 +674,12 @@ namespace mex_binding
        }
        else if (is_matrix<T>::value || is_array2d<T>::value)
        {
+            if (prhs == NULL)
+            {
+                clear_mat(arg);
+                return;
+            }
+
            typedef typename inner_type<T>::type type;

            const int num_dims = mxGetNumberOfDimensions(prhs);

--- a/dlib/matrix/matrix_trsm.h
+++ b/dlib/matrix/matrix_trsm.h
@@ -9,6 +9,7 @@ namespace dlib
 {
    namespace blas_bindings
    {
+#ifdef DLIB_USE_BLAS
 #ifndef CBLAS_H
        extern "C"
        {
@@ -25,6 +26,7 @@ namespace dlib
                             double *B, const int ldb);
        }
 #endif // if not CBLAS_H
+#endif // if DLIB_USE_BLAS

    // ------------------------------------------------------------------------------------


--- a/dlib/optimization/optimization_solve_qp_using_smo.h
+++ b/dlib/optimization/optimization_solve_qp_using_smo.h
@@ -5,6 +5,8 @@

 #include "optimization_solve_qp_using_smo_abstract.h"
 #include "../matrix.h"
+#include <map>
+#include "../unordered_pair.h"

 namespace dlib
 {
@@ -412,8 +414,8 @@ namespace dlib
        typename T, long NR, long NC, typename MM, typename L
        >
    unsigned long solve_qp_box_constrained ( 
-        const matrix_exp<EXP1>& _Q,
-        const matrix_exp<EXP2>& _b,
+        const matrix_exp<EXP1>& Q,
+        const matrix_exp<EXP2>& b,
        matrix<T,NR,NC,MM,L>& alpha,
        const matrix<T,NR,NC,MM,L>& lower,
        const matrix<T,NR,NC,MM,L>& upper,
@@ -421,9 +423,6 @@ namespace dlib
        unsigned long max_iter
    )
    {
-        const_temp_matrix<EXP1> Q(_Q);
-        const_temp_matrix<EXP2> b(_b);
-
        // make sure requires clause is not broken
        DLIB_ASSERT(Q.nr() == Q.nc() &&
                     alpha.size() == lower.size() &&
@@ -551,6 +550,329 @@ namespace dlib
        return iter+1;
    }

+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        // Check if each vector in Q_offdiag is actually a constant times the 1s vector.
+        template <
+            typename T, long NR, long NC, typename MM, typename L
+            >
+        bool has_uniform_offdiag_vectors(
+            const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag
+        )
+        {
+            for (auto& x : Q_offdiag)
+            {
+                auto ref = x.second(0);  // first element; every element of this vector must equal it
+                for (auto& y : x.second)
+                    if (ref != y)
+                        return false;  // this vector holds at least two distinct values
+            }
+            return true;  // also the result when Q_offdiag has no entries
+        }
+        // Packs uniform off-diagonal vectors into a num_blocks by num_blocks matrix of scalars.
+        template <
+            typename T, long NR, long NC, typename MM, typename L
+            >
+        matrix<T,0,0,MM,L> compact_offdiag(  // returns a default constructed matrix when compaction is not possible
+            const size_t& num_blocks,
+            const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag
+        )
+        {
+            matrix<T,0,0,MM,L> temp;
+            // we can only compact the offdiag information if they are uniform vectors
+            if (!has_uniform_offdiag_vectors(Q_offdiag))
+                return temp;
+            // One scalar per pair of blocks; pairs without an entry in Q_offdiag stay 0.
+            temp.set_size(num_blocks, num_blocks);
+            temp = 0;
+            // Each entry is written at (r,c) and at (c,r), so the result is symmetric.
+            for (auto& x : Q_offdiag)
+            {
+                long r = x.first.first;
+                long c = x.first.second;
+                temp(r,c) = x.second(0);  // the vector is uniform, so element 0 stands for all of it
+                temp(c,r) = x.second(0);
+            }
+
+            return temp;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_qp_box_constrained_blockdiag (  // full contract: see optimization_solve_qp_using_smo_abstract.h
+        const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks,
+        const std::vector<matrix<T,NR,NC,MM,L>>& bs,
+        const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag,
+        std::vector<matrix<T,NR,NC,MM,L>>& alphas,
+        const std::vector<matrix<T,NR,NC,MM,L>>& lowers,
+        const std::vector<matrix<T,NR,NC,MM,L>>& uppers,
+        T eps,
+        unsigned long max_iter
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT(Q_blocks.size() > 0);
+        DLIB_CASSERT(Q_blocks.size() == bs.size() && 
+                     Q_blocks.size() == alphas.size() &&
+                     Q_blocks.size() == lowers.size() &&
+                     Q_blocks.size() == uppers.size(),
+                   "Q_blocks.size():  "<< Q_blocks.size() << "\n" <<
+                   "bs.size():        "<< bs.size() << "\n" <<
+                   "alphas.size():    "<< alphas.size() << "\n" <<
+                   "lowers.size():    "<< lowers.size() << "\n" <<
+                   "uppers.size():    "<< uppers.size() << "\n"
+                   );
+        for (auto& Q : Q_blocks)
+        {
+            DLIB_CASSERT(Q.nr() == Q.nc(), "All the matrices in Q_blocks must be square.");
+            DLIB_CASSERT(Q.size() > 0, "All the matrices in Q_blocks must be non-empty and have the same dimensions.");
+            DLIB_CASSERT(Q.nr() == Q_blocks[0].nr() && Q.nc() == Q_blocks[0].nc(), "All the matrices in Q_blocks have the same dimensions.");
+        }
+#ifdef ENABLE_ASSERTS
+        for (auto& Qoffdiag : Q_offdiag)  // these checks do not depend on the block index, so run them once
+        {
+            auto& Q_offdiag_element = Qoffdiag.second;
+            long r = Qoffdiag.first.first;
+            long c = Qoffdiag.first.second;
+            DLIB_CASSERT(is_col_vector(Q_offdiag_element) && Q_offdiag_element.size() == Q_blocks[0].nr(),
+                "is_col_vector(Q_offdiag["<<r<<","<<c<<"]): " << is_col_vector(Q_offdiag_element) << "\n" <<
+                "Q_offdiag["<<r<<","<<c<<"].size():         " << Q_offdiag_element.size() << "\n" <<
+                "Q_blocks[0].nr():                  " << Q_blocks[0].nr());
+        }
+
+        for (size_t i = 0; i < alphas.size(); ++i)
+        {
+            DLIB_CASSERT(is_col_vector(bs[i]) && bs[i].size() == Q_blocks[0].nr(),
+                "is_col_vector(bs["<<i<<"]): " << is_col_vector(bs[i]) << "\n" <<
+                "bs["<<i<<"].size():         " << bs[i].size() << "\n" <<
+                "Q_blocks[0].nr():           " << Q_blocks[0].nr());
+
+            DLIB_CASSERT(is_col_vector(alphas[i]) && alphas[i].size() == Q_blocks[0].nr(),
+                "is_col_vector(alphas["<<i<<"]): " << is_col_vector(alphas[i]) << "\n" <<
+                "alphas["<<i<<"].size():         " << alphas[i].size() << "\n" <<
+                "Q_blocks[0].nr():               " << Q_blocks[0].nr());
+
+            DLIB_CASSERT(is_col_vector(lowers[i]) && lowers[i].size() == Q_blocks[0].nr(),
+                "is_col_vector(lowers["<<i<<"]): " << is_col_vector(lowers[i]) << "\n" <<
+                "lowers["<<i<<"].size():         " << lowers[i].size() << "\n" <<
+                "Q_blocks[0].nr():               " << Q_blocks[0].nr());
+
+            DLIB_CASSERT(is_col_vector(uppers[i]) && uppers[i].size() == Q_blocks[0].nr(),
+                "is_col_vector(uppers["<<i<<"]): " << is_col_vector(uppers[i]) << "\n" <<
+                "uppers["<<i<<"].size():         " << uppers[i].size() << "\n" <<
+                "Q_blocks[0].nr():               " << Q_blocks[0].nr());
+
+            DLIB_CASSERT(0 <= min(alphas[i]-lowers[i]), "min(alphas["<<i<<"]-lowers["<<i<<"]): " << min(alphas[i]-lowers[i]));
+            DLIB_CASSERT(0 <= max(uppers[i]-alphas[i]), "max(uppers["<<i<<"]-alphas["<<i<<"]): " << max(uppers[i]-alphas[i]));
+        }
+        DLIB_CASSERT(eps > 0 && max_iter > 0, "eps: " << eps << "\nmax_iter: "<< max_iter);
+#endif // ENABLE_ASSERTS
+
+        // offdiag_compact has size 0 when the vectors in Q_offdiag are not all uniform.
+        const auto offdiag_compact = impl::compact_offdiag(Q_blocks.size(), Q_offdiag);
+        matrix<T,0,0,MM,L> temp, alphas_compact;
+
+        // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.  
+        std::vector<matrix<T,NR,NC,MM,L>> df;// = Q*alpha + b;
+        auto compute_df = [&]()
+        {
+            df.resize(Q_blocks.size());
+            for (size_t i = 0; i < df.size(); ++i)
+                df[i] = Q_blocks[i]*alphas[i] + bs[i];
+
+
+            // Don't forget to include the Q_offdiag terms in the computation.  Note that
+            // we have two options for how we can compute this part.  If Q_offdiag is
+            // uniform and can be compacted into a simple matrix and there are a lot of off
+            // diagonal entries then it's faster to do it as a matrix multiply.  Otherwise
+            // we do the more general computation.
+            if (offdiag_compact.size() != 0 && Q_offdiag.size() > Q_blocks.size()*5)
+            {
+                // Do it as a matrix multiply (with a bit of data shuffling)
+                alphas_compact.set_size(alphas[0].size(), offdiag_compact.nr());
+                for (long c = 0; c < alphas_compact.nc(); ++c)
+                    set_colm(alphas_compact,c) = alphas[c];
+                temp = alphas_compact*offdiag_compact;
+                for (size_t i = 0; i < df.size(); ++i)
+                    df[i] += colm(temp,i);
+            }
+            else
+            {
+                // Do the fully general computation that allows for non-uniform values in
+                // the off diagonal vectors.
+                for (auto& p : Q_offdiag)
+                {
+                    long r = p.first.first;
+                    long c = p.first.second;
+                    df[r] += pointwise_multiply(p.second, alphas[c]);
+                    if (r != c)
+                        df[c] += pointwise_multiply(p.second, alphas[r]);
+                }
+            }
+        };
+        compute_df();
+
+        // Q_diag: diagonal of the full Q per block.  Q_ggd: per row Gershgorin bound.
+
+        std::vector<matrix<T,NR,NC,MM,L>> Q_diag, Q_ggd;
+        std::vector<matrix<T,NR,NC,MM,L>> QQ;// = reciprocal_max(diag(Q));
+        QQ.resize(Q_blocks.size());
+        Q_diag.resize(Q_blocks.size());
+        Q_ggd.resize(Q_blocks.size());
+
+        // We need to get an upper bound on the Lipschitz constant for this QP. Since that
+        // is just the max eigenvalue of Q we can do it using Gershgorin disks.
+        //const T lipschitz_bound = max(diag(Q) + (sum_cols(abs(Q)) - abs(diag(Q))));
+        for (size_t i = 0; i < QQ.size(); ++i)
+        {
+            auto f = Q_offdiag.find(make_unordered_pair(i,i));
+            if (f != Q_offdiag.end())
+                Q_diag[i] = diag(Q_blocks[i]) + f->second;
+            else
+                Q_diag[i] = diag(Q_blocks[i]);
+            QQ[i] = reciprocal_max(Q_diag[i]);
+
+            Q_ggd[i] = Q_diag[i] + (sum_cols(abs(Q_blocks[i]))-abs(diag(Q_blocks[i])));
+        }
+        for (auto& p : Q_offdiag)
+        {
+            long r = p.first.first;
+            long c = p.first.second;
+            if (r != c)
+            {
+                Q_ggd[r] += abs(p.second);
+                Q_ggd[c] += abs(p.second);
+            }
+        }
+        T lipschitz_bound = -std::numeric_limits<T>::infinity();
+        for (auto& x : Q_ggd)
+            lipschitz_bound = std::max(lipschitz_bound, max(x));
+
+        // Total number of scalar variables over all the blocks.
+        const long num_variables = alphas.size()*alphas[0].size();
+
+        // First we use a coordinate descent method to initialize alpha. 
+        double max_df = 0;
+        for (long iter = 0; iter < num_variables*2; ++iter)
+        {
+            max_df = 0;
+            long best_r =0;
+            size_t best_r2 =0;
+            // find the best alpha to optimize.
+            for (size_t r2 = 0; r2 < alphas.size(); ++r2) 
+            {
+                auto& alpha = alphas[r2];
+                auto& df_ = df[r2];
+                auto& lower = lowers[r2];
+                auto& upper = uppers[r2];
+                for (long r = 0; r < alpha.nr(); ++r)
+                {
+                    if (alpha(r) <= lower(r) && df_(r) > 0)
+                        ;//alpha(r) = lower(r);
+                    else if (alpha(r) >= upper(r) && df_(r) < 0)
+                        ;//alpha(r) = upper(r);
+                    else if (std::abs(df_(r)) > max_df)
+                    {
+                        best_r = r;
+                        best_r2 = r2;
+                        max_df = std::abs(df_(r));
+                    }
+                }
+            }
+
+            // now optimize alphas[best_r2](best_r)
+            const long r = best_r;
+            auto& alpha = alphas[best_r2];
+            auto& lower = lowers[best_r2];
+            auto& upper = uppers[best_r2];
+            auto& df_ = df[best_r2];
+            const T old_alpha = alpha(r);
+            alpha(r) = -(df_(r)-Q_diag[best_r2](r)*alpha(r))*QQ[best_r2](r);
+            if (alpha(r) < lower(r))
+                alpha(r) = lower(r);
+            else if (alpha(r) > upper(r))
+                alpha(r) = upper(r);
+
+            const T delta = old_alpha-alpha(r);
+
+            // Now update the gradient. We will perform the equivalent of: df = Q*alpha +
+            // b; except we only need to compute one column of the matrix multiply because
+            // only one element of alpha changed.
+            auto& Q = Q_blocks[best_r2];
+            for(long k = 0; k < df_.nr(); ++k)
+                df_(k) -= Q(r,k)*delta;
+            for(size_t j = 0; j < Q_blocks.size(); ++j)
+            {
+                auto f = Q_offdiag.find(make_unordered_pair(best_r2, j));
+                if (f != Q_offdiag.end())
+                    df[j](r) -= f->second(r)*delta;
+            }
+        }
+
+
+
+        // v is written by each projected gradient step; v_old receives the previous v via swap.
+        std::vector<matrix<T,NR,NC,MM,L>> v(alphas), v_old(alphas.size());
+        double lambda = 0;
+        unsigned long iter;
+        // Now do the main iteration block of this solver.  The coordinate descent method
+        // we used above can improve the objective rapidly in the beginning.  However,
+        // Nesterov's method has more rapid convergence once it gets going so this is what
+        // we use for the main iteration.
+        for (iter = 0; iter < max_iter; ++iter)
+        {
+            const double next_lambda = (1 + std::sqrt(1+4*lambda*lambda))/2;
+            const double gamma = (1-lambda)/next_lambda;
+            lambda = next_lambda;
+
+            v_old.swap(v);
+
+            //df = Q*alpha + b;
+            compute_df();
+
+            // now take a projected gradient step using Nesterov's method.
+            for (size_t j = 0; j < alphas.size(); ++j)
+            {
+                v[j] = clamp(alphas[j] - 1.0/lipschitz_bound * df[j], lowers[j], uppers[j]);
+                alphas[j] = clamp((1-gamma)*v[j] + gamma*v_old[j], lowers[j], uppers[j]);
+            }
+
+
+            // check for convergence every 10 iterations
+            if (iter%10 == 0)
+            {
+                max_df = 0;
+                for (size_t r2 = 0; r2 < alphas.size(); ++r2) 
+                {
+                    auto& alpha = alphas[r2];
+                    auto& df_ = df[r2];
+                    auto& lower = lowers[r2];
+                    auto& upper = uppers[r2];
+                    for (long r = 0; r < alpha.nr(); ++r)
+                    {
+                        if (alpha(r) <= lower(r) && df_(r) > 0)
+                            ;//alpha(r) = lower(r);
+                        else if (alpha(r) >= upper(r) && df_(r) < 0)
+                            ;//alpha(r) = upper(r);
+                        else if (std::abs(df_(r)) > max_df)
+                            max_df = std::abs(df_(r));
+                    }
+                }
+                if (max_df < eps)
+                    break;
+            }
+        }
+
+        return iter+1;  // equals max_iter+1 when the loop above ran out of iterations without breaking
+    }
+
 // ----------------------------------------------------------------------------------------

    template <

--- a/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
+++ b/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
@@ -4,6 +4,8 @@
 #ifdef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_

 #include "../matrix.h"
+#include <map>
+#include "../unordered_pair.h"

 namespace dlib
 {
@@ -162,6 +164,74 @@ namespace dlib
              converge to eps accuracy then the number returned will be max_iter+1.
    !*/

+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_qp_box_constrained_blockdiag ( 
+        const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks,
+        const std::vector<matrix<T,NR,NC,MM,L>>& bs,
+        const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag,
+        std::vector<matrix<T,NR,NC,MM,L>>& alphas,
+        const std::vector<matrix<T,NR,NC,MM,L>>& lowers,
+        const std::vector<matrix<T,NR,NC,MM,L>>& uppers,
+        T eps,
+        unsigned long max_iter
+    );
+    /*!
+        requires
+            - Q_blocks.size() > 0
+            - Q_blocks.size() == bs.size() == alphas.size() == lowers.size() == uppers.size()
+            - All the matrices in Q_blocks have the same dimensions.  Moreover, they are
+              non-empty square matrices.
+            - All the matrices in bs, Q_offdiag, alphas, lowers, and uppers have the same
+              dimensions.  Moreover, they are all column vectors.
+            - Q_blocks[0].nr() == alphas[0].size()
+              (i.e. the dimensionality of the column vectors in alphas must match the
+              dimensionality of the square matrices in Q_blocks.)
+            - for all valid i:
+                - 0 <= min(alphas[i]-lowers[i])
+                - 0 <= max(uppers[i]-alphas[i])
+            - eps > 0
+            - max_iter > 0
+        ensures
+            - This function solves the same QP as solve_qp_box_constrained(), except it is
+              optimized for the case where the Q matrix has a certain sparsity structure.
+              To be precise:
+                - Let Q1 be a block diagonal matrix with the elements of Q_blocks placed
+                  along its diagonal, and in the order contained in Q_blocks.  
+                - Let Q2 be a matrix with the same size as Q1, except instead of being block diagonal, it
+                  is block structured into Q_blocks.size() by Q_blocks.size() blocks.  If we let (r,c) be
+                  the coordinate of each block then each block contains the matrix
+                  diagm(Q_offdiag[make_unordered_pair(r,c)]) or the zero matrix if Q_offdiag has no entry
+                  for the coordinate (r,c).
+                - Let Q == Q1+Q2
+                - Let b == the concatenation of all the vectors in bs into one big vector.
+                - Let alpha == the concatenation of all the vectors in alphas into one big vector.
+                - Let lower == the concatenation of all the vectors in lowers into one big vector.
+                - Let upper == the concatenation of all the vectors in uppers into one big vector.
+                - Then this function solves the following quadratic program:
+                    Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha 
+                    subject to the following box constraints on alpha:
+                        - 0 <= min(alpha-lower)
+                        - 0 <= max(upper-alpha)
+                    Where f is convex.  This means that Q should be positive-semidefinite.
+                - More specifically, this function is identical to
+                  solve_qp_box_constrained(Q, b, alpha, lower, upper, eps, max_iter),
+                  except that it runs faster since it avoids unnecessary computation by
+                  taking advantage of the sparsity structure in the QP.
+            - The solution to the above QP will be stored in #alphas.
+            - This function uses a combination of a SMO algorithm along with Nesterov's
+              method as the main iteration of the solver.  It starts the algorithm with the
+              given alphas and it works on the problem until the derivative of f(alpha) is
+              smaller than eps for each element of alpha or the alpha value is at a box
+              constraint.  So eps controls how accurate the solution is and smaller values
+              result in better solutions.
+            - At most max_iter iterations of optimization will be performed.  
+            - returns the number of iterations performed.  If this method fails to
+              converge to eps accuracy then the number returned will be max_iter+1.
+    !*/
 // ----------------------------------------------------------------------------------------

    template <

--- a/dlib/test/CMakeLists.txt
+++ b/dlib/test/CMakeLists.txt
@@ -174,7 +174,7 @@ if (CMAKE_COMPILER_IS_GNUCXX)
 endif()


-TARGET_LINK_LIBRARIES(${target_name} dlib )
+TARGET_LINK_LIBRARIES(${target_name} dlib::dlib )


 if (NOT DLIB_NO_GUI_SUPPORT)

--- a/dlib/test/cublas.cpp
+++ b/dlib/test/cublas.cpp
@@ -8,7 +8,7 @@
 #include <cstdlib>
 #include <ctime>
 #include <vector>
-#include "../dnn/cublas_dlibapi.h"
+#include "../dnn/tensor_tools.h"

 #include "tester.h"

@@ -25,6 +25,26 @@ namespace
    logger dlog("test.cublas");


+    void test_inv()  // checks tt::inv and cuda::inv against inv() on n by n inputs for n = 1..19
+    {
+        tt::tensor_rand rnd;
+        dlib::tt::inv tinv;
+        dlib::cuda::inv cinv;
+        resizable_tensor minv1, minv2;  // results of tinv and cinv respectively
+        for (int n = 1; n < 20; ++n)
+        {
+            print_spinner();
+            resizable_tensor m(n,n);
+            rnd.fill_uniform(m);
+            // Invert the same input with both tensor implementations and with the matrix inv().
+            tinv(m, minv1);
+            cinv(m, minv2);
+            matrix<float> mref = inv(mat(m));  // reference result
+            DLIB_TEST_MSG(mean(abs(mref-mat(minv1)))/mean(abs(mref)) < 1e-5, mean(abs(mref-mat(minv1)))/mean(abs(mref)) <<"  n: " << n);  // relative mean abs error of tinv
+            DLIB_TEST_MSG(mean(abs(mref-mat(minv2)))/mean(abs(mref)) < 1e-5, mean(abs(mref-mat(minv2)))/mean(abs(mref)) <<"  n: " << n);  // relative mean abs error of cinv
+        }
+    }
+

    class cublas_tester : public tester
    {
@@ -38,6 +58,7 @@ namespace
        void perform_test (
        )
        {
+            test_inv();
            {
                resizable_tensor a(4,3), b(3,4), c(3,3);


--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -698,6 +698,45 @@ namespace

 #ifdef DLIB_USE_CUDA

+    void test_affine_rect()  // checks the rectangle overloads of affine_transform in cuda::, cpu:: and tt:: against each other
+    {
+        dlib::rand rnd;
+        // 20 trials, each with a random tensor size and a random rectangle.
+        for (int iter = 0; iter < 20; ++iter)
+        {
+            // Dimensions are drawn from 1..10.
+            long nr = 1 + rnd.get_random_32bit_number()%10;
+            long nc = 1 + rnd.get_random_32bit_number()%10;
+
+            resizable_tensor dest1(nr,nc), dest2(nr,nc), src1(nr,nc), src2(nr,nc), src3(nr,nc);
+            matrix<float> dest3;  // reference result built with matrix operations
+            // Constant fills: destinations start at 1, sources are 2, 3 and 4.
+            dest1 = 1;
+            dest2 = 1;
+            dest3 = mat(dest1);
+            src1 = 2;
+            src2 = 3;
+            src3 = 4;
+            // Two random points with x in [0,nc) and y in [0,nr) define the rectangle.
+            point p1(rnd.get_random_32bit_number()%nc, rnd.get_random_32bit_number()%nr);
+            point p2(rnd.get_random_32bit_number()%nc, rnd.get_random_32bit_number()%nr);
+            rectangle rect(p1,p2);
+
+            cuda::affine_transform(rect, dest1, src1, src2, src3, 2,3,4);
+
+            cpu::affine_transform(rect, dest2, src1, src2, src3, 2,3,4);
+            // The CUDA and CPU implementations must agree exactly.
+            DLIB_TEST(mat(dest1) == mat(dest2));
+            // Reference: only the elements inside rect are overwritten, the rest keep the initial 1.
+            set_subm(dest3,rect) = 2*subm(mat(src1),rect) + 3*subm(mat(src2),rect) + 4*subm(mat(src3),rect);
+            DLIB_TEST(dest3 == mat(dest1));
+            // Reset the destination and repeat the check through the tt:: entry point.
+            dest1 = 1;
+            tt::affine_transform(rect, dest1, src1, src2, src3, 2,3,4);
+            DLIB_TEST(dest3 == mat(dest1));
+        }
+    }
+
    void test_conv()
    {
        cuda::tensor_conv conv1;
@@ -1883,6 +1922,7 @@ namespace

            test_tagging();
 #ifdef DLIB_USE_CUDA
+            test_affine_rect();
            test_conv();
            test_more_ops2();
            test_more_ops(1,1);

--- a/dlib/test/opt_qp_solver.cpp
+++ b/dlib/test/opt_qp_solver.cpp
@@ -507,6 +507,171 @@ namespace
        DLIB_TEST(length(A*c1 - B*c2) < 4);
    }

+// ----------------------------------------------------------------------------------------
+
+    void test_solve_qp_box_constrained_blockdiag()  // dense vs. block-diagonal QP solver cross-check on 2 blocks
+    {
+        dlib::rand rnd;
+        for (int iter = 0; iter < 50; ++iter)
+        {
+            print_spinner();
+            // Two 4x4 diagonal blocks, each built as M*trans(M), and one 4x1 linear term per block.
+            matrix<double> Q1, Q2;
+            matrix<double,0,1> b1, b2;
+
+            Q1 = randm(4,4,rnd); Q1 = Q1*trans(Q1);
+            Q2 = randm(4,4,rnd); Q2 = Q2*trans(Q2);
+            b1 = gaussian_randm(4,1, iter*2+0);
+            b2 = gaussian_randm(4,1, iter*2+1);
+            // Each block pair (0,0), (1,0), (1,1) gets a random 4x1 entry only when a gaussian draw is positive.
+            std::map<unordered_pair<size_t>, matrix<double,0,1>> offdiag;
+
+            if (rnd.get_random_gaussian() > 0)
+                offdiag[make_unordered_pair(0,0)] = randm(4,1,rnd);
+            if (rnd.get_random_gaussian() > 0)
+                offdiag[make_unordered_pair(1,0)] = randm(4,1,rnd);
+            if (rnd.get_random_gaussian() > 0)
+                offdiag[make_unordered_pair(1,1)] = randm(4,1,rnd);
+
+            std::vector<matrix<double>> Q_blocks = {Q1, Q2};
+            std::vector<matrix<double,0,1>> bs = {b1, b2};
+
+            // Make the single big Q and b: Q1 and Q2 sit on the block diagonal, and each offdiag
+            // vector v is added as diagm(v) to block (r,c), mirrored into block (c,r) when r != c.
+            matrix<double> Q = join_cols(join_rows(Q1, zeros_matrix(Q1)),
+                join_rows(zeros_matrix(Q2),Q2));
+            matrix<double,0,1> b = join_cols(b1,b2);
+            for (auto& p : offdiag)
+            {
+                long r = p.first.first;
+                long c = p.first.second;
+                set_subm(Q, 4*r,4*c, 4,4) += diagm(p.second);
+                if (c != r)
+                    set_subm(Q, 4*c,4*r, 4,4) += diagm(p.second);
+            }
+
+            // Solve the dense problem from a zero start with every variable boxed to [-10000, 10000].
+            matrix<double,0,1> alpha = zeros_matrix(b);
+            matrix<double,0,1> lower = -10000*ones_matrix(b);
+            matrix<double,0,1> upper = 10000*ones_matrix(b);
+
+            auto iters = solve_qp_box_constrained(Q, b, alpha, lower, upper, 1e-9, 10000);
+            dlog << LINFO << "iters: "<< iters;
+            dlog << LINFO << "alpha: " << trans(alpha);
+
+            dlog << LINFO;
+            // Solve the same problem with the block-diagonal solver, again from zeros with the same bounds.
+            std::vector<matrix<double,0,1>> alphas(2);
+            alphas[0] = zeros_matrix<double>(4,1); alphas[1] = zeros_matrix<double>(4,1);
+
+            lower = -10000*ones_matrix(alphas[0]);
+            upper = 10000*ones_matrix(alphas[0]);
+            std::vector<matrix<double,0,1>> lowers = {lower,lower}, uppers = {upper, upper};
+            auto iters2 = solve_qp_box_constrained_blockdiag(Q_blocks, bs, offdiag, alphas, lowers, uppers, 1e-9, 10000);
+            dlog << LINFO << "iters2: "<< iters2;
+            dlog << LINFO << "alpha: " << trans(join_cols(alphas[0],alphas[1]));
+            // Log the objective value reached by each solver, and the difference between the two.
+            dlog << LINFO << "obj1: "<< 0.5*trans(alpha)*Q*alpha + trans(b)*alpha;
+            dlog << LINFO << "obj2: "<< 0.5*trans(join_cols(alphas[0],alphas[1]))*Q*join_cols(alphas[0],alphas[1]) + trans(b)*join_cols(alphas[0],alphas[1]);
+            dlog << LINFO << "obj1-obj2: "<<(0.5*trans(alpha)*Q*alpha + trans(b)*alpha) - (0.5*trans(join_cols(alphas[0],alphas[1]))*Q*join_cols(alphas[0],alphas[1]) + trans(b)*join_cols(alphas[0],alphas[1]));
+            // The stacked block solution must match the dense solution to within 1e-6 in every element.
+            DLIB_TEST_MSG(max(abs(alpha - join_cols(alphas[0], alphas[1]))) < 1e-6, max(abs(alpha - join_cols(alphas[0], alphas[1]))));
+            // Both solvers must also return the same value (presumably the iteration count -- confirm).
+            DLIB_TEST(iters == iters2);
+
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    void test_solve_qp_box_constrained_blockdiag_compact(dlib::rand& rnd, double percent_off_diag_present)
+    {
+        print_spinner();
+        // Builds a random 20-block QP and checks that the block-diagonal solver matches the dense solver.
+        dlog << LINFO << "test_solve_qp_box_constrained_blockdiag_compact(), percent_off_diag_present==" << percent_off_diag_present;
+        // percent_off_diag_present is compared directly with rnd.get_random_double(), so it is a fraction, not a value out of 100.
+        std::map<unordered_pair<size_t>, matrix<double,0,1>> offdiag;
+        std::vector<matrix<double>> Q_blocks;
+        std::vector<matrix<double,0,1>> bs;
+        // Problem size: 20 blocks of 4 variables each.  lambda scales the entries stored in offdiag.
+        const long num_blocks = 20;
+        const long dims = 4;
+        const double lambda = 10;
+        for (long i = 0; i < num_blocks; ++i)
+        {
+            matrix<double> Q1;
+            matrix<double,0,1> b1;
+            Q1 = randm(dims,dims,rnd); Q1 = Q1*trans(Q1);
+            b1 = gaussian_randm(dims,1, i);
+
+            Q_blocks.push_back(Q1);
+            bs.push_back(b1);
+            // Test with some graph regularization terms.  When present, a pair with i==j gets
+            // (num_blocks-1)*lambda*u*ones and any other pair gets -lambda*u*ones, u being a get_random_double() draw.
+            for (long j = 0; j < num_blocks; ++j)
+            {
+                if (rnd.get_random_double() < percent_off_diag_present)
+                {
+                    if (i==j)
+                        offdiag[make_unordered_pair(i,j)] = (num_blocks-1)*lambda*rnd.get_random_double()*ones_matrix<double>(dims,1);
+                    else
+                        offdiag[make_unordered_pair(i,j)] = -lambda*rnd.get_random_double()*ones_matrix<double>(dims,1);
+                }
+            }
+        }
+
+        // build out the dense version of the QP so we can test it against the dense solver.
+        matrix<double> Q(num_blocks*dims, num_blocks*dims); 
+        Q = 0;
+        matrix<double,0,1> b(num_blocks*dims);
+        for (long i = 0; i < num_blocks; ++i)
+        {
+            set_subm(Q,i*dims,i*dims,dims,dims) = Q_blocks[i];
+            set_subm(b,i*dims,0,dims,1) = bs[i];
+        }
+        for (auto& p : offdiag)
+        {
+            long r = p.first.first;
+            long c = p.first.second;
+            set_subm(Q, dims*r,dims*c, dims,dims) += diagm(p.second);
+            if (c != r)
+                set_subm(Q, dims*c,dims*r, dims,dims) += diagm(p.second);
+        }
+
+        // Dense solve: all dims*num_blocks variables start at 0 and are boxed to [-10000, 10000].
+
+        matrix<double,0,1> alpha = zeros_matrix<double>(dims*num_blocks,1);
+        matrix<double,0,1> lower = -10000*ones_matrix<double>(dims*num_blocks,1);
+        matrix<double,0,1> upper = 10000*ones_matrix<double>(dims*num_blocks,1);
+
+        auto iters = solve_qp_box_constrained(Q, b, alpha, lower, upper, 1e-9, 20000);
+        dlog << LINFO << "iters: "<< iters;
+
+        // Block solve: every block gets a zero start vector and the same bounds, all sized like bs[0].
+        matrix<double,0,1> init_alpha = zeros_matrix(bs[0]);
+        lower = -10000*ones_matrix(bs[0]);
+        upper = 10000*ones_matrix(bs[0]);
+
+        std::vector<matrix<double,0,1>> alphas(num_blocks, init_alpha);
+        std::vector<matrix<double,0,1>> lowers(num_blocks, lower);
+        std::vector<matrix<double,0,1>> uppers(num_blocks, upper);
+
+        auto iters2 = solve_qp_box_constrained_blockdiag(Q_blocks, bs, offdiag, alphas, lowers, uppers, 1e-9, 20000);
+        dlog << LINFO << "iters2: "<< iters2;
+
+        // Reshape the dense solution to num_blocks x dims; the loop below compares row r against alphas[r].
+        const matrix<double> refalpha = reshape(alpha, num_blocks, dims);
+
+        // now make sure the two solvers agree on the outputs.
+        for (long r = 0; r < num_blocks; ++r)
+        {
+            for (long c = 0; c < dims; ++c)
+            {
+                DLIB_TEST_MSG(std::abs(refalpha(r,c) - alphas[r](c)) < 1e-6, std::abs(refalpha(r,c) - alphas[r](c)));
+            }
+        }
+    }
+
 // ----------------------------------------------------------------------------------------

    class opt_qp_solver_tester : public tester
@@ -566,6 +731,16 @@ namespace


            test_find_gap_between_convex_hulls();
+            test_solve_qp_box_constrained_blockdiag();
+
+            // try a range of off diagonal sparseness.  We do this to make sure we exercise both
+            // the compact and sparse code paths within the solver.
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.001);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.01);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.04);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.10);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.50);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 1.00);
        }

        double do_the_test (

--- a/dlib/timer/timer.cpp
+++ b/dlib/timer/timer.cpp
@@ -24,11 +24,38 @@ namespace dlib
    timer_global_clock::
    ~timer_global_clock()
    {
+        // The only time this destructor is called is when
+        //
+        // a) the process terminates
+        // b) the dynamic library (.so/.dll) is unloaded (could be a part of a))
+        //
+        // in case of a)
+        //   windows: the process termination is especially painful, since threads are killed
+        //     before destructors of the process image .dll's are called.
+        //     Thus, for the windows platform, there are no threads running, so the only thing
+        //     to do here is just let the standard memberwise destructors run
+        //   linux: it's ok to just signal shutdown and wait for the running thread to exit
+        //
+        // in case of b)
+        //   windows:
+        //     if it's part of the termination process, a) applies
+        //     if it's part of the user doing a manual load_library/unload_library
+        //     there is no (safe/robust) solution, but best practices are described here
+        //          https://msdn.microsoft.com/en-us/library/windows/desktop/dn633971.aspx
+        //     to support such a clean shutdown, you are required to make a call, prior to
+        //     unloading the dll, that shuts down all the threads in the contained dll.
+        //     This could be done in this module by providing a global_delete_clock()
+        //
+        //   linux: the destructor for linux will do its usual job regardless.
+        //
+        // So everywhere except windows: set shutdown under the lock m, signal s, and then call wait().
+        #ifndef _WIN32
        m.lock();
        shutdown = true;
        s.signal();
        m.unlock();
        wait();
+        #endif
    }

 // ----------------------------------------------------------------------------------------

--- a/dlib/timer/timer_heavy.h
+++ b/dlib/timer/timer_heavy.h
@@ -48,7 +48,7 @@ namespace dlib
                    - there is a thread running
                - if (is_running()) then
                    - next_time_to_run == the time when the next execution of the action
-                      function should occurr.  (the time is given by ts.get_timestamp())
+                      function should occur.  (the time is given by ts.get_timestamp())

                - stop_running is used to tell the thread to quit.  If it is
                  set to true then the thread should end.

--- a/dlib/tokenizer/tokenizer_kernel_1.cpp
+++ b/dlib/tokenizer/tokenizer_kernel_1.cpp
@@ -29,7 +29,7 @@ namespace dlib
        catch (...)
        {
            if (headset) delete [] headset;
-            if (bodyset) delete [] headset;
+            if (bodyset) delete [] bodyset;
            throw;
        }
    }

--- a/docs/docs/optimization.xml
+++ b/docs/docs/optimization.xml
@@ -44,6 +44,7 @@
         <name>Special Purpose Optimizers</name>
         <item>find_gap_between_convex_hulls</item> 
         <item>solve_qp_box_constrained</item> 
+         <item>solve_qp_box_constrained_blockdiag</item> 
         <item>solve_qp_using_smo</item> 
         <item>solve_qp2_using_smo</item> 
         <item>solve_qp3_using_smo</item> 
@@ -475,6 +476,31 @@ subject to the following constraint:
                                 
      </component>

+   <!-- ************************************************************************* -->
+      
+      <component>
+         <name>solve_qp_box_constrained_blockdiag</name>
+         <file>dlib/optimization.h</file>
+         <spec_file link="true">dlib/optimization/optimization_solve_qp_using_smo_abstract.h</spec_file>
+         <description>
+             This function solves the following quadratic program:
+<pre>
+   Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha 
+   subject to the following box constraints on alpha:
+      0 &lt;= min(alpha-lower)
+      0 &lt;= min(upper-alpha)
+   Where f is convex.  This means that Q should be positive-semidefinite.
+</pre>
+            
+            So it does the same thing as <a href="#solve_qp_box_constrained">solve_qp_box_constrained</a>,
+            except it is optimized for large Q matrices with a special block
+            structure.  In particular, Q must be grouped into identically sized
+            blocks where all blocks are diagonal matrices, except those on the
+            main diagonal which can be dense. 
+         </description>
+                                 
+      </component>
+
   <!-- ************************************************************************* -->
      
      <component>

--- a/docs/docs/term_index.xml
+++ b/docs/docs/term_index.xml
@@ -286,6 +286,7 @@
         <term file="optimization.html" name="find_optimal_parameters"        include="dlib/optimization/find_optimal_parameters.h"/>
         <term file="optimization.html" name="elastic_net"                    include="dlib/optimization/elastic_net.h"/>
         <term file="optimization.html" name="solve_qp_box_constrained"       include="dlib/optimization.h"/>
+         <term file="optimization.html" name="solve_qp_box_constrained_blockdiag"       include="dlib/optimization.h"/>
         <term file="optimization.html" name="solve_qp_using_smo"             include="dlib/optimization.h"/>
         <term file="optimization.html" name="find_gap_between_convex_hulls"  include="dlib/optimization.h"/>
         <term file="optimization.html" name="solve_qp2_using_smo"            include="dlib/optimization.h"/>

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
 #
-# This is a CMake makefile.  You can find the cmake utility and
-# information about it at http://www.cmake.org
+#  _______ _    _ _____  _____     _____  _____                 
+# |__   __| |  | |_   _|/ ____|   |_   _|/ ____|       /\       
+#    | |  | |__| | | | | (___       | | | (___        /  \      
+#    | |  |  __  | | |  \___ \      | |  \___ \      / /\ \     
+#    | |  | |  | |_| |_ ____) |    _| |_ ____) |    / ____ \    
+#    |_|__|_|_ |_|_____|_____/__  |_____|_____/    /_/  _ \_\   
+#   |__   __| |  | |__   __/ __ \|  __ \|_   _|   /\   | |      
+#      | |  | |  | |  | | | |  | | |__) | | |    /  \  | |      
+#      | |  | |  | |  | | | |  | |  _  /  | |   / /\ \ | |      
+#      | |  | |__| |  | | | |__| | | \ \ _| |_ / ____ \| |____  
+#      |_|   \____/   |_|  \____/|_|  \_\_____/_/    \_\______| 
+#
+#
+#    _____  ______          _____      _______ _    _ ______       
+#   |  __ \|  ____|   /\   |  __ \    |__   __| |  | |  ____|      
+#   | |__) | |__     /  \  | |  | |      | |  | |__| | |__         
+#   |  _  /|  __|   / /\ \ | |  | |      | |  |  __  |  __|        
+#   | | \ \| |____ / ____ \| |__| |      | |  | |  | | |____       
+#   |_|__\_\______/_/_ __\_\_____/__ _   |_|__|_|_ |_|______|_ _ _ 
+#  / ____/ __ \|  \/  |  \/  |  ____| \ | |__   __/ ____| | | | | |
+# | |   | |  | | \  / | \  / | |__  |  \| |  | | | (___   | | | | |
+# | |   | |  | | |\/| | |\/| |  __| | . ` |  | |  \___ \  | | | | |
+# | |___| |__| | |  | | |  | | |____| |\  |  | |  ____) | |_|_|_|_|
+#  \_____\____/|_|  |_|_|  |_|______|_| \_|  |_| |_____/  (_|_|_|_)
+#                                                                  
+#
+#
+# This is a CMake makefile.  CMake is a tool that helps you build C++ programs.
+# You can download CMake from http://www.cmake.org.  This CMakeLists.txt file
+# you are reading builds dlib's example programs. 
 #


 cmake_minimum_required(VERSION 2.8.12)
+# Every project needs a name.  We call this the "examples" project.
+project(examples)

-PROJECT(examples)

+# Tell cmake we will need dlib.  This command will pull in dlib and compile it
+# into your project.  Note that you don't need to compile or install dlib.  All
+# it needs is the dlib source code folder and it will take care of everything.
 include(../dlib/cmake)

-# Tell CMake to compile a program.  We do this with the ADD_EXECUTABLE()
-# statement which takes the name of the output executable and then a list of
-# .cpp files to compile.  Here each example consists of only one .cpp file but
-# in general you will make programs that const of many .cpp files.
-ADD_EXECUTABLE(assignment_learning_ex assignment_learning_ex.cpp)
-# Then we tell it to link with dlib.
-TARGET_LINK_LIBRARIES(assignment_learning_ex dlib::dlib)
+
+# The next thing we need to do is tell CMake about the code you want to
+# compile.  We do this with the add_executable() statement which takes the name
+# of the output executable and then a list of .cpp files to compile.  Here we
+# are going to compile one of the dlib example programs which has only one .cpp
+# file, assignment_learning_ex.cpp.  If your program consisted of multiple .cpp
+# files you would simply list them here in the add_executable() statement.  
+add_executable(assignment_learning_ex assignment_learning_ex.cpp)
+# Finally, you need to tell CMake that this program, assignment_learning_ex,
+# depends on dlib.  You do that with this statement: 
+target_link_libraries(assignment_learning_ex dlib::dlib)
+
+
+
+# To compile this program all you need to do is ask cmake.  You would type
+# these commands from within the directory containing this CMakeLists.txt
+# file:
+#   mkdir build
+#   cd build
+#   cmake ..
+#   cmake --build . --config Release
+#
+# The cmake .. command looks in the parent folder for a file named
+# CMakeLists.txt, reads it, and sets up everything needed to build the program.
+# Note that CMake can also generate Visual Studio or Xcode project files.  So
+# if instead you had written:
+#   mkdir build
+#   cd build
+#   cmake .. -G "Visual Studio 14 2015 Win64"
+#
+# You would be able to open the resulting visual studio project and compile and
+# edit the example programs within the visual studio IDE.  CMake can generate a
+# lot of different types of IDE projects.  Run the cmake -h command to see the
+# list of generators you can pass to -G, which shows what kinds of projects
+# cmake can generate for you.  It probably includes your favorite IDE in the list.
+
+


+#################################################################################
+#################################################################################
+#  A CMakeLists.txt file can compile more than just one program.  So below we
+#  tell it to compile the other dlib example programs using pretty much the
+#  same CMake commands we used above.
+#################################################################################
+#################################################################################


 # Since there are a lot of examples I'm going to use a macro to simply this
 # CMakeLists.txt file.  However, usually you will create only one executable in
 # your cmake projects and use the syntax shown above.
-MACRO(add_example name)
-   ADD_EXECUTABLE(${name} ${name}.cpp)
-   TARGET_LINK_LIBRARIES(${name} dlib::dlib )
-ENDMACRO()
+macro(add_example name)
+   add_executable(${name} ${name}.cpp)
+   target_link_libraries(${name} dlib::dlib )
+endmacro()

 # if an example requires GUI, call this macro to check DLIB_NO_GUI_SUPPORT to include or exclude
-MACRO(add_gui_example name)
+macro(add_gui_example name)
   if (DLIB_NO_GUI_SUPPORT)
      message("No GUI support, so we won't build the ${name} example.")
   else()
      add_example(${name})
   endif()
-ENDMACRO()
+endmacro()

 # The deep learning toolkit requires a compiler with essentially complete C++11
 # support.  However, versions of Visual Studio prior to October 2016 didn't
@@ -62,6 +129,23 @@ if (NOT USING_OLD_VISUAL_STUDIO_COMPILER)
   endif()
 endif()

+
+if (DLIB_NO_GUI_SUPPORT)
+   message("No GUI support, so we won't build the webcam_face_pose_ex example.")
+else()
+   find_package(OpenCV QUIET)
+   if (OpenCV_FOUND)
+      include_directories(${OpenCV_INCLUDE_DIRS})
+
+      add_executable(webcam_face_pose_ex webcam_face_pose_ex.cpp)
+      target_link_libraries(webcam_face_pose_ex dlib::dlib ${OpenCV_LIBS} )
+   else()
+      message("OpenCV not found, so we won't build the webcam_face_pose_ex example.")
+   endif()
+endif()
+
+
+
 #here we apply our macros 
 add_gui_example(3d_point_cloud_ex)
 add_example(bayes_net_ex)
@@ -146,21 +230,6 @@ add_gui_example(video_tracking_ex)
 add_example(xml_parser_ex)


-if (DLIB_NO_GUI_SUPPORT)
-   message("No GUI support, so we won't build the webcam_face_pose_ex example.")
-else()
-   find_package(OpenCV QUIET)
-   if (OpenCV_FOUND)
-      include_directories(${OpenCV_INCLUDE_DIRS})
-
-      ADD_EXECUTABLE(webcam_face_pose_ex webcam_face_pose_ex.cpp)
-      TARGET_LINK_LIBRARIES(webcam_face_pose_ex dlib::dlib ${OpenCV_LIBS} )
-   else()
-      message("OpenCV not found, so we won't build the webcam_face_pose_ex example.")
-   endif()
-endif()
-
-
 if (DLIB_LINK_WITH_SQLITE3)
   add_example(sqlite_ex)
 endif()

--- a/tools/imglab/src/main.cpp
+++ b/tools/imglab/src/main.cpp
@@ -20,7 +20,7 @@
 #include <dlib/dir_nav.h>


-const char* VERSION = "1.8";
+const char* VERSION = "1.9";

 const int JPEG_QUALITY = 90;


--- a/tools/imglab/src/metadata_editor.cpp
+++ b/tools/imglab/src/metadata_editor.cpp
@@ -336,6 +336,12 @@ on_keydown (
            last_keyboard_jump_pos_update = 0;
        }

+        if (key == 'd' && (state&base_window::KBD_MOD_ALT))
+        {
+            remove_selected_images();
+        }
+
+
        return;
    }

@@ -450,7 +456,7 @@ load_image(
    try
    {
        dlib::load_image(img, metadata.images[idx].filename);
-        set_title(metadata.name + ": " +metadata.images[idx].filename);
+        set_title(metadata.name + " #"+cast_to_string(idx)+": " +metadata.images[idx].filename);
    }
    catch (exception& e)
    {
@@ -478,7 +484,7 @@ load_image_and_set_size(
    try
    {
        dlib::load_image(img, metadata.images[idx].filename);
-        set_title(metadata.name + ": " +metadata.images[idx].filename);
+        set_title(metadata.name + " #"+cast_to_string(idx)+": " +metadata.images[idx].filename);
    }
    catch (exception& e)
    {
@@ -571,7 +577,8 @@ display_about(
                        "by hitting the tab key. Double clicking "
                        "a rectangle selects it and the delete key removes it.  You can also mark "
                        "a rectangle as ignored by hitting the i key when it is selected.  Ignored "
-                        "rectangles are visually displayed with an X through them."
+                        "rectangles are visually displayed with an X through them.  You can remove an image "
+                        "entirely by selecting it in the list on the left and pressing alt+d."
                        ,0,0) << endl << endl;

    sout << wrap_string("It is also possible to label object parts by selecting a rectangle and "