optimization_ex.cpp 14.7 KB
Newer Older
1
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
Davis King's avatar
Davis King committed
2
3
/*

4
5
    This is an example illustrating the use the general purpose non-linear 
    optimization routines from the dlib C++ Library.
Davis King's avatar
Davis King committed
6

7
8
9
10
    The library provides implementations of the conjugate gradient,  BFGS,
    L-BFGS, and BOBYQA optimization algorithms.  These algorithms allow you to
    find the minimum of a function of many input variables.  This example walks
    though a few of the ways you might put these routines to use.
Davis King's avatar
Davis King committed
11
12
13
14

*/


15
#include <dlib/optimization.h>
Davis King's avatar
Davis King committed
16
17
18
19
20
21
22
23
#include <iostream>


using namespace std;
using namespace dlib;

// ----------------------------------------------------------------------------------------

24
25
26
27
// In dlib, the general purpose solvers optimize functions that take a column
// vector as input and return a double.  So here we make a typedef for a
// variable length column vector of doubles.  This is the type we will use to
// represent the input to our objective functions which we will be minimizing.
Davis King's avatar
Davis King committed
28
29
30
31
32
33
34
typedef matrix<double,0,1> column_vector;

// ----------------------------------------------------------------------------------------
// Below we create a few functions.  When you get down into main() you will see that
// we can use the optimization algorithms to find the minimums of these functions.
// ----------------------------------------------------------------------------------------

35
double rosen (const column_vector& m)
Davis King's avatar
Davis King committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/*
    This function computes what is known as Rosenbrock's function.  It is 
    a function of two input variables and has a global minimum at (1,1).
    So when we use this function to test out the optimization algorithms
    we will see that the minimum found is indeed at the point (1,1). 
*/
{
    const double x = m(0); 
    const double y = m(1);

    // compute Rosenbrock's function and return the result
    return 100.0*pow(y - x*x,2) + pow(1 - x,2);
}

// This is a helper function used while optimizing the rosen() function.  
51
const column_vector rosen_derivative (const column_vector& m)
Davis King's avatar
Davis King committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
/*!
    ensures
        - returns the gradient vector for the rosen function
!*/
{
    const double x = m(0);
    const double y = m(1);

    // make us a column vector of length 2
    column_vector res(2);

    // now compute the gradient vector
    res(0) = -400*x*(y-x*x) - 2*(1-x); // derivative of rosen() with respect to x
    res(1) = 200*(y-x*x);              // derivative of rosen() with respect to y
    return res;
}

69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// This function computes the Hessian matrix for the rosen() fuction.  This is
// the matrix of second derivatives.
matrix<double> rosen_hessian (const column_vector& m)
{
    const double x = m(0);
    const double y = m(1);

    matrix<double> res(2,2);

    // now compute the second derivatives 
    res(0,0) = 1200*x*x - 400*y + 2; // second derivative with respect to x
    res(1,0) = res(0,1) = -400*x;   // derivative with respect to x and y
    res(1,1) = 200;                 // second derivative with respect to y
    return res;
}

Davis King's avatar
Davis King committed
85
86
87
88
89
90
91
92
93
94
95
96
97
// ----------------------------------------------------------------------------------------

class test_function
{
    /*
        This object is an example of what is known as a "function object" in C++.
        It is simply an object with an overloaded operator().  This means it can 
        be used in a way that is similar to a normal C function.  The interesting
        thing about this sort of function is that it can have state.  
        
        In this example, our test_function object contains a column_vector 
        as its state and it computes the mean squared error between this 
        stored column_vector and the arguments to its operator() function.
Davis King's avatar
Davis King committed
98
99

        This is a very simple function, however, in general you could compute
Davis King's avatar
Davis King committed
100
        any function you wanted here.  An example of a typical use would be 
Davis King's avatar
Davis King committed
101
        to find the parameters of some regression function that minimized 
Davis King's avatar
Davis King committed
102
103
        the mean squared error on a set of data.  In this case the arguments
        to the operator() function would be the parameters of your regression
Davis King's avatar
Davis King committed
104
105
106
107
        function.  You would loop over all your data samples and compute the output 
        of the regression function for each data sample given the parameters and 
        return a measure of the total error.   The dlib optimization functions 
        could then be used to find the parameters that minimized the error.
Davis King's avatar
Davis King committed
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
    */
public:

    test_function (
        const column_vector& input
    )
    {
        target = input;
    }

    double operator() ( const column_vector& arg) const
    {
        // return the mean squared error between the target vector and the input vector
        return mean(squared(target-arg));
    }

private:
    column_vector target;
};

// ----------------------------------------------------------------------------------------

130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
class rosen_model 
{
    /*!
        This object is a "function model" which can be used with the
        find_min_trust_region() routine.  
    !*/

public:
    typedef ::column_vector column_vector;
    typedef matrix<double> general_matrix;

    double operator() (
        const column_vector& x
    ) const { return rosen(x); }

    void get_derivative_and_hessian (
        const column_vector& x,
        column_vector& der,
        general_matrix& hess
    ) const
    {
        der = rosen_derivative(x);
        hess = rosen_hessian(x);
    }
};

// ----------------------------------------------------------------------------------------

Davis King's avatar
Davis King committed
158
159
int main()
{
160
161
162
    try
    {
        // make a column vector of length 2
163
        column_vector starting_point(2);
164
165
166
167
168
169
170
171


        // Set the starting point to (4,8).  This is the point the optimization algorithm
        // will start out from and it will move it closer and closer to the function's 
        // minimum point.   So generally you want to try and compute a good guess that is
        // somewhat near the actual optimum value.
        starting_point = 4, 8;

172
173
174
175
176
177
178
        // The first example below finds the minimum of the rosen() function and uses the
        // analytical derivative computed by rosen_derivative().  Since it is very easy to
        // make a mistake while coding a function like rosen_derivative() it is a good idea
        // to compare your derivative function against a numerical approximation and see if
        // the results are similar.  If they are very different then you probably made a 
        // mistake.  So the first thing we do is compare the results at a test point: 
        cout << "Difference between analytic derivative and numerical approximation of derivative: " 
179
              << length(derivative(rosen)(starting_point) - rosen_derivative(starting_point)) << endl;
180
181
182


        cout << "Find the minimum of the rosen function()" << endl;
183
184
        // Now we use the find_min() function to find the minimum point.  The first argument
        // to this routine is the search strategy we want to use.  The second argument is the 
185
        // stopping strategy.  Below I'm using the objective_delta_stop_strategy which just 
186
187
188
189
190
191
192
193
194
195
196
197
        // says that the search should stop when the change in the function being optimized 
        // is small enough.

        // The other arguments to find_min() are the function to be minimized, its derivative, 
        // then the starting point, and the last is an acceptable minimum value of the rosen() 
        // function.  That is, if the algorithm finds any inputs to rosen() that gives an output 
        // value <= -1 then it will stop immediately.  Usually you supply a number smaller than 
        // the actual global minimum.  So since the smallest output of the rosen function is 0 
        // we just put -1 here which effectively causes this last argument to be disregarded.

        find_min(bfgs_search_strategy(),  // Use BFGS search algorithm
                 objective_delta_stop_strategy(1e-7), // Stop when the change in rosen() is less than 1e-7
198
                 rosen, rosen_derivative, starting_point, -1);
199
200
        // Once the function ends the starting_point vector will contain the optimum point 
        // of (1,1).
201
        cout << "rosen solution:\n" << starting_point << endl;
202
203


Davis King's avatar
Davis King committed
204
        // Now let's try doing it again with a different starting point and the version
205
206
207
208
209
210
        // of find_min() that doesn't require you to supply a derivative function.  
        // This version will compute a numerical approximation of the derivative since 
        // we didn't supply one to it.
        starting_point = -94, 5.2;
        find_min_using_approximate_derivatives(bfgs_search_strategy(),
                                               objective_delta_stop_strategy(1e-7),
211
                                               rosen, starting_point, -1);
212
        // Again the correct minimum point is found and stored in starting_point
213
        cout << "rosen solution:\n" << starting_point << endl;
214
215
216
217
218
219
220
221


        // Here we repeat the same thing as above but this time using the L-BFGS 
        // algorithm.  L-BFGS is very similar to the BFGS algorithm, however, BFGS 
        // uses O(N^2) memory where N is the size of the starting_point vector.  
        // The L-BFGS algorithm however uses only O(N) memory.  So if you have a 
        // function of a huge number of variables the L-BFGS algorithm is probably 
        // a better choice.
222
        starting_point = 0.8, 1.3;
223
        find_min(lbfgs_search_strategy(10),  // The 10 here is basically a measure of how much memory L-BFGS will use.
224
225
                 objective_delta_stop_strategy(1e-7).be_verbose(),  // Adding be_verbose() causes a message to be 
                                                                    // printed for each iteration of optimization.
226
                 rosen, rosen_derivative, starting_point, -1);
227

Davis King's avatar
Davis King committed
228
        cout << endl << "rosen solution: \n" << starting_point << endl;
229
230
231
232

        starting_point = -94, 5.2;
        find_min_using_approximate_derivatives(lbfgs_search_strategy(10),
                                               objective_delta_stop_strategy(1e-7),
233
                                               rosen, starting_point, -1);
234
235
236
        cout << "rosen solution: \n"<< starting_point << endl;


237

238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262

        // dlib also supports solving functions subject to bounds constraints on
        // the variables.  So for example, if you wanted to find the minimizer
        // of the rosen function where both input variables were in the range
        // 0.1 to 0.8 you would do it like this:
        starting_point = 0.1, 0.1; // Start with a valid point inside the constraint box.
        find_min_box_constrained(lbfgs_search_strategy(10),  
                                 objective_delta_stop_strategy(1e-9),  
                                 rosen, rosen_derivative, starting_point, 0.1, 0.8);
        // Here we put the same [0.1 0.8] range constraint on each variable, however, you
        // can put different bounds on each variable by passing in column vectors of
        // constraints for the last two arguments rather than scalars.  

        cout << endl << "constrained rosen solution: \n" << starting_point << endl;

        // You can also use an approximate derivative like so:
        starting_point = 0.1, 0.1; 
        find_min_box_constrained(bfgs_search_strategy(),  
                                 objective_delta_stop_strategy(1e-9),  
                                 rosen, derivative(rosen), starting_point, 0.1, 0.8);
        cout << endl << "constrained rosen solution: \n" << starting_point << endl;




263
264
265
        // In many cases, it is useful if we also provide second derivative information
        // to the optimizers.  Two examples of how we can do that are shown below.  
        starting_point = 0.8, 1.3;
266
        find_min(newton_search_strategy(rosen_hessian),
267
                 objective_delta_stop_strategy(1e-7),
268
269
                 rosen,
                 rosen_derivative,
270
271
272
273
274
275
276
277
278
279
280
281
282
283
                 starting_point,
                 -1);
        cout << "rosen solution: \n"<< starting_point << endl;

        // We can also use find_min_trust_region(), which is also a method which uses
        // second derivatives.  For some kinds of non-convex function it may be more
        // reliable than using a newton_search_strategy with find_min().
        starting_point = 0.8, 1.3;
        find_min_trust_region(objective_delta_stop_strategy(1e-7),
            rosen_model(), 
            starting_point, 
            10 // initial trust region radius
        );
        cout << "rosen solution: \n"<< starting_point << endl;
284
285
286
287




Davis King's avatar
Davis King committed
288
        // Now let's look at using the test_function object with the optimization 
289
290
291
        // functions.  
        cout << "\nFind the minimum of the test_function" << endl;

292
        column_vector target(4);
293
294
295
296
297
298
299
300
301
302
303
304
305
        starting_point.set_size(4);

        // This variable will be used as the target of the test_function.   So,
        // our simple test_function object will have a global minimum at the
        // point given by the target.  We will then use the optimization 
        // routines to find this minimum value.
        target = 3, 5, 1, 7;

        // set the starting point far from the global minimum
        starting_point = 1,2,3,4;
        find_min_using_approximate_derivatives(bfgs_search_strategy(),
                                               objective_delta_stop_strategy(1e-7),
                                               test_function(target), starting_point, -1);
306
        // At this point the correct value of (3,5,1,7) should be found and stored in starting_point
Davis King's avatar
Davis King committed
307
        cout << "test_function solution:\n" << starting_point << endl;
308

Davis King's avatar
Davis King committed
309
        // Now let's try it again with the conjugate gradient algorithm.
310
311
312
313
        starting_point = -4,5,99,3;
        find_min_using_approximate_derivatives(cg_search_strategy(),
                                               objective_delta_stop_strategy(1e-7),
                                               test_function(target), starting_point, -1);
Davis King's avatar
Davis King committed
314
        cout << "test_function solution:\n" << starting_point << endl;
315
316
317



Davis King's avatar
Davis King committed
318
        // Finally, let's try the BOBYQA algorithm.  This is a technique specially
319
320
321
322
323
324
325
326
327
328
329
330
        // designed to minimize a function in the absence of derivative information.  
        // Generally speaking, it is the method of choice if derivatives are not available.
        starting_point = -4,5,99,3;
        find_min_bobyqa(test_function(target), 
                        starting_point, 
                        9,    // number of interpolation points
                        uniform_matrix<double>(4,1, -1e100),  // lower bound constraint
                        uniform_matrix<double>(4,1, 1e100),   // upper bound constraint
                        10,    // initial trust region radius
                        1e-6,  // stopping trust region radius
                        100    // max number of objective function evaluations
        );
Davis King's avatar
Davis King committed
331
        cout << "test_function solution:\n" << starting_point << endl;
Davis King's avatar
Davis King committed
332

333
334
335
336
337
    }
    catch (std::exception& e)
    {
        cout << e.what() << endl;
    }
Davis King's avatar
Davis King committed
338
339
}